In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.preprocessing as pp
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
#load csv file
data = pd.read_csv('bank_marketing.csv',sep=';')

In [3]:
# drop none value
data.dropna(inplace=True)
print(data.shape)
data.head()

(39999, 22)


Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscription
0,0,56,housemaid,married,basic.4y,no,no,no,telephone,may,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,1,57,services,married,high.school,unknown,no,no,telephone,may,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,2,37,services,married,high.school,no,yes,no,telephone,may,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,3,40,admin.,married,basic.6y,no,no,no,telephone,may,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,4,56,services,married,high.school,no,no,yes,telephone,may,...,1,999.0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Task 1
num_feature = data[['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']]
print(num_feature.shape)
# normalized numerical features
scaler = pp.MinMaxScaler()
scaler_num_feature = pd.DataFrame(scaler.fit_transform(num_feature))
scaler_num_feature.columns = ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']
scaler_num_feature.head()
print(scaler_num_feature.shape)

(39999, 10)
(39999, 10)


In [5]:
# one hot code
cate_feature = data[['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome','subscription']]
enc = OrdinalEncoder()
enc.fit(cate_feature)
one_hot_cate_feature = pd.DataFrame(enc.transform(cate_feature))
one_hot_cate_feature.columns = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome','subscription']

In [7]:
# Logistic Regression Model 1
LR_data1 = pd.concat([scaler_num_feature, one_hot_cate_feature], axis=1)
x_data1 = LR_data1.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
y_data1 = LR_data1.loc[:, 'subscription']
kf1 = KFold(n_splits=4)
for train, test in kf1.split(x_data1, y_data1):
    x_train = x_data1.iloc[train]
    x_test = x_data1.iloc[test]
    y_train = y_data1.iloc[train]
    y_test = y_data1.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test ,lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision Score:  0.8957974454530094
Recall Score:  0.9083
F1 Score:  0.8970024731449188
Accuracy Score:  0.9083


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision Score:  0.9012217908847184
Recall Score:  0.9124
F1 Score:  0.9028204393257417
Accuracy Score:  0.9124


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision Score:  0.8993675154653596
Recall Score:  0.9114
F1 Score:  0.899173008203252
Accuracy Score:  0.9114
Precision Score:  0.894199769951937
Recall Score:  0.907090709070907
F1 Score:  0.8950651464301824
Accuracy Score:  0.907090709070907


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
# Logistic Regression Model 2
LR_data2 = pd.concat([num_feature, one_hot_cate_feature], axis=1)
LR_data2.dropna(inplace=True)
x_data2 = LR_data2.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
y_data2 = LR_data2.loc[:, 'subscription']
kf2 = KFold(n_splits=4)
for train, test in kf2.split(x_data2, y_data2):
    x_train = x_data2.iloc[train]
    x_test = x_data2.iloc[test]
    y_train = y_data2.iloc[train]
    y_test = y_data2.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test , lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


Precision Score:  0.7858822499999999
Recall Score:  0.8865
F1 Score:  0.8331643254704478
Accuracy Score:  0.8865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


Precision Score:  0.79085449
Recall Score:  0.8893
F1 Score:  0.837193129730588
Accuracy Score:  0.8893


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


Precision Score:  0.7888793779868035
Recall Score:  0.8881888188818882
F1 Score:  0.8355937394586918
Accuracy Score:  0.8881888188818882
Precision Score:  0.783912864733818
Recall Score:  0.8853885388538854
F1 Score:  0.831566383882182
Accuracy Score:  0.8853885388538854


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Task 2
for train, test in kf1.split(LR_data1):
    # Undersample
    #np.random.shuffle(train)
    print(type(train))
    train_data = LR_data1.iloc[train]
    positive_data = train_data[train_data['subscription']==1.0]
    negitive_data = train_data[train_data['subscription']==0.0]
    positive_size = len(positive_data)
    undersample_data = negitive_data.loc[negitive_data.index.intersection(train[0:10000])]
    train_data = pd.concat([positive_data, undersample_data], axis=0)
    train_data.dropna(inplace=True)
    x_train = train_data.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
    y_train = train_data.loc[:, 'subscription']
    
    x_data1 = LR_data1.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
    y_data1 = LR_data1.loc[:, 'subscription']
    x_test = x_data1.iloc[test]    
    y_test = y_data1.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test ,lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

<class 'numpy.ndarray'>
Precision Score:  0.9077491139772211
Recall Score:  0.9002
F1 Score:  0.9034818137736432
Accuracy Score:  0.9002
<class 'numpy.ndarray'>


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision Score:  0.9147199938733896
Recall Score:  0.904
F1 Score:  0.9083905963392129
Accuracy Score:  0.904
<class 'numpy.ndarray'>
Precision Score:  0.9129963597965074
Recall Score:  0.9051
F1 Score:  0.9084705605993464
Accuracy Score:  0.9051
<class 'numpy.ndarray'>
Precision Score:  0.9103645172725048
Recall Score:  0.9016901690169017
F1 Score:  0.905361680385156
Accuracy Score:  0.9016901690169017


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
# Task 3
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

categorical_feature = one_hot_cate_feature[['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
label = one_hot_cate_feature[['subscription']]
# chi-square for categorical data (K = 1)
cate_feature_new1 = pd.DataFrame(SelectKBest(chi2, k=1).fit_transform(categorical_feature, label))
# mutual information for numerical data (K = 1)
num_feature_new1 = pd.DataFrame(SelectKBest(mutual_info_classif, k=1).fit_transform(num_feature, label))
train_x1 = pd.concat([cate_feature_new1, num_feature_new1], axis=1)

kf1 = KFold(n_splits=4)
for train, test in kf1.split(train_x1, label):
    x_train = train_x1.iloc[train]
    x_test = train_x1.iloc[test]
    y_train = label.iloc[train]
    y_test = label.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test ,lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Precision Score:  0.8684971689269634
Recall Score:  0.8929
F1 Score:  0.8675885497634065
Accuracy Score:  0.8929
Precision Score:  0.8707394285714286
Recall Score:  0.895
F1 Score:  0.8711441040187183
Accuracy Score:  0.895
Precision Score:  0.8740026023304929
Recall Score:  0.8963
F1 Score:  0.870338206914265
Accuracy Score:  0.8963
Precision Score:  0.8695561961658463
Recall Score:  0.892989298929893
F1 Score:  0.8655515893470793
Accuracy Score:  0.892989298929893


  return f(*args, **kwargs)


In [11]:
# Task 3
# chi-square for categorical data (K = 3)
cate_feature_new3 = pd.DataFrame(SelectKBest(chi2, k=3).fit_transform(categorical_feature, label))
# mutual information for numerical data (K = 3)
num_feature_new3 = pd.DataFrame(SelectKBest(mutual_info_classif, k=3).fit_transform(num_feature, label))
train_x3 = pd.concat([cate_feature_new3, num_feature_new3], axis=1)

kf3 = KFold(n_splits=4)
for train, test in kf3.split(train_x3, label):
    x_train = train_x3.iloc[train]
    x_test = train_x3.iloc[test]
    y_train = label.iloc[train]
    y_test = label.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test ,lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

  return f(*args, **kwargs)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.8868231248433213
Recall Score:  0.9026
F1 Score:  0.8875732001081961
Accuracy Score:  0.9026


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.8825037848062953
Recall Score:  0.9001
F1 Score:  0.8852000415233597
Accuracy Score:  0.9001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.8857395927226018
Recall Score:  0.9026
F1 Score:  0.885401030124836
Accuracy Score:  0.9026
Precision Score:  0.8806703147964859
Recall Score:  0.8985898589858986
F1 Score:  0.8813471856905575
Accuracy Score:  0.8985898589858986


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
# Task 3
# chi-square for categorical data (K = 5)
cate_feature_new5 = pd.DataFrame(SelectKBest(chi2, k=5).fit_transform(categorical_feature, label))
# mutual information for numerical data (K = 5)
num_feature_new5 = pd.DataFrame(SelectKBest(mutual_info_classif, k=5).fit_transform(num_feature, label))
train_x5 = pd.concat([cate_feature_new5, num_feature_new5], axis=1)

kf5 = KFold(n_splits=4)
for train, test in kf5.split(train_x5, label):
    x_train = train_x5.iloc[train]
    x_test = train_x5.iloc[test]
    y_train = label.iloc[train]
    y_test = label.iloc[test]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_y_predict = lr.predict(x_test)
    print("Precision Score: ", precision_score(y_test ,lr_y_predict, average='weighted'))
    print("Recall Score: ", recall_score(y_test ,lr_y_predict, average='weighted'))
    print("F1 Score: ", f1_score(y_test ,lr_y_predict, average='weighted'))
    print("Accuracy Score: ", accuracy_score(y_test ,lr_y_predict))

  return f(*args, **kwargs)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.8949227495305259
Recall Score:  0.9078
F1 Score:  0.8958554549845088
Accuracy Score:  0.9078


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.894034607475863
Recall Score:  0.9071
F1 Score:  0.896427278394831
Accuracy Score:  0.9071


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)


Precision Score:  0.8959548687718331
Recall Score:  0.909
F1 Score:  0.8963126228182082
Accuracy Score:  0.909
Precision Score:  0.8878372941222994
Recall Score:  0.902990299029903
F1 Score:  0.8883775489262843
Accuracy Score:  0.902990299029903


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
# Task 4
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# imbalanced dataset
x_data1 = LR_data1.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
y_data1 = LR_data1.loc[:, 'subscription']
kf1 = KFold(n_splits=4)

#x_train, x_test, y_train, y_test = train_test_split(x_data1, y_data1, test_size=0.25, random_state=42)
for train, test in kf1.split(x_data1, y_data1):
    x_train = x_data1.iloc[train]
    x_test = x_data1.iloc[test]
    y_train = y_data1.iloc[train]
    y_test = y_data1.iloc[test]
    # decision tree
    decision_tree_clf = tree.DecisionTreeClassifier()
    decision_tree_clf.fit(x_train, y_train)
    decision_tree_y_predict = decision_tree_clf.predict(x_test)
    print("Decision Tree Precision Score: ", precision_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree Recall Score: ", recall_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree F1 Score: ", f1_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree Accuracy Score: ", accuracy_score(y_test ,decision_tree_y_predict))
    # SVM
    svm_clf = SVC()
    svm_clf.fit(x_train, y_train)
    svm_y_predict = svm_clf.predict(x_test)
    print("SVM Precision Score: ", precision_score(y_test, svm_y_predict, average='weighted'))
    print("SVM Recall Score: ", recall_score(y_test, svm_y_predict, average='weighted'))
    print("SVM F1 Score: ", f1_score(y_test, svm_y_predict, average='weighted'))
    print("SVM Accuracy Score: ", accuracy_score(y_test , svm_y_predict))
    # multi-layer perceptron neural network
    MLP_clf = MLPClassifier()
    MLP_clf.fit(x_train, y_train)
    MLP_y_predict = MLP_clf.predict(x_test)
    print("MLP Precision Score: ", precision_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP Recall Score: ", recall_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP F1 Score: ", f1_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP Accuracy Score: ", accuracy_score(y_test, MLP_y_predict))

Decision Tree Precision Score:  0.8902104729172452
Decision Tree Recall Score:  0.8884
Decision Tree F1 Score:  0.8892811712121995
Decision Tree Accuracy Score:  0.8884
SVM Precision Score:  0.874150536107728
SVM Recall Score:  0.8957
SVM F1 Score:  0.8710502870244938
SVM Accuracy Score:  0.8957




MLP Precision Score:  0.9004338104355811
MLP Recall Score:  0.9084
MLP F1 Score:  0.9032302363385137
MLP Accuracy Score:  0.9084
Decision Tree Precision Score:  0.8932070065448051
Decision Tree Recall Score:  0.8889
Decision Tree F1 Score:  0.8909269938298079
Decision Tree Accuracy Score:  0.8889
SVM Precision Score:  0.8794818584789831
SVM Recall Score:  0.8995
SVM F1 Score:  0.8763284732038703
SVM Accuracy Score:  0.8995




MLP Precision Score:  0.9102473644735993
MLP Recall Score:  0.9133
MLP F1 Score:  0.9116247128643253
MLP Accuracy Score:  0.9133
Decision Tree Precision Score:  0.8886502945368558
Decision Tree Recall Score:  0.8871
Decision Tree F1 Score:  0.8878585494876268
Decision Tree Accuracy Score:  0.8871
SVM Precision Score:  0.8809794951545322
SVM Recall Score:  0.8997
SVM F1 Score:  0.8754923194944989
SVM Accuracy Score:  0.8997




MLP Precision Score:  0.9064120460773573
MLP Recall Score:  0.9123
MLP F1 Score:  0.9087129493716033
MLP Accuracy Score:  0.9123
Decision Tree Precision Score:  0.8936222458404582
Decision Tree Recall Score:  0.8923892389238924
Decision Tree F1 Score:  0.892993079496038
Decision Tree Accuracy Score:  0.8923892389238924
SVM Precision Score:  0.8747690927978105
SVM Recall Score:  0.8955895589558955
SVM F1 Score:  0.8716463849958358
SVM Accuracy Score:  0.8955895589558955
MLP Precision Score:  0.9021384502516386
MLP Recall Score:  0.9096909690969097
MLP F1 Score:  0.9047690774253871
MLP Accuracy Score:  0.9096909690969097




In [14]:
# Task 4
# balanced dataset
for train, test in kf1.split(LR_data1):
    # Undersample
    #np.random.shuffle(train)
    print(type(train))
    train_data = LR_data1.iloc[train]
    positive_data = train_data[train_data['subscription']==1.0]
    negitive_data = train_data[train_data['subscription']==0.0]
    positive_size = len(positive_data)
    undersample_data = negitive_data.loc[negitive_data.index.intersection(train[0:positive_size])]
    train_data = pd.concat([positive_data, undersample_data], axis=0)
    train_data.dropna(inplace=True)
    x_train = train_data.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
    y_train = train_data.loc[:, 'subscription']
    
    x_data1 = LR_data1.loc[:, ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']]
    y_data1 = LR_data1.loc[:, 'subscription']
    x_test = x_data1.iloc[test]    
    y_test = y_data1.iloc[test]
    # decision tree
    decision_tree_clf = tree.DecisionTreeClassifier()
    decision_tree_clf.fit(x_train, y_train)
    decision_tree_y_predict = decision_tree_clf.predict(x_test)
    print("Decision Tree Precision Score: ", precision_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree Recall Score: ", recall_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree F1 Score: ", f1_score(y_test, decision_tree_y_predict, average='weighted'))
    print("Decision Tree Accuracy Score: ", accuracy_score(y_test ,decision_tree_y_predict))
    # SVM
    svm_clf = SVC()
    svm_clf.fit(x_train, y_train)
    svm_y_predict = svm_clf.predict(x_test)
    print("SVM Precision Score: ", precision_score(y_test, svm_y_predict, average='weighted'))
    print("SVM Recall Score: ", recall_score(y_test, svm_y_predict, average='weighted'))
    print("SVM F1 Score: ", f1_score(y_test, svm_y_predict, average='weighted'))
    print("SVM Accuracy Score: ", accuracy_score(y_test , svm_y_predict))
    # multi-layer perceptron neural network
    MLP_clf = MLPClassifier()
    MLP_clf.fit(x_train, y_train)
    MLP_y_predict = MLP_clf.predict(x_test)
    print("MLP Precision Score: ", precision_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP Recall Score: ", recall_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP F1 Score: ", f1_score(y_test, MLP_y_predict, average='weighted'))
    print("MLP Accuracy Score: ", accuracy_score(y_test, MLP_y_predict))

<class 'numpy.ndarray'>
Decision Tree Precision Score:  0.9064649255124446
Decision Tree Recall Score:  0.8303
Decision Tree F1 Score:  0.8544171230268792
Decision Tree Accuracy Score:  0.8303
SVM Precision Score:  0.8735418505975729
SVM Recall Score:  0.7347
SVM F1 Score:  0.7799582718437092
SVM Accuracy Score:  0.7347




MLP Precision Score:  0.9168512619988707
MLP Recall Score:  0.8523
MLP F1 Score:  0.8721528549479334
MLP Accuracy Score:  0.8523
<class 'numpy.ndarray'>
Decision Tree Precision Score:  0.9102807393148349
Decision Tree Recall Score:  0.8324
Decision Tree F1 Score:  0.8569075092252484
Decision Tree Accuracy Score:  0.8324
SVM Precision Score:  0.8770213285819262
SVM Recall Score:  0.7415
SVM F1 Score:  0.7860902081430152
SVM Accuracy Score:  0.7415
MLP Precision Score:  0.9217256975538032
MLP Recall Score:  0.8288
MLP F1 Score:  0.8558167101856434
MLP Accuracy Score:  0.8288
<class 'numpy.ndarray'>
Decision Tree Precision Score:  0.9110197471086545
Decision Tree Recall Score:  0.8405
Decision Tree F1 Score:  0.8627812536332286
Decision Tree Accuracy Score:  0.8405
SVM Precision Score:  0.876016885801308
SVM Recall Score:  0.7393
SVM F1 Score:  0.7840728441422461
SVM Accuracy Score:  0.7393
MLP Precision Score:  0.9184393717377545
MLP Recall Score:  0.8626
MLP F1 Score:  0.880117421568708



In [16]:
# Task 5
from sklearn.model_selection import GridSearchCV

# chi-square for categorical data (K = 5)
cate_feature_new5 = pd.DataFrame(SelectKBest(chi2, k=5).fit_transform(categorical_feature, label))
# mutual information for numerical data (K = 5)
num_feature_new5 = pd.DataFrame(SelectKBest(mutual_info_classif, k=5).fit_transform(num_feature, label))
train_x5 = pd.concat([cate_feature_new5, num_feature_new5], axis=1)
x_train, x_test, y_train, y_test = train_test_split(train_x5, y_data1, test_size=0.25, random_state=42)

# Logisitic Regression Grid Search
lr = LogisticRegression()
lr_parameters = {'penalty': ('l1', 'l2'),'C': (0.01, 0.1, 1, 2, 5, 10)}
lr_grid_search = GridSearchCV(lr, lr_parameters, scoring='accuracy', cv=5)
lr_grid_search.fit(x_train, y_train)
print('LogisticRegression best score：%0.3f' % lr_grid_search.best_score_)
lr_best_parameters= lr_grid_search.best_estimator_.get_params()
print('LogisticRegression best parameters：\n', lr_best_parameters)

# SVM Grid Search
#svm_clf = SVC()
#svm_parameters = {"gamma":[0.1,1,10,50],
#             "C":[0.1,1,10,50]}
#svm_grid_search = GridSearchCV(svm_clf, svm_parameters, scoring='accuracy', cv=5)
#svm_grid_search.fit(x_train, y_train)
#print('SVM：%0.3f' % svm_grid_search.best_score_)
#svm_best_parameters= svm_grid_search.best_estimator_.get_params()
#print('SVM：\n', svm_best_parameters)

# Decision Tree Grid Search
decision_tree_clf = tree.DecisionTreeClassifier()
decision_tree_parameters = {'max_depth':[20,30,50,60,100],'min_samples_leaf':[2,3,5,10,20]}
decision_tree_grid_search = GridSearchCV(decision_tree_clf, decision_tree_parameters,  scoring='accuracy', cv=5)
decision_tree_grid_search.fit(x_train, y_train)
print('Decision tree best score：%0.3f' % decision_tree_grid_search.best_score_)
decision_tree_best_parameters= decision_tree_grid_search.best_estimator_.get_params()
print('Decision tree best parameters：\n', decision_tree_best_parameters)

# multi-layer perceptron neural network
#MLP_clf = MLPClassifier()
#MLP_parameters = {"hidden_layer_sizes":[(100,),(100,30)],
#                 "solver":['adam','sgd','lbfgs'],
#                 "max_iter":[20, 50, 100],
#                 "verbose":[True]}
#MLP_grid_search = GridSearchCV(MLP_clf, MLP_parameters,  scoring='accuracy', cv=5)
#MLP_grid_search.fit(x_train, y_train)
#print('MLP：%0.3f' % MLP_grid_search.best_score_)
#MLP_best_parameters= MLP_grid_search.best_estimator_.get_params()
#print('MLP：\n', MLP_best_parameters)

  return f(*args, **kwargs)
Traceback (most recent call last):
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual

Traceback (most recent call last):
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Anaconda3\anaco

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Traceback (most recent call last):
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Anaconda3\anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "D:\Anaconda3\a

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression：0.906
LogisticRegression：
 {'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
BestDecisionGain：0.909
BestDecisionGain：
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 30, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 20, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}


In [20]:
# Task 5
best_tree_clf = tree.DecisionTreeClassifier(max_depth=30, min_samples_leaf=20)
best_tree_clf.fit(x_train, y_train)
best_y_predict = best_tree_clf.predict(x_test)
print("Precision Score: ", precision_score(y_test, best_y_predict, average='weighted'))
print("Recall Score: ", recall_score(y_test, best_y_predict, average='weighted'))
print("F1 Score: ", f1_score(y_test, best_y_predict, average='weighted'))
print("Accuracy Score: ", accuracy_score(y_test, best_y_predict))

Precision Score:  0.9047905206767384
Recall Score:  0.9106
F1 Score:  0.907097021076788
Accuracy Score:  0.9106
