In [2]:
# data analysis
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, auc, roc_curve
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [5]:
train_data = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv')

In [16]:
train_data = train_data.loc[0:100000]

In [17]:
x_train = train_data.drop(['ID_code','target'],axis = 1)
y_train = train_data['target']

In [53]:
def tuning(clf, param_name, param_list, X_train, Y_train, k):
    scores = []
    kf = KFold(n_splits=k)
    
    for param in param_list:
        score = [] # record scores of K folds
        clf.__dict__[param_name] = param
        for train,test in tqdm(kf.split(X_train, Y_train)):
            clf.fit(X_train.iloc[train], Y_train.iloc[train])
            Y_predict = clf.decision_function(X_train.iloc[test])
            score.append(roc_auc_score(Y_train.iloc[test], Y_predict))
#         print(f'score is{score}')
        scores.append(score)
    return scores

In [9]:
def table_score(param_name, param_list, score_list, k):
    '''
        use example:
        result = table_score('C', [0.005, 0.01, 0.05, 0.1, 0.5], res, 5)
        here res is the return score list from tuning function
        return a DataFrame
    '''
    plot_c = {}
    i = 0
    for param in param_list:
        plot_c[f'{param_name}={param}']=score_list[i]
        i += 1
    result = pd.DataFrame(plot_c)
    result = result.append(result.describe().loc['mean'])
    result = result.append(result.describe().loc['std'])
    for j in range(k):
        result = result.rename(index={j: f"score_{j}"})
    return result

In [48]:
# x_train2, x_test2, y_train2, y_test2 = train_test_split(x_train,y_train,train_size=0.8,random_state=0)

In [47]:
# logistic_model = LogisticRegression(solver='lbfgs', penalty='l2',C=0.01,max_iter=3000,class_weight='balanced')
# logistic_model.fit(x_train2,y_train2)
# logistic_predict = logistic_model.predict_proba(x_test2)
# print(roc_auc_score(y_test2, logistic_predict[:,1]))

In [18]:
sgdclf = SGDClassifier(class_weight='balanced')
res = tuning(sgdclf,'alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],x_train, y_train,5)

5it [01:10, 14.15s/it]
5it [01:00, 12.07s/it]
5it [00:34,  6.96s/it]
5it [00:15,  3.17s/it]
5it [00:09,  1.89s/it]
5it [00:06,  1.30s/it]
5it [00:05,  1.06s/it]
5it [00:04,  1.23it/s]
5it [00:03,  1.26it/s]


In [22]:
table_score('alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],res,5)

Unnamed: 0,alpha=0.0001,alpha=0.001,alpha=0.01,alpha=0.1,alpha=1,alpha=10,alpha=100,alpha=1000,alpha=10000
score_0,0.82041,0.834788,0.841144,0.84204,0.830338,0.801805,0.731696,0.730097,0.729761
score_1,0.833488,0.833413,0.842931,0.845226,0.829611,0.797472,0.7282,0.728847,0.728324
score_2,0.832611,0.828661,0.841585,0.842817,0.829463,0.799961,0.734848,0.735145,0.734926
score_3,0.817138,0.821091,0.841072,0.841805,0.830316,0.801892,0.73724,0.737684,0.737732
score_4,0.822066,0.832363,0.844638,0.838786,0.830662,0.797109,0.741104,0.742936,0.742923
mean,0.825143,0.830063,0.842274,0.842135,0.830078,0.799648,0.734618,0.734942,0.734733
std,0.006654,0.004926,0.001358,0.002067,0.000461,0.002048,0.004443,0.00514,0.005327


In [23]:
sgdclf2 = SGDClassifier(class_weight='balanced')
res2 = tuning(sgdclf2,'alpha',[0.005, 0.01, 0.05, 0.1,0.5],x_train, y_train,5)
table_score('alpha',[0.005, 0.01, 0.05, 0.1,0.5],res2,5)

5it [00:40,  8.03s/it]
5it [00:35,  7.18s/it]
5it [00:18,  3.79s/it]
5it [00:16,  3.36s/it]
5it [00:11,  2.23s/it]


Unnamed: 0,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1,alpha=0.5
score_0,0.833237,0.841253,0.844697,0.841525,0.83386
score_1,0.835178,0.844613,0.843809,0.843185,0.83512
score_2,0.837296,0.847976,0.846452,0.841744,0.835672
score_3,0.836264,0.84137,0.840295,0.840786,0.833721
score_4,0.840536,0.841641,0.845735,0.843849,0.834203
mean,0.836502,0.843371,0.844198,0.842218,0.834515
std,0.002424,0.002617,0.002148,0.001127,0.000757


In [25]:
sgdclf3 = SGDClassifier()
res3 = tuning(sgdclf3,'alpha',[0.005, 0.01, 0.05, 0.1,0.5],x_train, y_train,5)
table_score('alpha',[0.005, 0.01, 0.05, 0.1,0.5],res3,5)

5it [00:25,  5.18s/it]
5it [00:18,  3.77s/it]
5it [00:10,  2.13s/it]
5it [00:07,  1.59s/it]
5it [00:04,  1.01it/s]


Unnamed: 0,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1,alpha=0.5
score_0,0.83796,0.839644,0.84078,0.84014,0.83817
score_1,0.843493,0.84369,0.83929,0.841442,0.834452
score_2,0.839933,0.843978,0.841203,0.840587,0.830398
score_3,0.842401,0.833728,0.840502,0.838178,0.836845
score_4,0.837614,0.835247,0.840199,0.839442,0.837952
mean,0.84028,0.839258,0.840395,0.839958,0.835563
std,0.002342,0.004213,0.000643,0.001101,0.0029


In [27]:
sgdclf4 = SGDClassifier(loss = 'squared_hinge',class_weight='balanced')
res4 = tuning(sgdclf4,'alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],x_train, y_train,5)
table_score('alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],res4,5)

5it [01:42, 20.53s/it]
5it [01:44, 20.88s/it]
5it [01:27, 17.53s/it]
5it [00:43,  8.80s/it]
5it [00:31,  6.30s/it]
5it [00:31,  6.34s/it]
5it [00:29,  5.88s/it]
5it [00:05,  1.13s/it]
5it [00:04,  1.03it/s]


Unnamed: 0,alpha=0.0001,alpha=0.001,alpha=0.01,alpha=0.1,alpha=1,alpha=10,alpha=100,alpha=1000,alpha=10000
score_0,0.825545,0.815018,0.757117,0.746556,0.724468,0.682722,0.628185,0.733811,0.667684
score_1,0.829734,0.808059,0.74893,0.731679,0.719524,0.699133,0.636585,0.730157,0.66002
score_2,0.820137,0.819625,0.763709,0.742176,0.725009,0.686967,0.631468,0.703865,0.664993
score_3,0.825679,0.801156,0.757939,0.746863,0.726124,0.710583,0.672331,0.643548,0.574396
score_4,0.828543,0.810791,0.754087,0.743486,0.727805,0.708237,0.646039,0.740097,0.577659
mean,0.825928,0.81093,0.756356,0.742152,0.724586,0.697529,0.642921,0.710296,0.62895
std,0.003318,0.006263,0.004847,0.005533,0.002777,0.011122,0.015894,0.035587,0.043293


In [54]:
sgdclf5 = SGDClassifier(loss = 'squared_hinge',penalty='l1',max_iter=3000,class_weight='balanced')
res5 = tuning(sgdclf5,'alpha',[100, 1000, 10000],x_train, y_train,5)
table_score('alpha',[100, 1000, 10000],res5,5)

5it [00:06,  1.33s/it]
5it [00:08,  1.76s/it]
5it [00:06,  1.34s/it]


Unnamed: 0,alpha=100,alpha=1000,alpha=10000
score_0,0.5,0.5,0.5
score_1,0.5,0.5,0.5
score_2,0.5,0.5,0.5
score_3,0.5,0.5,0.5
score_4,0.5,0.5,0.5
mean,0.5,0.5,0.5
std,0.0,0.0,0.0


In [57]:
sgdclf6 = SGDClassifier(class_weight='balanced',penalty='l1',max_iter=3000)
res6 = tuning(sgdclf6,'alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],x_train, y_train,5)
table_score('alpha',[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],res6,5)

5it [04:45, 57.07s/it]
5it [05:00, 60.13s/it]
5it [04:04, 48.99s/it]
5it [25:12, 302.40s/it]
5it [00:08,  1.66s/it]
5it [08:46, 105.31s/it]
5it [00:06,  1.34s/it]
5it [00:06,  1.34s/it]
5it [00:06,  1.35s/it]


Unnamed: 0,alpha=0.0001,alpha=0.001,alpha=0.01,alpha=0.1,alpha=1,alpha=10,alpha=100,alpha=1000,alpha=10000
score_0,0.809932,0.805834,0.796025,0.656409,0.5,0.5,0.5,0.5,0.5
score_1,0.795083,0.804832,0.790993,0.711124,0.5,0.5,0.5,0.5,0.5
score_2,0.832619,0.807038,0.793602,0.701542,0.5,0.493491,0.5,0.5,0.5
score_3,0.798756,0.808663,0.781877,0.702522,0.5,0.5,0.5,0.5,0.5
score_4,0.816424,0.807042,0.798227,0.748345,0.5,0.5,0.5,0.5,0.5
mean,0.810563,0.806682,0.792145,0.703988,0.5,0.498698,0.5,0.5,0.5
std,0.013418,0.00129,0.005673,0.029293,0.0,0.002604,0.0,0.0,0.0


In [65]:
# hinge + l2 + adaptive
sgdclf = SGDClassifier(class_weight='balanced',learning_rate='adaptive',eta0=0.0001)
res = tuning(sgdclf,'alpha',[0.001,0.005,0.01,0.05, 0.1],x_train, y_train,5)
table_score('alpha',[0.001,0.005,0.01,0.05, 0.1],res,5)

5it [00:22,  4.56s/it]
5it [00:23,  4.65s/it]
5it [00:19,  3.92s/it]
5it [00:17,  3.45s/it]
5it [00:14,  2.92s/it]


Unnamed: 0,alpha=0.001,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1
score_0,0.846552,0.848501,0.848796,0.84671,0.84362
score_1,0.845542,0.850142,0.850711,0.84872,0.845349
score_2,0.850372,0.852481,0.851376,0.847358,0.843808
score_3,0.847275,0.847294,0.848201,0.846378,0.843699
score_4,0.849163,0.848969,0.850009,0.847127,0.844199
mean,0.847781,0.849478,0.849819,0.847259,0.844135
std,0.001755,0.001758,0.001176,0.000805,0.000639


In [68]:
# hinge + l2 + adaptive
sgdclf = SGDClassifier(class_weight='balanced',learning_rate='adaptive',eta0=0.01)
res = tuning(sgdclf,'alpha',[0.001,0.005,0.01,0.05, 0.1],x_train, y_train,5)
table_score('alpha',[0.001,0.005,0.01,0.05, 0.1],res,5)

5it [00:45,  9.13s/it]
5it [00:36,  7.25s/it]
5it [00:29,  5.92s/it]
5it [00:25,  5.09s/it]
5it [00:27,  5.46s/it]


Unnamed: 0,alpha=0.001,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1
score_0,0.835792,0.850311,0.850435,0.847721,0.843355
score_1,0.837974,0.849575,0.850996,0.848704,0.842166
score_2,0.846101,0.854652,0.853455,0.849702,0.84639
score_3,0.83538,0.848374,0.850403,0.848674,0.845875
score_4,0.840801,0.84958,0.852472,0.848145,0.845558
mean,0.83921,0.850498,0.851552,0.848589,0.844669
std,0.003947,0.002168,0.001212,0.000665,0.001625


In [69]:
sgdclf = SGDClassifier(class_weight='balanced',learning_rate='adaptive',eta0=0.001)
res = tuning(sgdclf,'alpha',[0.001,0.005,0.01,0.05, 0.1],x_train, y_train,5)
table_score('alpha',[0.001,0.005,0.01,0.05, 0.1],res,5)

5it [00:41,  8.27s/it]
5it [00:30,  6.09s/it]
5it [00:27,  5.55s/it]
5it [00:22,  4.43s/it]
5it [00:22,  4.60s/it]


Unnamed: 0,alpha=0.001,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1
score_0,0.842013,0.847219,0.848475,0.84714,0.844889
score_1,0.84166,0.849404,0.851443,0.84875,0.845932
score_2,0.841646,0.851932,0.852443,0.848325,0.8454
score_3,0.835559,0.846508,0.847951,0.846764,0.844478
score_4,0.844867,0.849723,0.850745,0.847477,0.844671
mean,0.841149,0.848957,0.850212,0.847691,0.845074
std,0.003044,0.001932,0.001727,0.000739,0.000528


In [66]:
sgdclf = SGDClassifier(class_weight='balanced',learning_rate='adaptive',eta0=0.00001)
res = tuning(sgdclf,'alpha',[0.001,0.005,0.01,0.05, 0.1],x_train, y_train,5)
table_score('alpha',[0.001,0.005,0.01,0.05, 0.1],res,5)

5it [00:14,  2.83s/it]
5it [00:14,  2.94s/it]
5it [00:15,  3.05s/it]
5it [00:15,  3.12s/it]
5it [00:14,  2.81s/it]


Unnamed: 0,alpha=0.001,alpha=0.005,alpha=0.01,alpha=0.05,alpha=0.1
score_0,0.846751,0.847612,0.845529,0.844846,0.843182
score_1,0.849094,0.848818,0.84834,0.84709,0.845474
score_2,0.848165,0.847288,0.848677,0.846004,0.843956
score_3,0.845813,0.845341,0.846901,0.845436,0.842676
score_4,0.846438,0.847987,0.846856,0.846086,0.843054
mean,0.847252,0.847409,0.847261,0.845892,0.843668
std,0.001201,0.001154,0.001137,0.000747,0.000994
