### Path

In [1]:
import os
os.chdir(r'/Users/iflab/Desktop')

### Package

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

## Data

In [3]:
data = pd.read_csv('mix_platforms.csv', encoding='utf-8',low_memory=False)

In [4]:
data.head(3)

Unnamed: 0,platforms,status,project_id,category,location,backers,pledged_percent,goal_usd,pledged_usd
0,Kickstarter,1,620302213,Art,USD,6,10000.0,0.01,100.0
1,Kickstarter,0,9572984,Film & Video,USD,0,0.0,0.15,0.0
2,Kickstarter,1,1379346088,Art,MXN,7,33.489796,0.49,16.41


In [5]:
data.drop(columns=['platforms','pledged_percent','pledged_usd'],inplace=True)

In [6]:
data.set_index('project_id',inplace=True)

In [7]:
data.head(3)

Unnamed: 0_level_0,status,category,location,backers,goal_usd
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
620302213,1,Art,USD,6,0.01
9572984,0,Film & Video,USD,0,0.15
1379346088,1,Art,MXN,7,0.49


In [8]:
data.shape

(593011, 5)

In [9]:
data['category'].loc[data['category']=='art']='Art'
data['category'].loc[data['category']=='comics']='Comics'
data['category'].loc[data['category']=='music']='Music'
data['category'].loc[data['category']=='design']='Design'
data['category'].loc[data['category']=='technology']='Technology'
data['category'].loc[data['category']=='food']='Food'
data['category'].loc[data['category']=='photography']='Photography'
data['category'].loc[data['category']=='theatre']='Theater'
data['category'].loc[data['category']=='fashion']='Fashion'
data['category'].loc[data['category']=='dance']='Dance'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [10]:
data=pd.get_dummies(data,drop_first=True)

In [11]:
data.head(3)

Unnamed: 0_level_0,status,backers,goal_usd,category_Comics,category_Crafts,category_Dance,category_Design,category_Fashion,category_Film & Video,category_Food,...,location_EUR,location_GBP,location_HKD,location_JPY,location_MXN,location_NOK,location_NZD,location_SEK,location_SGD,location_USD
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
620302213,1,6,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9572984,0,0,0.15,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1379346088,1,7,0.49,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [12]:
data.shape

(593011, 61)

In [13]:
data.to_csv('mix_platforms_statue_preprocessing.csv')

## Train Test Spliting

In [14]:
data_v1 = data.copy()

In [15]:
X = data_v1.drop("status", axis=1).values
y = data_v1["status"].values.reshape(-1,1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

## Model

In [17]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

### Decision Tree

In [20]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),
                    ('classifier', DecisionTreeClassifier())
                     ])

parameters = { 
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'classifier__criterion': [ 'gini', 'entropy'],
              'classifier__max_depth': [4, 6, 8, 10]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [21]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  3.1min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        DecisionTreeClassifier(class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features=None,
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=...
                                                               presort=False

In [22]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'scaler': None}
0.9601455920111107


In [23]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9600978176859354


In [24]:
r_dt=SearchCV.cv_results_

In [25]:
print(r_dt)

{'mean_fit_time': array([5.20049219, 4.29624605, 3.58728123, 5.40625482, 5.08885183,
       4.79821391, 6.56266737, 5.88333182, 6.17168427, 9.51352153,
       7.47713976, 6.44319873, 4.00899906, 3.88414183, 3.57450371,
       5.39100165, 4.96095896, 4.74614143, 6.32947536, 5.77044873,
       5.53561282, 7.02204485, 6.24118204, 5.24659524]), 'std_fit_time': array([0.24518477, 0.2678904 , 0.05192029, 0.06952739, 0.03078661,
       0.04762901, 0.0921489 , 0.09182156, 0.6027781 , 0.24096897,
       0.64381748, 0.14759734, 0.04534248, 0.05949743, 0.04005452,
       0.15046898, 0.05465048, 0.04548251, 0.13435847, 0.07330425,
       0.07508168, 0.16950327, 0.09277074, 0.90622653]), 'mean_score_time': array([0.11657238, 0.07766356, 0.05360799, 0.08378282, 0.08628554,
       0.05831547, 0.09131985, 0.09028053, 0.06235561, 0.11002049,
       0.10229483, 0.06740384, 0.0859375 , 0.09252434, 0.05519285,
       0.0864821 , 0.09036307, 0.0571651 , 0.08618269, 0.08714581,
       0.06082692, 0.08496642

In [26]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'scaler':

In [27]:
filename = 'SearchCV_statue_DecisionTree'
pickle.dump(SearchCV, open(filename, 'wb'))

### Random Forest

In [18]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('classifier', RandomForestClassifier())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'classifier__criterion': [ 'gini', 'entropy'],          
              'classifier__max_depth': [4, 6, 8, 10]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [19]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  3.4min finished
  self._final_estimator.fit(Xt, y, **fit_params)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               mi...
            

In [20]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__criterion': 'entropy', 'classifier__max_depth': 10, 'scaler': Normalizer(copy=True, norm='l2')}
0.9515777172842393


In [21]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9505423216794475


In [22]:
r_dt=SearchCV.cv_results_

In [23]:
print(r_dt)

{'mean_fit_time': array([6.30979581, 4.26329145, 3.68017139, 5.47624159, 5.20669417,
       4.86689534, 6.93174353, 6.31321912, 6.22902493, 8.25782485,
       7.54188237, 7.25761886, 4.12042818, 3.89211187, 3.61422486,
       5.5842185 , 5.17807541, 4.86977563, 7.03462801, 6.35437303,
       6.34176006, 8.29211502, 7.41634212, 6.35537834]), 'std_fit_time': array([0.97410081, 0.12481701, 0.21878071, 0.10486011, 0.03682026,
       0.102847  , 0.11739362, 0.10098715, 0.17355285, 0.11835767,
       0.1462855 , 0.15926482, 0.13458597, 0.15072992, 0.1511386 ,
       0.10190695, 0.1347136 , 0.05761386, 0.16943965, 0.09761123,
       0.10247178, 0.09859389, 0.06122323, 1.01501687]), 'mean_score_time': array([0.25732145, 0.15244989, 0.11084828, 0.14342885, 0.14847031,
       0.11805868, 0.1665134 , 0.16920724, 0.16083326, 0.168362  ,
       0.17859311, 0.14835434, 0.14268861, 0.14707046, 0.11458445,
       0.14883094, 0.15520658, 0.11453447, 0.16167483, 0.16480784,
       0.13138881, 0.17042375

In [24]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 4, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'scaler': None}, {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'scaler':

In [25]:
filename = 'SearchCV_statue_RandomForest'
pickle.dump(SearchCV, open(filename, 'wb'))

### AdaBoost

In [26]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),                 
                    ('classifier', AdaBoostClassifier(random_state=0))
                     ])

parameters = {    
                'scaler': [Normalizer(), MinMaxScaler(), None],           
                'classifier__n_estimators': [100, 200, 300, 400],
                'classifier__learning_rate': [0.1, 0.01, 0.001]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=2)


In [27]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 51.2min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed: 239.5min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        AdaBoostClassifier(algorithm='SAMME.R',
                                                           base_estimator=None,
                                                           learning_rate=1.0,
                                                           n_estimators=50,
                                                           random_state=0))],
                                verbose=False),
             iid='warn', n_jobs=2,
             param_grid={'classifier__learning_rate': [0.1, 0.01, 0.001],
                         'classifier__n_estimators': [100, 200, 300, 400],
                         'scaler': [

In [28]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 400, 'scaler': None}
0.9610322885596403


In [30]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9603422460527806


In [31]:
r_ab=SearchCV.cv_results_

In [32]:
print('parans:\n',r_ab['params'],'\n')
print('mean_test_score:\n',r_ab['mean_test_score'],'\n')
print('std_test_score:\n',r_ab['std_test_score'],'\n')
print('mean_train_score:\n',r_ab['mean_train_score'],'\n')
print('std_train_score:\n',r_ab['std_train_score'],'\n')

parans:
 [{'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100, 'scaler': None}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200, 'scaler': None}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 300, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 300, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 300, 'scaler': None}, {'classifier__lear

In [33]:
filename = 'SearchCV_statue_AdaBoost'
pickle.dump(SearchCV, open(filename, 'wb'))

### Logistic Regression

In [34]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),                 
                    ('classifier', LogisticRegression(random_state=0))
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],        
              'classifier__penalty': [ 'l1', 'l2'],
              'classifier__C': [1, 0.1, 0.01, 0.001]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)


In [35]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  9.8min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='warn',
                                                           n

In [36]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__C': 1, 'classifier__penalty': 'l1', 'scaler': Normalizer(copy=True, norm='l2')}
0.9503160444489004


In [37]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9498375356651414


In [38]:
r_lr=SearchCV.cv_results_

In [39]:
print('parans:\n',r_lr['params'],'\n')
print('mean_test_score:\n',r_lr['mean_test_score'],'\n')
print('std_test_score:\n',r_lr['std_test_score'],'\n')
print('mean_train_score:\n',r_lr['mean_train_score'],'\n')
print('std_train_score:\n',r_lr['std_train_score'],'\n')

parans:
 [{'classifier__C': 1, 'classifier__penalty': 'l1', 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 1, 'classifier__penalty': 'l1', 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 1, 'classifier__penalty': 'l1', 'scaler': None}, {'classifier__C': 1, 'classifier__penalty': 'l2', 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 1, 'classifier__penalty': 'l2', 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 1, 'classifier__penalty': 'l2', 'scaler': None}, {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'scaler': None}, {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'scaler': MinMaxScaler(co

In [40]:
filename = 'SearchCV_statue_Logistic'
pickle.dump(SearchCV, open(filename, 'wb'))

### SVM

In [41]:
pipeline = Pipeline([
                     ('scaler', Normalizer()),
                      ('classifier', LinearSVC())
                     ])

parameters = {    
                'scaler': [Normalizer(), MinMaxScaler(), None],           
                'classifier__C': [1, 0.1, 0.01, 0.001]
              }
scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)


In [42]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  7.8min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        LinearSVC(C=1.0, class_weight=None,
                                                  dual=True, fit_intercept=True,
                                                  intercept_scaling=1,
                                                  loss='squared_hinge',
                                                  max_iter=1000,
                                                  multi_class='ovr',
                                                  penalty='l2',
                                                  random_state=None, tol=0.0001,
                                                  verbose=0))],
                 

In [43]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__C': 1, 'scaler': Normalizer(copy=True, norm='l2')}
0.9499085813486021


In [44]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9494803901482932


In [45]:
r_lr=SearchCV.cv_results_

In [46]:
print('parans:\n',r_lr['params'],'\n')
print('mean_test_score:\n',r_lr['mean_test_score'],'\n')
print('std_test_score:\n',r_lr['std_test_score'],'\n')
print('mean_train_score:\n',r_lr['mean_train_score'],'\n')
print('std_train_score:\n',r_lr['std_train_score'],'\n')

parans:
 [{'classifier__C': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 1, 'scaler': None}, {'classifier__C': 0.1, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 0.1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 0.1, 'scaler': None}, {'classifier__C': 0.01, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 0.01, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 0.01, 'scaler': None}, {'classifier__C': 0.001, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__C': 0.001, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__C': 0.001, 'scaler': None}] 

mean_test_score:
 [0.94990858 0.7115486  0.91103264 0.9487627  0.64973486 0.89834239
 0.94845279 0.62540289 0.88865475 0.94830628 0.61766407 0.91519437] 

std_test_score:
 [0.00034978 0.01596721 0.03552251 0.00029834 0.01235598 0

In [47]:
filename = 'SearchCV_statue_SVM'
pickle.dump(SearchCV, open(filename, 'wb'))

### MLP

In [51]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),                 
                    ('classifier', MLPClassifier())
                     ])

parameters = {    
                'scaler': [Normalizer(), MinMaxScaler(), None],          
                'classifier__hidden_layer_sizes': [(32), (64), (128), (64, 64)],
                'classifier__activation': ['relu'],
                'classifier__solver': ['lbfgs'],
                'classifier__alpha': [0.1, 0.01, 0.001],
                'classifier__batch_size': ['auto'], 
                'classifier__learning_rate': ['constant'],
                'classifier__learning_rate_init': [0.01, 0.001],
                'classifier__max_iter': [200]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [52]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 90.6min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 203.6min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        MLPClassifier(activation='relu',
                                                      alpha=0.0001,
                                                      batch_size='auto',
                                                      beta_1=0.9, beta_2=0.999,
                                                      early_stopping=False,
                                                      epsilon=1e-08,
                                                      hidden_layer_sizes=(100,),
                                                      learning...
                         'classifier__hidden_layer_sizes': [32, 64, 128,
 

In [53]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__activation': 'relu', 'classifier__alpha': 0.1, 'classifier__batch_size': 'auto', 'classifier__hidden_layer_sizes': 32, 'classifier__learning_rate': 'constant', 'classifier__learning_rate_init': 0.001, 'classifier__max_iter': 200, 'classifier__solver': 'lbfgs', 'scaler': Normalizer(copy=True, norm='l2')}
0.9483380206948665


In [54]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9480206560756209


In [55]:
r_lr=SearchCV.cv_results_

In [56]:
print('parans:\n',r_lr['params'],'\n')
print('mean_test_score:\n',r_lr['mean_test_score'],'\n')
print('std_test_score:\n',r_lr['std_test_score'],'\n')
print('mean_train_score:\n',r_lr['mean_train_score'],'\n')
print('std_train_score:\n',r_lr['std_train_score'],'\n')

parans:
 [{'classifier__activation': 'relu', 'classifier__alpha': 0.1, 'classifier__batch_size': 'auto', 'classifier__hidden_layer_sizes': 32, 'classifier__learning_rate': 'constant', 'classifier__learning_rate_init': 0.01, 'classifier__max_iter': 200, 'classifier__solver': 'lbfgs', 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__activation': 'relu', 'classifier__alpha': 0.1, 'classifier__batch_size': 'auto', 'classifier__hidden_layer_sizes': 32, 'classifier__learning_rate': 'constant', 'classifier__learning_rate_init': 0.01, 'classifier__max_iter': 200, 'classifier__solver': 'lbfgs', 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__activation': 'relu', 'classifier__alpha': 0.1, 'classifier__batch_size': 'auto', 'classifier__hidden_layer_sizes': 32, 'classifier__learning_rate': 'constant', 'classifier__learning_rate_init': 0.01, 'classifier__max_iter': 200, 'classifier__solver': 'lbfgs', 'scaler': None}, {'classifier__activation': 'relu', 'classifier__

In [57]:
filename = 'SearchCV_statue_mlp'
pickle.dump(SearchCV, open(filename, 'wb'))

### XGBoost

In [58]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('classifier', xgb.XGBClassifier())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'classifier__learning_rate': [1, 0.1, 0.01],
              'classifier__max_depth': [4, 6, 8, 10],
              'classifier__min_child_weight': [1, 3],
              'classifier__gamma': [0.0, 0.1, 0.2],
              'classifier__colsample_bytree': [0.3, 0.5, 0.7, 1]
              }

scoring = 'roc_auc'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [59]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 104.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 249.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 449.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 729.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 1143.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 1652.7min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 2317.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 3253.0min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 3578.3min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('classifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      gamma=0,
                                                      learning_rate=0.1,
                                                      max_delta_step=0,
                                                      max_depth...
             param_g

In [60]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'classifier__colsample_bytree': 0.5, 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 10, 'classifier__min_child_weight': 1, 'scaler': None}
0.9702318533407693


In [61]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (roc_auc): 0.9700551709137117


In [62]:
r_lr=SearchCV.cv_results_

In [63]:
print('parans:\n',r_lr['params'],'\n')
print('mean_test_score:\n',r_lr['mean_test_score'],'\n')
print('std_test_score:\n',r_lr['std_test_score'],'\n')
print('mean_train_score:\n',r_lr['mean_train_score'],'\n')
print('std_train_score:\n',r_lr['std_train_score'],'\n')

parans:
 [{'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0.0, 'classifier__learning_rate': 1, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0.0, 'classifier__learning_rate': 1, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0.0, 'classifier__learning_rate': 1, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'scaler': None}, {'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0.0, 'classifier__learning_rate': 1, 'classifier__max_depth': 4, 'classifier__min_child_weight': 3, 'scaler': Normalizer(copy=True, norm='l2')}, {'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0.0, 'classifier__learning_rate': 1, 'classifier__max_depth': 4, 'classifier__min_child_weight': 3, 'scaler': MinMaxScaler(copy=Tr

In [64]:
filename = 'SearchCV_statue_XGBoost'
pickle.dump(SearchCV, open(filename, 'wb'))