In [62]:
import pandas as pd
import numpy as np

In [122]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [64]:
df = pd.read_csv("../data/train.csv", index_col=['Id'])
df.head()

Unnamed: 0_level_0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [66]:
X = df.drop(columns=['Class'], axis=1)
y = df.Class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((493, 56), (124, 56), (493,), (124,))

In [123]:
categorical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = 'ignore'))
])
numerical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="mean")),
    ("scaler", RobustScaler())
])
preprocessor = ColumnTransformer([
    ("categoric", categorical_pipeline, ['EJ']),
    ("numeric", numerical_pipeline, ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']),
])
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])

In [124]:
parameter = {
    "algo__n_neighbors": range(1, 51, 2),
    "algo__weights": ['uniform','distance'],
    "algo__p": range(1,10)
}
model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, scoring='f1')
model.fit(X_train, y_train)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


In [125]:
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test), model.best_params_

(1.0,
 0.607912457912458,
 0.5405405405405405,
 {'algo__n_neighbors': 1, 'algo__p': 1, 'algo__weights': 'uniform'})

In [82]:
from sklearn.svm import SVC

In [83]:
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", SVC())
])
parameter = {
    'algo__C' : range(1,20),
    'algo__gamma' : ('auto','scale'),
    'algo__tol': [.1,.01,.001,.0001]
}
model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, scoring='f1')
model.fit(X_train, y_train)
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test), model.best_params_

Fitting 3 folds for each of 152 candidates, totalling 456 fits


(0.925925925925926,
 0.5800075585789871,
 0.6486486486486486,
 {'algo__C': 3, 'algo__gamma': 'scale', 'algo__tol': 0.1})

In [86]:
from sklearn.linear_model import LogisticRegression

In [94]:
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", LogisticRegression())
])
parameter = {
    'algo__C' : range(1,20),
    'algo__verbose' : range(1,10),
    'algo__tol': [.1,.01,.001,.0001]
}
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, scoring='f1')
model.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [95]:
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test), model.best_params_

(0.7875,
 0.6217948717948718,
 0.7441860465116279,
 {'algo__verbose': 2, 'algo__tol': 0.0001, 'algo__C': 1})

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier())
])

In [99]:
pipeline.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(transformers=[('categoric',
                                    Pipeline(steps=[('inputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['EJ']),
                                   ('numeric',
                                    Pipeline(steps=[('inputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY',
                                     'AZ', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR',
                                     'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL',
                                     'CR', 'CS', 'CU', 'CW ', 'DA', 'DE', 'DF',
                   

In [103]:

parameter = {
    # 'algo__min_samples_leaf' : range(1,10),
    # 'algo__max_depth' : range(1,10),
    # 'algo__max_samples' : range(1,10),
}
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, scoring='f1', n_iter=20)
model.fit(X_train, y_train)
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test), model.best_params_



Fitting 3 folds for each of 1 candidates, totalling 3 fits


(1.0, 0.6743443754313319, 0.7499999999999999, {})

In [104]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [116]:
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", GradientBoostingClassifier())
])

In [117]:
pipeline.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(transformers=[('categoric',
                                    Pipeline(steps=[('inputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['EJ']),
                                   ('numeric',
                                    Pipeline(steps=[('inputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY',
                                     'AZ', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR',
                                     'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL',
                                     'CR', 'CS', 'CU', 'CW ', 'DA', 'DE', 'DF',
                   

In [118]:

parameter = {
    'algo__learning_rate': [.1,.01,.001,.0001],
    'algo__tol': [.1,.01,.001,.0001],
    'algo__max_features': range(1,10),
    'algo__max_depth': range(1,10)
}
model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1, scoring='f1')
model.fit(X_train, y_train)
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test), model.best_params_

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


(1.0,
 0.7745098039215687,
 0.7692307692307693,
 {'algo__learning_rate': 0.1,
  'algo__max_depth': 5,
  'algo__max_features': 8,
  'algo__tol': 0.01})