In [None]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Put the Colab Notebooks folder on the import path
# (presumably where the Judge_pipelines module imported next lives — TODO confirm).
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

In [None]:
import Judge_pipelines
from Judge_pipelines import Judge

# Customized classes for Pipeline

In [None]:
class Encoding_Strategy_string(BaseEstimator,TransformerMixin):
    """Expand a string column into per-position integer codes.

    Every character position ``i`` of ``column`` becomes a ``position_<i>``
    column holding ``ord(char) - ord("A")``, and a ``len_unique_values``
    column holds the number of distinct characters in each string. The
    source column itself is removed from the output.

    Parameters
    ----------
    column : str
        Name of the string column to encode.

    Attributes
    ----------
    string_length : int
        Longest string length found in ``column`` during ``fit``; the number
        of ``position_<i>`` columns that ``transform`` creates.
    """

    def __init__(self,column):
        self.column = column

    def fit(self,X,y=None):
        """Record the longest string length in ``column``; ``y`` is ignored."""
        longest = X[self.column].str.len().max()
        # ``max`` is NaN when there is nothing to measure and a float when
        # missing values are present; ``range`` in ``transform`` needs an int.
        self.string_length = 0 if pd.isna(longest) else int(longest)

        return self

    def transform(self,X):
        """Return a copy of ``X`` with ``column`` replaced by its encoded columns.

        Strings shorter than the fitted length get NaN in the positions they
        do not have. Missing values in ``column`` are not supported.
        """
        X_ = X.copy()
        strings = X_[self.column]

        # Independent of the character position, so computed once up front
        # rather than on every loop iteration.
        unique_counts = [len(set(s)) for s in strings]

        for position in range(self.string_length):
            name = "position_" + str(position)
            # ``str.get`` yields NaN past the end of a short string;
            # ``na_action="ignore"`` keeps that NaN instead of calling ``ord`` on it.
            X_[name] = strings.str.get(position).map(
                lambda char: ord(char) - ord("A"), na_action="ignore"
            )
            if position == 0:
                # Added right after position_0 so the output column order is
                # the same as before and previously fitted pipelines that
                # depend on feature order stay aligned.
                X_["len_unique_values"] = unique_counts

        return X_.drop(columns=self.column)

  

In [1]:
class Exploiting_found_interactions(BaseEstimator,TransformerMixin):
    """Add hand-crafted interaction features built from sums of numeric columns.

    ``transform`` adds four columns to a copy of the input:

    * ``f_00_01``    - ``f_00 + f_01``
    * ``f_02_21``    - band of ``f_21 + f_02``
    * ``f_05_22``    - band of ``f_22 + f_05``
    * ``f_00_01_26`` - band of ``f_00_01 + f_26``

    A band is ``choices[0]`` below the lower threshold, ``choices[1]`` between
    the thresholds and ``choices[2]`` above the upper one. Rows matching no
    condition receive ``np.select``'s default of 0; for ``f_05_22`` that
    includes sums in ``[-5.4, -5.3)``.
    """

    def __init__(self):
        # Labels for the low / middle / high band, in that order.
        self.choices = [-1,0,1]

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the interaction columns appended."""
        frame = X.copy()

        frame["f_00_01"] = frame["f_00"] + frame["f_01"]

        # Each sum is evaluated once and reused for its three band conditions.
        sum_02_21 = frame["f_21"] + frame["f_02"]
        sum_05_22 = frame["f_22"] + frame["f_05"]
        sum_00_01_26 = frame["f_00_01"] + frame["f_26"]

        bands_02_21 = [
            sum_02_21 < -5.3,
            (sum_02_21 >= -5.3) & (sum_02_21 <= 5.2),
            sum_02_21 > 5.2,
        ]
        bands_05_22 = [
            sum_05_22 < -5.4,
            (sum_05_22 >= -5.3) & (sum_05_22 <= 5.1),
            sum_05_22 > 5.1,
        ]
        bands_00_01_26 = [
            sum_00_01_26 < -5.3,
            (sum_00_01_26 >= -5.3) & (sum_00_01_26 <= 5.0),
            sum_00_01_26 > 5.0,
        ]

        frame["f_02_21"] = np.select(bands_02_21, self.choices)
        frame["f_05_22"] = np.select(bands_05_22, self.choices)
        frame["f_00_01_26"] = np.select(bands_00_01_26, self.choices)

        return frame




        
    
      
      
    
          


NameError: ignored

In [None]:
 # Ask TensorFlow for the name of the GPU device; an empty string means none was found.
 import tensorflow as tf 
 tf.test.gpu_device_name() 

''

# Building Pipeline

In [None]:


# Display name of the single pipeline evaluated in this section.
pipe_name = np.array(["pipe_LGBM"])

# Scorer names to report for the pipeline.
metrics = ["accuracy", "precision", "recall", "roc_auc", "f1_macro"]

# No parameter settings are supplied.
params_pipe = {}

# Importing data and modeling

In [None]:
# Load the training set from Drive (the path is relative to the current working directory).
train = pd.read_csv('drive/MyDrive/Colab Notebooks/Kaggle_competition_may/train.csv')

In [None]:
# Separate the label column from the feature columns.
y = train['target']
X = train.drop(columns='target')

In [None]:
# Remove the row identifier from the features.
# errors='ignore' keeps this cell re-runnable: once `id` is gone, a second run is a no-op
# instead of a KeyError.
X = X.drop(columns='id', errors='ignore')

In [None]:
# Preview the first five rows of the feature table.
X.head(5)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30
0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,1,...,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0
1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,4,...,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0
2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,2,...,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2
3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,1,...,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0
4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,0,...,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1


In [None]:
# 5-fold cross-validation of `pipe` on the full training set.
# random_state fixes the shuffled fold assignment so the scores can be reproduced.
# NOTE(review): `pipe` is not defined in the cells visible here; it has to be created
# before this cell for a fresh Restart & Run All to succeed.
cv_ex = KFold(n_splits=5,shuffle=True,random_state=42)
cvl = cross_validate(pipe,X,y,scoring=["accuracy","precision","recall","roc_auc","f1_macro"],cv=cv_ex)
print(cvl)

[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.2min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.8s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.2min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.2min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.8s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipel

In [None]:
# Same 5-fold evaluation as the preceding cell; the `cvl` produced here is the one
# converted to a DataFrame and saved further down.
# random_state fixes the shuffled fold assignment so the saved scores can be reproduced.
# NOTE(review): `pipe` is not defined in the cells visible here — confirm where it is built.
cv_ex = KFold(n_splits=5,shuffle=True,random_state=42)
cvl = cross_validate(pipe,X,y,scoring=["accuracy","precision","recall","roc_auc","f1_macro"],cv=cv_ex)
print(cvl)

[Pipeline] ................ (step 1 of 4) Processing es, total=  12.0s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.3min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.1min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.2min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipel

In [None]:
# Create the Judge helper imported from Judge_pipelines
# ("trial" is presumably a label for this run — TODO confirm against the Judge source).
judge = Judge("trial")

In [None]:
# Hand the data, parameters, pipeline/name and metric list to the Judge in one chained call.
(
    judge
    .set_data(X,y)
    .set_params(params_pipe)
    .set_pipes_and_names(pipe,pipe_name)
    .set_metrics(metrics)
)

<Judge_pipelines.Judge at 0x7f30e2a41510>

In [None]:
# Run the evaluation and show the resulting score table.
# NOTE(review): the meaning of the two integer arguments is defined in Judge_pipelines,
# which is not visible here — presumably fold counts; confirm.
judge.get_final_performance(4,4)

[Pipeline] ................ (step 1 of 4) Processing es, total=  14.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.6s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.2min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.5s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 9.0min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.9s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.5s
[Pipeline] ............... (step 4 of 4) Processing clf, total= 8.9min
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.5s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipel

Unnamed: 0,model,accuracy,precision,recall,roc_auc,f1_macro
0,pipe_LGBM,96.95,96.98,96.74,99.6,96.95


Results seem to be best for LGBM with 5000 estimators. Let's look at other models.

In [None]:
# Turn the cross_validate result dict into a DataFrame, one column per dict key.
Results_dataframe_2 = pd.DataFrame.from_dict(cvl)

In [None]:
# Save the cross-validation scores to Drive without the row index.
Results_dataframe_2.to_csv('drive/MyDrive/Colab Notebooks/Kaggle_competition_may/results_2.csv',index=False)


In [None]:
# Preview the first five rows of the feature table.
# NOTE(review): the output recorded for this cell shows an `id` column, although `id` is
# dropped from X earlier in the notebook — the output may come from a different kernel state; confirm.
X.head(5)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,3.058541,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.17905,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,-0.784235,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,...,-2.343819,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,...,1.133665,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1


In [None]:
pip install xgboost --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |████████████████████████████████| 192.9 MB 59 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.6.1


In [None]:
# Pipe parameters for Judge

# Display names, in the same order as the pipelines in `pipes`.
# All labels use the underscore form ("pipe_LGBM", not "pipe LGBM").
pipe_names = np.array([
    "pipe_lr",
    "pipe_dt",
    "pipe_ada",
    "pipe_XGB",
    "pipe_LGBM"
])

# Scorer names to report for each pipeline.
metrics = [
    'accuracy',
    'precision',
    'recall',
    'roc_auc',
    'f1_macro'
]

# No parameter settings are supplied.
params_pipe = {

}

In [None]:
# Let's try different pipelines

def _build_pipe(clf_step, classifier):
    """Return the shared preprocessing steps followed by ``classifier`` as the last step.

    ``clf_step`` is the step name given to the classifier inside the pipeline.
    """
    return Pipeline(
        steps=[
            ("es", Encoding_Strategy_string(column="f_27")),
            ("interaction", Exploiting_found_interactions()),
            ("norm", StandardScaler()),
            (clf_step, classifier),
        ],
        verbose=True,
    )


# One pipeline per candidate model; only the final classifier differs.
pipes = [
    _build_pipe("clf_1", LogisticRegression(solver="liblinear")),
    _build_pipe("clf_2", DecisionTreeClassifier()),
    _build_pipe("clf_3", AdaBoostClassifier()),
    _build_pipe("clf_4", XGBClassifier(use_label_encoder=False,eval_metric="logloss")),
    _build_pipe("clf_5", LGBMClassifier(n_estimators=5000,min_child_samples=80)),
]


Let's upgrade the libraries that provide the algorithms.

In [None]:
pip install tree --upgrade 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tree
  Downloading Tree-0.2.4.tar.gz (6.5 kB)
Collecting svgwrite
  Downloading svgwrite-1.4.2-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.4 MB/s 
Building wheels for collected packages: tree
  Building wheel for tree (setup.py) ... [?25l[?25hdone
  Created wheel for tree: filename=Tree-0.2.4-py3-none-any.whl size=7873 sha256=e060e6e97a5c94d7ae73833ba04803463d989a37626f4f13a92bc79cb94ab636
  Stored in directory: /root/.cache/pip/wheels/64/46/53/3a413f321c09b0df8d4a26edd5fc3501c6c1b238497d166638
Successfully built tree
Installing collected packages: svgwrite, tree
Successfully installed svgwrite-1.4.2 tree-0.2.4


In [None]:
pip install sklearn --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Since the metrics do not seem to vary much across cross-validation folds, we can reduce the number of folds to 3 to cut computation time.

In [None]:
# Fresh Judge instance for the model comparison
# ("machine_analysis" is presumably a label for this run — TODO confirm against the Judge source).
judge = Judge("machine_analysis")

In [None]:
# Hand the data, parameters, metric list and the pipelines with their names to the Judge.
(
    judge
    .set_data(X,y)
    .set_params(params_pipe)
    .set_metrics(metrics)
    .set_pipes_and_names(pipes,pipe_names)
)

<Judge_pipelines.Judge at 0x7fbbbcb8b710>

In [None]:
# Evaluate every configured pipeline and keep the resulting score table.
# NOTE(review): the two integer arguments are interpreted by Judge_pipelines, which is
# not visible here — presumably fold counts, per the note above about using 3; confirm.
final_tab = judge.get_final_performance(3,3)


[Pipeline] ................ (step 1 of 4) Processing es, total=  20.7s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.2s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   1.1s
[Pipeline] ............. (step 4 of 4) Processing clf_1, total=   5.8s
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.8s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.5s
[Pipeline] ............. (step 4 of 4) Processing clf_1, total=   5.3s
[Pipeline] ................ (step 1 of 4) Processing es, total=  13.2s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   0.5s
[Pipeline] ............. (step 4 of 4) Processing clf_1, total=   5.4s
[Pipeline] ................ (step 1 of 4) Processing es, total=  11.8s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.1s
[Pipel

In [None]:
# Display the per-model score table.
final_tab

Unnamed: 0,model,accuracy,precision,recall,roc_auc,f1_macro
0,pipe_lr,73.97,73.84,72.0,81.83,73.93
1,pipe_dt,81.64,81.09,81.2,81.63,81.63
2,pipe_ada,75.79,75.83,73.74,83.61,75.75
3,pipe_XGB,94.36,94.53,93.83,98.74,94.35
4,pipe LGBM,96.91,96.95,96.68,99.59,96.91


In [None]:
# Let's save results
# The table is bound to `final_tab` (lower case); `Final_tab` does not exist and raised NameError.
final_tab.to_csv('drive/MyDrive/Colab Notebooks/Kaggle_competition_may/final_tab.csv',index=False)

In [None]:
# At the end of the day, best model seems to be LGBM with 5000 estimators. Let's fit it.

# Production model

# Same preprocessing as in the comparison, ending in the LGBM classifier.
production_steps = [
    ("es", Encoding_Strategy_string(column="f_27")),
    ("interaction", Exploiting_found_interactions()),
    ("norm", StandardScaler()),
    ("clf_5", LGBMClassifier(n_estimators=5000,min_child_samples=80)),
]
Production_pipe = Pipeline(steps=production_steps, verbose=True)

# Fit on the full training set.
Production_Model = Production_pipe.fit(X,y)



[Pipeline] ................ (step 1 of 4) Processing es, total=  32.3s
[Pipeline] ....... (step 2 of 4) Processing interaction, total=   0.2s
[Pipeline] .............. (step 3 of 4) Processing norm, total=   1.1s
[Pipeline] ............. (step 4 of 4) Processing clf_5, total=14.6min


In [None]:
# Load the test set from Drive (the path is relative to the current working directory).
test = pd.read_csv('drive/MyDrive/Colab Notebooks/Kaggle_competition_may/test.csv')

In [None]:
# Prepare the prediction inputs: keep the ids for the output table and
# drop them from the features passed to the model.
ids = test['id']
X_test = test.drop(columns='id')

In [None]:
# Predict labels for the test features with the fitted production pipeline.
predictions = Production_Model.predict(X_test)

In [None]:
# Wrap the predictions in a Series (default integer index, no name).
predictions_series = pd.Series(predictions)

In [None]:
# Put ids and predictions side by side; concat matches rows on their index labels.
predictions_dataframe = pd.concat([ids,predictions_series],axis=1)

In [None]:
# The unnamed Series ends up as column 0 after the concat; give it a readable name.
predictions_dataframe = predictions_dataframe.rename(columns={0:"predictions"})

In [None]:
# Display the id / prediction table.
predictions_dataframe

Unnamed: 0,id,predictions
0,900000,1
1,900001,1
2,900002,0
3,900003,0
4,900004,1
...,...,...
699995,1599995,1
699996,1599996,1
699997,1599997,0
699998,1599998,0


In [None]:
# Add the competition folder to the import path.
# NOTE(review): no import from this folder is visible after this cell, and `sys` is
# already imported earlier in the notebook — confirm whether this cell is still needed.
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Kaggle_competition_may')

In [None]:
# Drive folder for saved artefacts. There is no trailing slash, so a path
# separator is needed when a file name is appended to it.
filepath = '/content/drive/MyDrive/Colab Notebooks/Kaggle_competition_may'

In [None]:
# Let's save the model and the dataframe
import os
import pickle

# Join with a path separator: `filepath` has no trailing slash, so plain string
# concatenation wrote the model next to the folder as
# ".../Kaggle_competition_mayfinalized_model.sav" instead of inside it.
model_path = os.path.join(filepath, 'finalized_model.sav')

# NOTE: a pickle should only ever be loaded back from a trusted location.
with open(model_path,'wb') as f:
  pickle.dump(Production_Model,f)

In [None]:
# Save the id / prediction table. index=False matches the other to_csv calls in this
# notebook; without it the row index is written as an extra unnamed first column.
predictions_dataframe.to_csv('drive/MyDrive/Colab Notebooks/Kaggle_competition_may/predictions_test.csv',index=False)