In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows',None)

In [152]:
import re
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score

In [153]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [154]:
from utils import (load_dataset,save_dataset)

In [155]:
from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split,RandomizedSearchCV
from sklearn.metrics import f1_score,classification_report,confusion_matrix,roc_auc_score

In [156]:
df_train = load_dataset('Train')
df_test = load_dataset('Test')
#df_sub = load_dataset('Sample_Submission')
print(df_train.shape)
print(df_test.shape)

(12666, 7)
(29555, 6)


In [157]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12666 entries, 0 to 12665
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area_Code      12666 non-null  int64  
 1   Locality_Code  12666 non-null  int64  
 2   Region_Code    12666 non-null  int64  
 3   Height         12666 non-null  float64
 4   Diameter       12666 non-null  float64
 5   Class          12666 non-null  int64  
 6   Species        12666 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 692.8 KB


In [158]:
df_train.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Class            0
Species          0
dtype: int64

In [159]:
df_test.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Species          0
dtype: int64

In [160]:
df_train.head()

Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species
0,4694,17,1609,3.0,5.0,7,48
1,781,7,1380,4.0,17.0,2,54
2,21014,6,1122,3.0,11.0,1,22
3,7326,6,556,3.0,34.0,7,25
4,13122,17,1752,5.0,16.0,2,128


In [161]:
df_test['Class'] = -1
df_full = pd.concat([df_train,df_test],axis=0,ignore_index=True)

In [162]:
len(df_full)

42221

In [163]:
df_full['Area_Code_Locality_Code'] = df_full['Area_Code'].astype(str) + df_full['Locality_Code'].astype(str)
df_full['Region_Code_Locality_Code'] = df_full['Region_Code'].astype(str) + df_full['Locality_Code'].astype(str)
df_full['Area_Code_Region_Code'] = df_full['Area_Code'].astype(str) + df_full['Region_Code'].astype(str)
df_full['Area_Code_Region_Code_Locality_Code'] = df_full['Area_Code'].astype(str) + df_full['Region_Code'].astype(str) + df_full['Locality_Code'].astype(str)

In [164]:
def create_maps_and_add_cols(df_data):
    df = df_data.copy()
    ops = ['min', 'max','mean','std']
    
    key_list = ['Area_Code_Locality_Code','Region_Code_Locality_Code','Area_Code_Region_Code','Area_Code_Region_Code_Locality_Code']
    var_list = ['Height','Diameter']
    
    for key in key_list:
        for var in var_list:
            df_map = df.groupby(key)[var].agg(ops)
            df_map.reset_index(inplace=True)
            df_map.set_index(key,inplace=True)
            df_map.columns = [key + '_' + var + '_' + x for x in ops]
            df_map.reset_index(inplace=True)
            df = pd.merge(df,df_map,how='left',on=key)
    df['HeightDiameter'] = df['Height'] / df['Diameter']
    df['HeightS2'] = df['Height']*df['Height']
    df['HeightS2Diameter'] = df['HeightS2'] / df['Diameter']
    df['HeightLog'] = np.log(df['Height'])
    df['DiameterLog'] = np.log(df['Diameter'])
    df['HeightSqrt'] = np.sqrt(df['Height'])
    df['DiameterSqrt'] = np.sqrt(df['Diameter'])
    df['Volume'] = df['Diameter']*df['Diameter']*df['Height']
    
    return df

In [165]:
df_full_treated = create_maps_and_add_cols(df_full)
df_full_treated.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species,Area_Code_Locality_Code,Region_Code_Locality_Code,Area_Code_Region_Code,...,Area_Code_Region_Code_Locality_Code_Diameter_mean,Area_Code_Region_Code_Locality_Code_Diameter_std,HeightDiameter,HeightS2,HeightS2Diameter,HeightLog,DiameterLog,HeightSqrt,DiameterSqrt,Volume
0,4694,17,1609,3.0,5.0,7,48,469417,160917,46941609,...,20.102564,15.862559,0.6,9.0,1.8,1.098612,1.609438,1.732051,2.236068,75.0
1,781,7,1380,4.0,17.0,2,54,7817,13807,7811380,...,17.0,,0.235294,16.0,0.941176,1.386294,2.833213,2.0,4.123106,1156.0
2,21014,6,1122,3.0,11.0,1,22,210146,11226,210141122,...,11.0,,0.272727,9.0,0.818182,1.098612,2.397895,1.732051,3.316625,363.0
3,7326,6,556,3.0,34.0,7,25,73266,5566,7326556,...,34.0,,0.088235,9.0,0.264706,1.098612,3.526361,1.732051,5.830952,3468.0
4,13122,17,1752,5.0,16.0,2,128,1312217,175217,131221752,...,16.0,,0.3125,25.0,1.5625,1.609438,2.772589,2.236068,4.0,1280.0


In [166]:
df_full_treated.isnull().sum()

Area_Code                                                0
Locality_Code                                            0
Region_Code                                              0
Height                                                   0
Diameter                                                 0
Class                                                    0
Species                                                  0
Area_Code_Locality_Code                                  0
Region_Code_Locality_Code                                0
Area_Code_Region_Code                                    0
Area_Code_Region_Code_Locality_Code                      0
Area_Code_Locality_Code_Height_min                       0
Area_Code_Locality_Code_Height_max                       0
Area_Code_Locality_Code_Height_mean                      0
Area_Code_Locality_Code_Height_std                   28723
Area_Code_Locality_Code_Diameter_min                     0
Area_Code_Locality_Code_Diameter_max                    

In [167]:
df_full_treated.columns

Index(['Area_Code', 'Locality_Code', 'Region_Code', 'Height', 'Diameter',
       'Class', 'Species', 'Area_Code_Locality_Code',
       'Region_Code_Locality_Code', 'Area_Code_Region_Code',
       'Area_Code_Region_Code_Locality_Code',
       'Area_Code_Locality_Code_Height_min',
       'Area_Code_Locality_Code_Height_max',
       'Area_Code_Locality_Code_Height_mean',
       'Area_Code_Locality_Code_Height_std',
       'Area_Code_Locality_Code_Diameter_min',
       'Area_Code_Locality_Code_Diameter_max',
       'Area_Code_Locality_Code_Diameter_mean',
       'Area_Code_Locality_Code_Diameter_std',
       'Region_Code_Locality_Code_Height_min',
       'Region_Code_Locality_Code_Height_max',
       'Region_Code_Locality_Code_Height_mean',
       'Region_Code_Locality_Code_Height_std',
       'Region_Code_Locality_Code_Diameter_min',
       'Region_Code_Locality_Code_Diameter_max',
       'Region_Code_Locality_Code_Diameter_mean',
       'Region_Code_Locality_Code_Diameter_std',
       'A

In [255]:
train_cols = [
    'Area_Code',
    'Locality_Code',
    'Region_Code',
    'Height',
    'Diameter',
       #'Class',
    'Species',
    #'Area_Code_Locality_Code',
       #'Region_Code_Locality_Code',
    #'Area_Code_Region_Code',
       'Area_Code_Locality_Code_Height_min',
       'Area_Code_Locality_Code_Height_max',
       'Area_Code_Locality_Code_Height_mean',
       'Area_Code_Locality_Code_Height_std',
       'Area_Code_Locality_Code_Diameter_min',
       'Area_Code_Locality_Code_Diameter_max',
       'Area_Code_Locality_Code_Diameter_mean',
       'Area_Code_Locality_Code_Diameter_std',
       'Region_Code_Locality_Code_Height_min',
       'Region_Code_Locality_Code_Height_max',
       'Region_Code_Locality_Code_Height_mean',
       'Region_Code_Locality_Code_Height_std',
       'Region_Code_Locality_Code_Diameter_min',
       'Region_Code_Locality_Code_Diameter_max',
       'Region_Code_Locality_Code_Diameter_mean',
       'Region_Code_Locality_Code_Diameter_std',
       'Area_Code_Region_Code_Height_min',
    'Area_Code_Region_Code_Height_max',
       'Area_Code_Region_Code_Height_mean',
    'Area_Code_Region_Code_Height_std',
       'Area_Code_Region_Code_Diameter_min',
       'Area_Code_Region_Code_Diameter_max',
       'Area_Code_Region_Code_Diameter_mean',
       'Area_Code_Region_Code_Diameter_std',
    #'Area_Code_Region_Code_Locality_Code_Height_min',
       #'Area_Code_Region_Code_Locality_Code_Height_max',
       #'Area_Code_Region_Code_Locality_Code_Height_mean',
       #'Area_Code_Region_Code_Locality_Code_Height_std',
       #'Area_Code_Region_Code_Locality_Code_Diameter_min',
       #'Area_Code_Region_Code_Locality_Code_Diameter_max',
       #'Area_Code_Region_Code_Locality_Code_Diameter_mean',
       #'Area_Code_Region_Code_Locality_Code_Diameter_std',
    #'HeightDiameter',
    #'HeightS2',
    #'HeightS2Diameter',
    #'HeightLog',
    #'DiameterLog',
    #'HeightSqrt',
    #'DiameterSqrt',
    #'Volume'
    
]
target = 'Class'

In [285]:
df_train_treated = df_full_treated[df_full_treated[target]!=-1].copy()
df_test_treated = df_full_treated[df_full_treated[target]==-1].copy()

In [286]:
cv = StratifiedKFold(n_splits=10,random_state=22,shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
score_avg = []
for (train, test), i in zip(cv.split(df_train_treated[train_cols], df_train_treated[target]), range(10)):
    m = LGBMClassifier(n_jobs=-1,random_state=22,scale_pos_weight=3,learning_rate=0.1,n_estimators=151,
                       colsample_bytree=0.5,num_leaves=70,min_child_samples=20,lambda_l1=1.6,lambda_l2=4) 
    #m=XGBClassifier(random_state=0,n_estimators=39)
    #m = CatBoostClassifier(random_state=0,n_estimators=100,verbose=0)
    m.fit(df_train_treated[train_cols].iloc[train], df_train_treated[target].iloc[train])
    score_avg.append(log_loss(y_pred=m.predict_proba(df_train_treated[train_cols].iloc[test]),y_true=df_train_treated[target].iloc[test]))
print(pd.DataFrame(data=score_avg))
print(sum(score_avg)/len(score_avg))

          0
0  0.794255
1  0.753725
2  0.732240
3  0.759642
4  0.815753
5  0.732786
6  0.781516
7  0.754754
8  0.760731
9  0.750798
0.7636199098229893


In [287]:
#model = XGBClassifier(random_state=0,n_estimators=40)
#model = XGBClassifier(random_state=0,n_estimators=100)
#model = LGBMClassifier(random_state=0)
model = LGBMClassifier(n_jobs=-1,random_state=22,scale_pos_weight=3,learning_rate=0.1,n_estimators=151,
                       colsample_bytree=0.5,num_leaves=70,min_child_samples=20,lambda_l1=1.6,lambda_l2=4)

In [288]:
model.fit(df_train_treated[train_cols],df_train_treated[target])

LGBMClassifier(colsample_bytree=0.5, lambda_l1=1.6, lambda_l2=4,
               n_estimators=151, num_leaves=70, random_state=22,
               scale_pos_weight=3)

In [289]:
#plot_importance(model)

In [290]:
y_pred_train = model.predict(df_train_treated[train_cols])

In [291]:
accuracy_score(df_train_treated[target],y_pred_train)

0.9238907310911101

In [292]:
y_pred_prob_train = model.predict_proba(df_train_treated[train_cols])

In [293]:
log_loss(df_train_treated[target],y_pred_prob_train)

0.32910641555418435

In [294]:
test_preds = pd.DataFrame(model.predict_proba(df_test_treated[train_cols]))

In [295]:
save_dataset(test_preds,name='BASE_MODEL_LGBM_TUNED_N151_NOVOL')