In [12]:
##### Install the below libraries for reproducibility
#  !pip install catboost==0.23.2  ##### catboost classifier used
# !pip install prince             ##### Multiple correspondence analysis used for categorical columns using this library

In [1]:
import os

import pandas as pd
import numpy as np
import prince

from catboost import CatBoostClassifier
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.decomposition import *

import matplotlib
%matplotlib inline

### Some custom functions to be used later for aggregation

In [2]:
def my_groupby(df,primary_keys,dictionary_ops,renaming_dict):
    '''
        Group df, aggregate it and return the result as a flat dataframe.

        df             :- Dataframe to aggregate
        primary_keys   :- List of columns to group by
        dictionary_ops :- Dictionary mapping a column to the aggregation to run on it
                          (example :- {'location_number':'count'})
        renaming_dict  :- Dictionary mapping aggregated column names to their new names,
                          applied after the group keys are moved back into columns
    '''
    grouped     = df.groupby(primary_keys)
    aggregated  = grouped.agg(dictionary_ops)
    # reset_index turns the group keys back into ordinary columns so the result can be joined
    flattened   = aggregated.reset_index()
    return flattened.rename(columns=renaming_dict)

def data_left_join(df1,df2,primary_key):
    '''
        Left join df2 onto df1.

        df1         :- Left dataframe (all of its rows are kept)
        df2         :- Right dataframe
        primary_key :- Column name, or list of column names, to join on
    '''
    joined = pd.merge(df1, df2, how='left', on=primary_key)
    return joined

def updated_df(df,primary_key,operation,columns):
    '''
        Add one group-level aggregate feature to df for every column in columns.

        df          :- Dataframe to enrich
        primary_key :- Name of the single column to group by
        operation   :- Name of the aggregation (example :- 'mean', 'std', 'nunique')
        columns     :- List of columns to aggregate

        Each new feature is named <primary_key>_<operation>_<column> and is left joined
        back onto df, so every row receives the aggregate of its own group.
        Returns the enriched dataframe; the dataframe passed in is not modified.
    '''
    for cols in columns:
        print('Aggregate ',operation ,' on column- ',cols)
        feature_name = primary_key+'_'+operation+'_'+cols

        # If the feature already exists (e.g. the cell is run a second time) the merge would
        # silently emit '<name>_x' / '<name>_y' duplicates; drop the stale copy and recompute.
        if feature_name in df.columns:
            df   = df.drop(columns=[feature_name])

        df       = data_left_join(df,
                                  my_groupby(df,
                                             [primary_key],
                                             {cols:operation},
                                             {cols:feature_name}),
                                  primary_key)

    return df



## Step 1:- Loading train and test data 


In [3]:
# Folder holding the competition CSV files. Set the MACHINE_HACK_DATA_DIR environment
# variable to run the notebook on another machine; the original location is the default.
data_dir                            = os.environ.get('MACHINE_HACK_DATA_DIR', r'D:\machine_hacks\machine_hack_17')

train_path                          = os.path.join(data_dir, 'Train.csv')
test_path                           = os.path.join(data_dir, 'Test.csv')
submis_path                         = os.path.join(data_dir, 'Sample_Submission.csv')

train_data                          = pd.read_csv(train_path)
test_data                           = pd.read_csv(test_path)

# Separate the target from the features so train and test share the same columns
train_y                             = train_data.Class.values
train_data                          = train_data.drop(['Class'],axis=1)

## Step 2:- Performing Multiple Correspondence Analysis for categorical features
We will create a new feature for the train and test datasets that best explains the correspondence/association between the categorical features.

In [4]:
# Stack train and test so the MCA is fitted on the categorical values of both sets
concat_df                          = pd.concat((train_data,test_data),axis=0)
cat_cols                           = ['Area_Code','Locality_Code','Region_Code','Species']
mca                                = prince.MCA(
    n_components=1,
    random_state=202020,
).fit(concat_df[cat_cols])

# Store item 0 of the transform output (n_components=1) as a new feature on each set
for frame in (train_data, test_data):
    frame.loc[:,'mca_cat1']        = mca.transform(frame[cat_cols])[0]


## Step 3:- Principal Component Analysis on the numerical data given

In [5]:
# Fit a one-component PCA on the numerical columns of train and test together
num_cols                           = ['Height','Diameter']
pca                                = PCA(
    n_components=1,
    random_state=202020,
).fit(concat_df[num_cols])

# Column 0 of the projection is the single component; store it on each set
for frame in (train_data, test_data):
    frame.loc[:,'pca_num']         = pca.transform(frame[num_cols])[:,0]

## Step 4:- Creation of EFB (Exclusive Feature Bundles)
Each bundle concatenates the values of two categorical columns into a single combined categorical feature, built on the train and test data together. The intent is that when the test data contains a novel value in one categorical column, the row can still be associated with a value of another categorical column that was already seen in the train data.

In [6]:
# Rebuild the stacked frame so it includes the MCA / PCA features added to train and test
del concat_df
concat_df                          = pd.concat((train_data,test_data),axis=0)

# New feature name -> the two categorical columns whose string values are joined with '_'
efb_pairs                          = {
    'EFB1': ('Locality_Code', 'Species'),
    'EFB2': ('Locality_Code', 'Region_Code'),
    'EFB3': ('Species',       'Region_Code'),
    'EFB4': ('Area_Code',     'Region_Code'),
    'EFB5': ('Area_Code',     'Locality_Code'),
    'EFB6': ('Area_Code',     'Species'),
}
for efb_name, (left_col, right_col) in efb_pairs.items():
    concat_df[efb_name]            = concat_df[left_col].astype(str)+'_'+concat_df[right_col].astype(str)


## Step 5:- Aggregate feature Creation
We create aggregate features keeping various categorical columns as primary keys.

In [7]:
# Height / Diameter ratio, left missing where Diameter is 0.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
concat_df['ratio_height_diam']     = np.where(concat_df['Diameter']!=0,concat_df['Height']/concat_df['Diameter'],np.nan)
aggregation_columns                = ['Height','Diameter','mca_cat1','pca_num','ratio_height_diam']
numerical_aggregation_primary_keys = ['Area_Code','Locality_Code','Region_Code','Species']
aggregation_operations             = ['mean','std','min','max','median']

# For every categorical key, add each statistic of every numerical column within the key's groups
for cols in numerical_aggregation_primary_keys:
    print(cols)
    for operation in aggregation_operations:
        concat_df                   = updated_df(concat_df,cols,operation,aggregation_columns)
    print('\n')

# Number of distinct values of one categorical column inside each group of another.
# Listed as (counted column, grouping keys); the order fixes the order of the new columns.
nunique_features                   = [
    ('Species',       ['Area_Code','Locality_Code','Region_Code']),
    ('Locality_Code', ['Area_Code','Region_Code','Species']),
    ('Region_Code',   ['Area_Code','Locality_Code','Species']),
]
for counted_column, grouping_keys in nunique_features:
    for grouping_key in grouping_keys:
        concat_df                   = updated_df(concat_df,grouping_key,'nunique',[counted_column])


Area_Code
Aggregate  mean  on column-  Height
Aggregate  mean  on column-  Diameter
Aggregate  mean  on column-  mca_cat1
Aggregate  mean  on column-  pca_num
Aggregate  mean  on column-  ratio_height_diam
Aggregate  std  on column-  Height
Aggregate  std  on column-  Diameter
Aggregate  std  on column-  mca_cat1
Aggregate  std  on column-  pca_num
Aggregate  std  on column-  ratio_height_diam
Aggregate  min  on column-  Height
Aggregate  min  on column-  Diameter
Aggregate  min  on column-  mca_cat1
Aggregate  min  on column-  pca_num
Aggregate  min  on column-  ratio_height_diam
Aggregate  max  on column-  Height
Aggregate  max  on column-  Diameter
Aggregate  max  on column-  mca_cat1
Aggregate  max  on column-  pca_num
Aggregate  max  on column-  ratio_height_diam
Aggregate  median  on column-  Height
Aggregate  median  on column-  Diameter
Aggregate  median  on column-  mca_cat1
Aggregate  median  on column-  pca_num
Aggregate  median  on column-  ratio_height_diam


Locality_Code

## Step 6:- Creating final train and test dataset 

In [8]:
# concat_df holds the train rows first and the test rows after them, so split by position
testcount                          = len(test_data)
count                              = len(concat_df)-testcount

# Explicit copies: the columns are modified below and must not be views into concat_df
train_data                         = concat_df.iloc[:count].copy()
test_data                          = concat_df.iloc[count:].copy()

##### We identify categorical columns here
cat_cols                           = ['Area_Code','Locality_Code','Region_Code','Species','EFB1','EFB2','EFB3','EFB4','EFB5','EFB6']
for cols in cat_cols:
    train_data[cols]               = train_data[cols].astype(str)
    test_data[cols]                = test_data[cols].astype(str)

train                              = train_data.values
test                               = test_data.values

# Positions of the categorical features in the arrays above: every object-dtype column plus
# the columns named in cat_cols, so the result does not depend on astype(str) producing
# the object dtype.
is_categorical                     = (train_data.dtypes == object).values | train_data.columns.isin(cat_cols)
cate_features_index                = np.where(is_categorical)[0]

## Step 7:- Model training and validation
Make sure the colab session has GPU selected as Hardware accelerator in the runtime type.

In [9]:
# One probability column per class found in the training target (instead of a hard-coded 8)
class_labels           = np.unique(train_y)
n_classes              = len(class_labels)
oof_pred               = np.zeros((len(train), n_classes))
y_pred_final           = np.zeros((len(test), n_classes))

# The ensemble members differ only in their number of trees
estimator_counts       = [1200, 1000]
num_models             = len(estimator_counts)

n_splits               = 44
error                  = []
kf                     = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
    X_train, X_val     = train[tr_ind], train[val_ind]
    y_train, y_val     = train_y[tr_ind], train_y[val_ind]

    models             = []
    val_preds          = []
    logloss            = []
    for model_no, n_estimators in enumerate(estimator_counts, start=1):
        model          = CatBoostClassifier(n_estimators=n_estimators,random_state=202020,verbose=False,task_type='GPU')
        # The validation fold is also handed to CatBoost as eval_set
        model.fit(X_train,y_train,cat_features = cate_features_index,eval_set=(X_val,y_val))
        model_val_pred = model.predict_proba(X_val)
        # labels= keeps log_loss well defined when a validation fold lacks one of the classes
        model_loss     = log_loss(y_val,model_val_pred,labels=class_labels)

        models.append(model)
        val_preds.append(model_val_pred)
        logloss.append(model_loss)
        print('validation logloss model %d fold-' % model_no,fold+1,': ',model_loss)

    # Softmax over the negated relative losses; the factor 1000 makes it so sharp that the
    # model with the lower validation loss receives almost all of the weight.
    relative_loss      = np.asarray(logloss)/np.sum(logloss)
    wghts              = np.exp(-1000*relative_loss)
    wghts              = wghts/wghts.sum()

    val_pred           = sum(weight*pred for weight, pred in zip(wghts, val_preds))
    print('validation logloss fold-',fold+1,': ',log_loss(y_val, val_pred, labels=class_labels))

    oof_pred[val_ind]  = val_pred

    # Each fold contributes 1/n_splits of its weighted test prediction to the final average
    fold_test_pred     = sum(weight*model.predict_proba(test) for weight, model in zip(wghts, models))
    y_pred_final      += fold_test_pred/(n_splits)

    print('\n')

print('OOF logloss:- ',(log_loss(train_y,oof_pred)))

validation logloss model 1 fold- 1 :  0.7570568117337814
validation logloss model 2 fold- 1 :  0.77118285261914
validation logloss fold- 1 :  0.7570561708009502


validation logloss model 1 fold- 2 :  0.7199792111600761
validation logloss model 2 fold- 2 :  0.73690868489152
validation logloss fold- 2 :  0.7199792132847787


validation logloss model 1 fold- 3 :  0.6349443709643815
validation logloss model 2 fold- 3 :  0.6270544694185234
validation logloss fold- 3 :  0.6270326096270531


validation logloss model 1 fold- 4 :  0.6960311012853535
validation logloss model 2 fold- 4 :  0.689148731580448
validation logloss fold- 4 :  0.6890668734053383


validation logloss model 1 fold- 5 :  0.7244444068784825
validation logloss model 2 fold- 5 :  0.7166703143539669
validation logloss fold- 5 :  0.7165812258396086


validation logloss model 1 fold- 6 :  0.7387854767145599
validation logloss model 2 fold- 6 :  0.7332455167785642
validation logloss fold- 6 :  0.7327684180822011


validation logl

## Final Step :- Output Creation

In [10]:
# The sample submission is read only for its column names, which label the predicted probabilities
submission_df       = pd.read_csv(submis_path)
columns_name        = list(submission_df.columns)

output_df           = pd.DataFrame(data=y_pred_final, columns=columns_name)
output_df.to_csv(
    'output_trial_catboost_44folds_2models_gpu.csv',
    index=False,
)
