## Get all your favourite imports

In [1]:
import math
import numpy as np
import pandas as pd
from scipy import stats
from IPython.display import display
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from PyImpetus import inter_IAMB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import time
import warnings
warnings.filterwarnings("ignore")

## The JanataHack Cross-Sell AnalyticsVidhya Hackathon dataset requires some specific preprocessing

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Categorical preprocessing for catboost
df_train['Driving_License'] = "D_" + df_train['Driving_License'].astype(str)
df_test['Driving_License'] = "D_" + df_test['Driving_License'].astype(str)

# Categorical preprocessing for catboost
df_train['Region_Code'] = "D_" + df_train['Region_Code'].astype(str)
df_test['Region_Code'] = "D_" + df_test['Region_Code'].astype(str)

# Categorical preprocessing for catboost
df_train['Policy_Sales_Channel'] = "D_" + df_train['Policy_Sales_Channel'].astype(str)
df_test['Policy_Sales_Channel'] = "D_" + df_test['Policy_Sales_Channel'].astype(str)

# Categorical preprocessing for catboost
vehicle_damage_map = {"Yes": 1, "No": 0}
df_train['Vehicle_Damage'] = df_train['Vehicle_Damage'].map(vehicle_damage_map)
df_test['Vehicle_Damage'] = df_test['Vehicle_Damage'].map(vehicle_damage_map)

# Categorical preprocessing for catboost
vehicle_age_map = {"> 2 Years": 2, "1-2 Year": 1, "< 1 Year": 0}
df_train['Vehicle_Age'] = df_train['Vehicle_Age'].map(vehicle_age_map)
df_test['Vehicle_Age'] = df_test['Vehicle_Age'].map(vehicle_age_map)




# Since catboost requires string type for its categorical features while other models need label encoding
# We create a new dataframe, just for feature-selection.
# This problem will not arise for other downstream classfiers such as lightgbm, XGBoost, etc.

df_train_ = pd.read_csv("train.csv")

# Create a map to convert categorical features to numerical
gender_map = {"Male": 0, "Female": 1}
vehicle_age_map = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
vehicle_damage = {'Yes': 0, 'No': 1}

df_train_["Gender"] = df_train_["Gender"].map(gender_map)
df_train_['Vehicle_Age'] = df_train_['Vehicle_Age'].map(vehicle_age_map)
df_train_['Vehicle_Damage'] = df_train_['Vehicle_Damage'].map(vehicle_damage)



# This feature is not allowed in the competition
df_train.drop(["id"], axis=1, inplace=True)
df_train_.drop(["id"], axis=1, inplace=True)

display(df_train.head())
print()
display(df_train_.head())

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,D_1,D_28.0,0,2,1,40454.0,D_26.0,217,1
1,Male,76,D_1,D_3.0,0,1,0,33536.0,D_26.0,183,0
2,Male,47,D_1,D_28.0,0,2,1,38294.0,D_26.0,27,1
3,Male,21,D_1,D_11.0,1,0,0,28619.0,D_152.0,203,0
4,Female,29,D_1,D_41.0,1,0,0,27496.0,D_152.0,39,0





Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,44,1,28.0,0,2,0,40454.0,26.0,217,1
1,0,76,1,3.0,0,1,1,33536.0,26.0,183,0
2,0,47,1,28.0,0,2,0,38294.0,26.0,27,1
3,0,21,1,11.0,1,0,1,28619.0,152.0,203,0
4,1,29,1,41.0,1,0,1,27496.0,152.0,39,0


## Perform feature selection and then perform CV to check results
#### PyImpetus uses CV internally to select the best features. So no need to run your own CV for FS

In [3]:
# Initialize your catalyst object
fs = inter_IAMB(num_simul=10, n_jobs=-1, verbose=2, random_state=27)
# The fit function returns a list of the features selected
fs.fit(df_train_.drop("Response", axis=1), df_train_["Response"])
feats = fs.final_feats_
# The transform function prunes your pandas dataset to the set of final features
X_train = fs.transform(df_train).values
# Prune the test dataset as well
X_test = fs.transform(df_test).values
Y = df_train["Response"].values
# Get indices for categorical features. Will require for catboost
cat_feat = [df_train[feats].columns.get_loc(i) for i in ['Gender', 'Driving_License', 'Region_Code','Vehicle_Damage', 'Policy_Sales_Channel'] if i in df_train[feats].columns]
print("\nX_train_shape: ", X_train.shape, "\nX_test_shape: ", X_test.shape, "\nY_shape: ", Y.shape)
print("\n\n")

# # Uncomment this if you dont want to use feature selection
# X_train = df_train.drop(["Response"], axis=1).values
# X_test = df_test.drop(["id"], axis=1).values
# Y = df_train["Response"].values
# cat_feat = [df_train.columns.get_loc(i) for  i in ['Gender', 'Driving_License', 'Region_Code','Vehicle_Damage', 'Policy_Sales_Channel']]

# Now let's perform Kfold and see what results we get
kfold, scores = KFold(n_splits=5, random_state=27, shuffle=True), list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y[train], Y[test]
    
    model = CatBoostClassifier(random_state=27, verbose=250)
    model.fit(x_train, y_train, cat_features=cat_feat)
    preds_proba = model.predict_proba(x_test)[:,1]
    
    score = roc_auc_score(y_test, preds_proba)
    scores.append(score)
    print("Score: ", score)
print("Final Average: ", sum(scores)/len(scores))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  6.4min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.4min finished


[['Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel', 'Vehicle_Age', 'Age', 'Region_Code', 'Gender'], ['Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel', 'Vehicle_Age', 'Age', 'Region_Code'], ['Vehicle_Damage', 'Previously_Insured', 'Vehicle_Age', 'Age', 'Policy_Sales_Channel', 'Region_Code', 'Gender'], ['Vehicle_Damage', 'Previously_Insured', 'Age', 'Policy_Sales_Channel', 'Vehicle_Age', 'Region_Code'], ['Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel', 'Vehicle_Age', 'Age', 'Region_Code']]


FINAL SELECTED FEATURES
##################################
Feature:  Vehicle_Damage 	Probability Score:  1.0
Feature:  Previously_Insured 	Probability Score:  1.0
Feature:  Policy_Sales_Channel 	Probability Score:  1.0
Feature:  Vehicle_Age 	Probability Score:  1.0
Feature:  Age 	Probability Score:  1.0
Feature:  Region_Code 	Probability Score:  1.0
Feature:  Gender 	Probability Score:  0.4

X_train_shape:  (381109, 7) 
X_test_shape:  (127037, 7) 
Y_shape:  

## Final training. Then generate submission file to upload on AV

In [4]:
# Final Average (using PyImpetus):  0.8579          [on LB: 0.8576] Rank=166/600
# Final Average (w/o using PyImpetus):  0.8576      [on LB: 0.8568] Rank=223/600

# Finally train the model on the whole dataset
model = CatBoostClassifier(random_state=27, verbose=250)
model.fit(X_train, Y, cat_features=cat_feat)
preds_proba = model.predict_proba(X_test)[:,1]

# And make a submission
fp = open("submit.csv", "w")
fp.write("id,Response\n")
for id_, pred in zip(df_test["id"].values, preds_proba):
    fp.write(str(id_)+","+str(pred)+"\n")
fp.close()

Learning rate set to 0.130329
0:	learn: 0.4894485	total: 161ms	remaining: 2m 40s
250:	learn: 0.2623382	total: 34.4s	remaining: 1m 42s
500:	learn: 0.2609205	total: 1m 9s	remaining: 1m 9s
750:	learn: 0.2596670	total: 1m 45s	remaining: 35s
999:	learn: 0.2584963	total: 2m 21s	remaining: 0us
