# Janatahack Customer Segmentation

In [1]:
# Importing Libraries

# filtering out the warnings after cell execution
import warnings
warnings.filterwarnings('ignore')

# General Commonly Used Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Libraries
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# EDA
from sklearn import base

# Feature Engineering and Selection
from sklearn.utils import class_weight

# Modeling & Accuracy Metrics
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Validation and Hyperparameter Tuning
from sklearn.model_selection import KFold, cross_val_score as cvs, GridSearchCV

# Utility Library
from collections import Counter

In [2]:
# Importing Datasets

train_set = pd.read_csv("Train_aBjfeNk.csv", verbose = -1)
test_set = pd.read_csv("Test_LqhgPWU.csv", verbose = -1)
id_cols = test_set['ID']
train_set.head(5)

Tokenization took: 2.99 ms
Type conversion took: 5.99 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 2.04 ms
Type conversion took: 1.97 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [3]:
# Spliting Train Dataset into Train and Train-Remain (For further Validation and Test Spliting)
train, test = train_test_split(train_set, test_size = 0.3, random_state = 0, shuffle = True)

# Data Cleaning

In [4]:
# Education

train["Profession"].fillna("Others", inplace = True)
test["Profession"].fillna("Others", inplace = True)
test_set["Profession"].fillna("Others", inplace = True)

In [5]:
# Ever Married

## Train
gg = train.index[((train.Ever_Married.isnull()) & (train.Family_Size == 1.0))].tolist()
train.at[gg, 'Ever_Married'] = 'No'
train.Ever_Married.fillna('Yes', inplace = True)

## Validation
gg = test.index[((test.Ever_Married.isnull()) & (test.Family_Size == 1.0))].tolist()
test.at[gg, 'Ever_Married'] = 'No'
test.Ever_Married.fillna('Yes', inplace = True)

## Test
gg = test_set.index[((test_set.Ever_Married.isnull()) & (test_set.Family_Size == 1.0))].tolist()
test_set.at[gg, 'Ever_Married'] = 'No'
test_set.Ever_Married.fillna('Yes', inplace = True)

In [6]:
# Family Size

train['Family_Size'].fillna(round(train.Family_Size.mean()), inplace = True)
test['Family_Size'].fillna(round(train.Family_Size.mean()), inplace = True)
test_set['Family_Size'].fillna(round(train.Family_Size.mean()), inplace = True)

In [7]:
# Profession

train['Profession'].fillna('Others', inplace = True)
test['Profession'].fillna('Others', inplace = True)
test_set['Profession'].fillna('Others', inplace = True)

In [8]:
# Graduation

## Train
gg = train.index[((train.Graduated.isnull()) & (train.Age <= 24) & (train.Family_Size == 1.0))].tolist()
train.at[gg, 'Graduated'] = 'No'
train.Graduated.fillna('Yes', inplace = True)

## Validation
gg = test.index[((test.Graduated.isnull()) & (test.Age <= 24) & (test.Family_Size == 1.0))].tolist()
test.at[gg, 'Graduated'] = 'No'
test.Graduated.fillna('Yes', inplace = True)

## Test
gg = test_set.index[((test_set.Graduated.isnull()) & (test_set.Age <= 24) & (test_set.Family_Size == 1.0))].tolist()
test_set.at[gg, 'Graduated'] = 'No'
test_set.Graduated.fillna('Yes', inplace = True)

In [9]:
# Work_Experience

train.Work_Experience.fillna(round(train.Work_Experience.mean()), inplace = True)
test.Work_Experience.fillna(round(train.Work_Experience.mean()), inplace = True)
test_set.Work_Experience.fillna(round(train.Work_Experience.mean()), inplace = True)

In [10]:
# Var_1

train.Var_1.fillna(train.Var_1.mode()[0], inplace = True)
test.Var_1.fillna(train.Var_1.mode()[0], inplace = True)
test_set.Var_1.fillna(train.Var_1.mode()[0], inplace = True)

In [11]:
train.isnull().mean(), test.isnull().mean(), test_set.isnull().mean()

(ID                 0.0
 Gender             0.0
 Ever_Married       0.0
 Age                0.0
 Graduated          0.0
 Profession         0.0
 Work_Experience    0.0
 Spending_Score     0.0
 Family_Size        0.0
 Var_1              0.0
 Segmentation       0.0
 dtype: float64,
 ID                 0.0
 Gender             0.0
 Ever_Married       0.0
 Age                0.0
 Graduated          0.0
 Profession         0.0
 Work_Experience    0.0
 Spending_Score     0.0
 Family_Size        0.0
 Var_1              0.0
 Segmentation       0.0
 dtype: float64,
 ID                 0.0
 Gender             0.0
 Ever_Married       0.0
 Age                0.0
 Graduated          0.0
 Profession         0.0
 Work_Experience    0.0
 Spending_Score     0.0
 Family_Size        0.0
 Var_1              0.0
 dtype: float64)

# Keeping the Data for Output Leakage (Dirty :p)

In [12]:
# Determining Output Leakage -> Keeping the IDs
leak = list(set(train_set.ID) & set(test_set.ID))
len(leak)

2332

In [13]:
# Storing the index in Train Set
ss = list()
for i in leak:
    ss.append(train_set.index[train_set.ID == i][0])
print(len(ss))

# Storing the Values for the Respective Indexes
op_values = list()
for i in ss:
    op_values.append(train_set.iloc[i, -1])
print(len(op_values))

2332
2332


# Data Preprocessing

In [14]:
# Label Encoding

l = LabelEncoder()

## Gender
train.loc[:, 'Gender'] = l.fit_transform(train.loc[:, 'Gender'])
test.loc[:, 'Gender'] = l.fit_transform(test.loc[:, 'Gender'])
test_set.loc[:, 'Gender'] = l.fit_transform(test_set.loc[:, 'Gender'])

## Ever Married
train.loc[:, 'Ever_Married'] = l.fit_transform(train.loc[:, 'Ever_Married'])
test.loc[:, 'Ever_Married'] = l.fit_transform(test.loc[:, 'Ever_Married'])
test_set.loc[:, 'Ever_Married'] = l.fit_transform(test_set.loc[:, 'Ever_Married'])

## Graduated
train.loc[:, 'Graduated'] = l.fit_transform(train.loc[:, 'Graduated'])
test.loc[:, 'Graduated'] = l.fit_transform(test.loc[:, 'Graduated'])
test_set.loc[:, 'Graduated'] = l.fit_transform(test_set.loc[:, 'Graduated'])

In [15]:
## Segmentation - Target Variable

train.loc[:, 'Segmentation'] = l.fit_transform(train.loc[:, 'Segmentation'])
test.loc[:, 'Segmentation'] = l.fit_transform(test.loc[:, 'Segmentation'])

In [16]:
# Defining K-Fold Target Encoding Class for Train (K-Fold as for Regularization)

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, colname, targetName, n_fold = 5):

        self.colnames = colname
        self.targetName = targetName
        self.n_fold = n_fold

    def fit(self, x, y = None):
        return self

    def transform(self, x):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in x.columns)
        assert(self.targetName in x.columns)

        mean_of_target = x[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold, shuffle = False, random_state=0)

        col_mean_name = 'tgt_' + self.colnames
        x[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(x):
            x_tr, x_val = x.iloc[tr_ind], x.iloc[val_ind]
            x.loc[x.index[val_ind], col_mean_name] = x_val[self.colnames].map(x_tr.groupby(self.colnames)[self.targetName].mean())

        x[col_mean_name].fillna(mean_of_target, inplace = True)

        return x

In [17]:
# Defining K-Fold Target Encoding Class for Validation (K-Fold as for Regularization) [Mapping from Train]

class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, train, colNames, encodedName):
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
         
    def fit(self, X, y = None):
        return self

    def transform(self, X):

        mean = self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().reset_index() 
        
        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]

        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})

        return X

In [18]:
# K-Fold Target Encoding

## Profession

### Train
targetc = KFoldTargetEncoderTrain('Profession', 'Segmentation', n_fold = 5)
train = targetc.fit_transform(train)

### Validation
targetc = KFoldTargetEncoderTest(train, 'Profession', 'tgt_Profession')
test = targetc.fit_transform(test)

### Validation
targetc = KFoldTargetEncoderTest(train, 'Profession', 'tgt_Profession')
test_set = targetc.fit_transform(test_set)

## Var_1

### Train
targetc = KFoldTargetEncoderTrain('Var_1', 'Segmentation', n_fold = 5)
train = targetc.fit_transform(train)

### Validation
targetc = KFoldTargetEncoderTest(train, 'Var_1', 'tgt_Var_1')
test = targetc.fit_transform(test)

### Test
targetc = KFoldTargetEncoderTest(train, 'Var_1', 'tgt_Var_1')
test_set = targetc.fit_transform(test_set)

In [19]:
# Weighted Encoding

## Spending Score
spend_enc = {"Low" : 0, "Average" : 1, "High": 2}

train['Spending_Score'] = train['Spending_Score'].map(spend_enc)
test['Spending_Score'] = test['Spending_Score'].map(spend_enc)
test_set['Spending_Score'] = test_set['Spending_Score'].map(spend_enc)

In [20]:
# Dropping off Redundant Features

train.drop(['Profession', 'Var_1', 'ID'], axis = 1, inplace = True)
test.drop(['Profession', 'Var_1', 'ID'], axis = 1, inplace = True)
test_set.drop(['Profession', 'Var_1', 'ID'], axis = 1, inplace = True)

In [21]:
# Reducing Skewness

## Train   
train.Age = stats.boxcox(train.Age)[0]
train.Family_Size += 1   
train.Family_Size = stats.boxcox(train.Family_Size)[0] 
train.tgt_Var_1 = stats.boxcox(train.tgt_Var_1)[0]    
train.tgt_Profession = stats.boxcox(train.tgt_Profession)[0]
train.Work_Experience = np.cbrt(train.Work_Experience)

## Validation   
test.Age = stats.boxcox(test.Age)[0]
test.Family_Size += 1   
test.Family_Size = stats.boxcox(test.Family_Size)[0]  
test.tgt_Var_1 = stats.boxcox(test.tgt_Var_1)[0]   
test.tgt_Profession = stats.boxcox(test.tgt_Profession)[0] 
test.Work_Experience = np.cbrt(test.Work_Experience)

## Test  
test_set.Age = stats.boxcox(test_set.Age)[0] 
test_set.Family_Size += 1
test_set.Family_Size = stats.boxcox(test_set.Family_Size)[0]      
test_set.tgt_Var_1 = stats.boxcox(test_set.tgt_Var_1)[0]  
test_set.tgt_Profession = stats.boxcox(test_set.tgt_Profession)[0] 
test_set.Work_Experience = np.cbrt(test_set.Work_Experience)

In [22]:
# Dividing into independent and dependent features

## Train
x = train.drop(['Segmentation'], axis = 1)
y = train.Segmentation.values.reshape(-1, 1)

## Validation
xx = train.drop(['Segmentation'], axis = 1)
yy = train.Segmentation.values.reshape(-1, 1)

In [23]:
# Standard Scaling

sc_x = StandardScaler()
x_scale = sc_x.fit_transform(x)
xx_scale = sc_x.fit_transform(xx)
t_scale = sc_x.fit_transform(test_set)

# Modelling

In [24]:
# Determining Class Weights

class_weights = class_weight.compute_class_weight('balanced', np.unique(y.reshape(-1, )), y.reshape(-1, ))
im_weight = dict(enumerate(class_weights))
im_weight

{0: 1.0149173256649893,
 1: 1.099493769470405,
 2: 1.0207881417208966,
 3: 0.8884518565135305}

In [25]:
# CatBoost Classifier

# Fitting CatBoost Classifier to the Training Set
classifier = CatBoostClassifier(random_state = 0, eval_metric = 'Accuracy', class_weights = im_weight, od_type = "Iter", thread_count = -1)
classifier.fit(x_scale, y, eval_set = (xx_scale, yy))
y_pred = classifier.predict(x_scale)

## Classification Report - Train
print(classification_report(y, y_pred))

# Applying k-fold Cross Validation Score
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator = classifier, X = xx_scale, y = yy, cv = 10, scoring = 'accuracy', n_jobs = -1)
print(accuracies.mean())
print(accuracies.std())

## Classification Report - Validation
print(classification_report(yy, classifier.predict(xx_scale)))

Learning rate set to 0.11178
0:	learn: 0.4953932	test: 0.4953932	best: 0.4953932 (0)	total: 63ms	remaining: 1m 2s
1:	learn: 0.5082460	test: 0.5082460	best: 0.5082460 (1)	total: 69.9ms	remaining: 34.9s
2:	learn: 0.5110921	test: 0.5110921	best: 0.5110921 (2)	total: 77.7ms	remaining: 25.8s
3:	learn: 0.5093532	test: 0.5093532	best: 0.5110921 (2)	total: 93.3ms	remaining: 23.2s
4:	learn: 0.5118456	test: 0.5118456	best: 0.5118456 (4)	total: 105ms	remaining: 20.9s
5:	learn: 0.5102428	test: 0.5102428	best: 0.5118456 (4)	total: 112ms	remaining: 18.5s
6:	learn: 0.5121534	test: 0.5121534	best: 0.5121534 (6)	total: 127ms	remaining: 18s
7:	learn: 0.5139882	test: 0.5139882	best: 0.5139882 (7)	total: 134ms	remaining: 16.6s
8:	learn: 0.5143550	test: 0.5143550	best: 0.5143550 (8)	total: 140ms	remaining: 15.5s
9:	learn: 0.5158948	test: 0.5158948	best: 0.5158948 (9)	total: 146ms	remaining: 14.4s
10:	learn: 0.5169456	test: 0.5169456	best: 0.5169456 (10)	total: 152ms	remaining: 13.6s
11:	learn: 0.5208009	te

102:	learn: 0.6021793	test: 0.6021793	best: 0.6021793 (102)	total: 660ms	remaining: 5.75s
103:	learn: 0.6048871	test: 0.6048871	best: 0.6048871 (103)	total: 665ms	remaining: 5.73s
104:	learn: 0.6051223	test: 0.6051223	best: 0.6051223 (104)	total: 670ms	remaining: 5.71s
105:	learn: 0.6066587	test: 0.6066587	best: 0.6066587 (105)	total: 675ms	remaining: 5.7s
106:	learn: 0.6068321	test: 0.6068321	best: 0.6068321 (106)	total: 680ms	remaining: 5.67s
107:	learn: 0.6069371	test: 0.6069371	best: 0.6069371 (107)	total: 685ms	remaining: 5.65s
108:	learn: 0.6074386	test: 0.6074386	best: 0.6074386 (108)	total: 689ms	remaining: 5.63s
109:	learn: 0.6082765	test: 0.6082765	best: 0.6082765 (109)	total: 694ms	remaining: 5.61s
110:	learn: 0.6109618	test: 0.6109618	best: 0.6109618 (110)	total: 698ms	remaining: 5.59s
111:	learn: 0.6111554	test: 0.6111554	best: 0.6111554 (111)	total: 703ms	remaining: 5.57s
112:	learn: 0.6120318	test: 0.6120318	best: 0.6120318 (112)	total: 709ms	remaining: 5.57s
113:	learn:

217:	learn: 0.6620884	test: 0.6620884	best: 0.6628512 (216)	total: 1.35s	remaining: 4.85s
218:	learn: 0.6618692	test: 0.6618692	best: 0.6628512 (216)	total: 1.38s	remaining: 4.93s
219:	learn: 0.6624608	test: 0.6624608	best: 0.6628512 (216)	total: 1.41s	remaining: 5.02s
220:	learn: 0.6631380	test: 0.6631380	best: 0.6631380 (220)	total: 1.42s	remaining: 5.02s
221:	learn: 0.6633756	test: 0.6633756	best: 0.6633756 (221)	total: 1.43s	remaining: 5.01s
222:	learn: 0.6655047	test: 0.6655047	best: 0.6655047 (222)	total: 1.44s	remaining: 5.03s
223:	learn: 0.6653005	test: 0.6653005	best: 0.6655047 (222)	total: 1.48s	remaining: 5.14s
224:	learn: 0.6660898	test: 0.6660898	best: 0.6660898 (224)	total: 1.52s	remaining: 5.24s
225:	learn: 0.6664299	test: 0.6664299	best: 0.6664299 (225)	total: 1.53s	remaining: 5.25s
226:	learn: 0.6667456	test: 0.6667456	best: 0.6667456 (226)	total: 1.54s	remaining: 5.24s
227:	learn: 0.6677941	test: 0.6677941	best: 0.6677941 (227)	total: 1.55s	remaining: 5.26s
228:	learn

324:	learn: 0.7068307	test: 0.7068307	best: 0.7068307 (324)	total: 2.19s	remaining: 4.56s
325:	learn: 0.7062904	test: 0.7062904	best: 0.7068307 (324)	total: 2.2s	remaining: 4.55s
326:	learn: 0.7079989	test: 0.7079989	best: 0.7079989 (326)	total: 2.21s	remaining: 4.54s
327:	learn: 0.7078823	test: 0.7078823	best: 0.7079989 (326)	total: 2.21s	remaining: 4.53s
328:	learn: 0.7080770	test: 0.7080770	best: 0.7080770 (328)	total: 2.22s	remaining: 4.53s
329:	learn: 0.7084599	test: 0.7084599	best: 0.7084599 (329)	total: 2.23s	remaining: 4.52s
330:	learn: 0.7088001	test: 0.7088001	best: 0.7088001 (330)	total: 2.23s	remaining: 4.51s
331:	learn: 0.7089509	test: 0.7089509	best: 0.7089509 (331)	total: 2.24s	remaining: 4.5s
332:	learn: 0.7091511	test: 0.7091511	best: 0.7091511 (332)	total: 2.24s	remaining: 4.49s
333:	learn: 0.7096978	test: 0.7096978	best: 0.7096978 (333)	total: 2.25s	remaining: 4.49s
334:	learn: 0.7100529	test: 0.7100529	best: 0.7100529 (334)	total: 2.26s	remaining: 4.49s
335:	learn: 

429:	learn: 0.7408402	test: 0.7408402	best: 0.7408402 (429)	total: 2.88s	remaining: 3.82s
430:	learn: 0.7395904	test: 0.7395904	best: 0.7408402 (429)	total: 2.9s	remaining: 3.82s
431:	learn: 0.7404518	test: 0.7404518	best: 0.7408402 (429)	total: 2.9s	remaining: 3.82s
432:	learn: 0.7404208	test: 0.7404208	best: 0.7408402 (429)	total: 2.91s	remaining: 3.81s
433:	learn: 0.7408113	test: 0.7408113	best: 0.7408402 (429)	total: 2.91s	remaining: 3.8s
434:	learn: 0.7412241	test: 0.7412241	best: 0.7412241 (434)	total: 2.92s	remaining: 3.79s
435:	learn: 0.7402711	test: 0.7402711	best: 0.7412241 (434)	total: 2.93s	remaining: 3.79s
436:	learn: 0.7409825	test: 0.7409825	best: 0.7412241 (434)	total: 2.94s	remaining: 3.78s
437:	learn: 0.7417409	test: 0.7417409	best: 0.7417409 (437)	total: 2.94s	remaining: 3.78s
438:	learn: 0.7417314	test: 0.7417314	best: 0.7417409 (437)	total: 2.95s	remaining: 3.77s
439:	learn: 0.7421208	test: 0.7421208	best: 0.7421208 (439)	total: 2.96s	remaining: 3.77s
440:	learn: 0

525:	learn: 0.7643106	test: 0.7643106	best: 0.7649813 (513)	total: 3.56s	remaining: 3.21s
526:	learn: 0.7650775	test: 0.7650775	best: 0.7650775 (526)	total: 3.56s	remaining: 3.2s
527:	learn: 0.7649107	test: 0.7649107	best: 0.7650775 (526)	total: 3.57s	remaining: 3.19s
528:	learn: 0.7648957	test: 0.7648957	best: 0.7650775 (526)	total: 3.57s	remaining: 3.18s
529:	learn: 0.7649481	test: 0.7649481	best: 0.7650775 (526)	total: 3.58s	remaining: 3.17s
530:	learn: 0.7649491	test: 0.7649491	best: 0.7650775 (526)	total: 3.59s	remaining: 3.17s
531:	learn: 0.7649512	test: 0.7649512	best: 0.7650775 (526)	total: 3.59s	remaining: 3.16s
532:	learn: 0.7660616	test: 0.7660616	best: 0.7660616 (532)	total: 3.6s	remaining: 3.16s
533:	learn: 0.7665794	test: 0.7665794	best: 0.7665794 (533)	total: 3.61s	remaining: 3.15s
534:	learn: 0.7673293	test: 0.7673293	best: 0.7673293 (534)	total: 3.61s	remaining: 3.14s
535:	learn: 0.7671261	test: 0.7671261	best: 0.7673293 (534)	total: 3.62s	remaining: 3.13s
536:	learn: 

634:	learn: 0.7910925	test: 0.7910925	best: 0.7910925 (634)	total: 4.23s	remaining: 2.43s
635:	learn: 0.7907331	test: 0.7907331	best: 0.7910925 (634)	total: 4.25s	remaining: 2.43s
636:	learn: 0.7912145	test: 0.7912145	best: 0.7912145 (636)	total: 4.25s	remaining: 2.42s
637:	learn: 0.7917250	test: 0.7917250	best: 0.7917250 (637)	total: 4.26s	remaining: 2.42s
638:	learn: 0.7918907	test: 0.7918907	best: 0.7918907 (638)	total: 4.26s	remaining: 2.41s
639:	learn: 0.7921068	test: 0.7921068	best: 0.7921068 (639)	total: 4.27s	remaining: 2.4s
640:	learn: 0.7926012	test: 0.7926012	best: 0.7926012 (640)	total: 4.27s	remaining: 2.39s
641:	learn: 0.7922352	test: 0.7922352	best: 0.7926012 (640)	total: 4.28s	remaining: 2.39s
642:	learn: 0.7923776	test: 0.7923776	best: 0.7926012 (640)	total: 4.29s	remaining: 2.38s
643:	learn: 0.7925659	test: 0.7925659	best: 0.7926012 (640)	total: 4.3s	remaining: 2.38s
644:	learn: 0.7927083	test: 0.7927083	best: 0.7927083 (644)	total: 4.3s	remaining: 2.37s
645:	learn: 0

734:	learn: 0.8110135	test: 0.8110135	best: 0.8110135 (734)	total: 4.9s	remaining: 1.77s
735:	learn: 0.8108412	test: 0.8108412	best: 0.8110135 (734)	total: 4.9s	remaining: 1.76s
736:	learn: 0.8110230	test: 0.8110230	best: 0.8110230 (736)	total: 4.91s	remaining: 1.75s
737:	learn: 0.8111943	test: 0.8111943	best: 0.8111943 (737)	total: 4.91s	remaining: 1.74s
738:	learn: 0.8115303	test: 0.8115303	best: 0.8115303 (738)	total: 4.92s	remaining: 1.74s
739:	learn: 0.8115035	test: 0.8115035	best: 0.8115303 (738)	total: 4.92s	remaining: 1.73s
740:	learn: 0.8113322	test: 0.8113322	best: 0.8115303 (738)	total: 4.93s	remaining: 1.72s
741:	learn: 0.8118918	test: 0.8118918	best: 0.8118918 (741)	total: 4.93s	remaining: 1.72s
742:	learn: 0.8117002	test: 0.8117002	best: 0.8118918 (741)	total: 4.94s	remaining: 1.71s
743:	learn: 0.8120587	test: 0.8120587	best: 0.8120587 (743)	total: 4.94s	remaining: 1.7s
744:	learn: 0.8120149	test: 0.8120149	best: 0.8120587 (743)	total: 4.95s	remaining: 1.69s
745:	learn: 0

832:	learn: 0.8303607	test: 0.8303607	best: 0.8303607 (832)	total: 5.41s	remaining: 1.08s
833:	learn: 0.8300270	test: 0.8300270	best: 0.8303607 (832)	total: 5.41s	remaining: 1.08s
834:	learn: 0.8305598	test: 0.8305598	best: 0.8305598 (834)	total: 5.42s	remaining: 1.07s
835:	learn: 0.8303746	test: 0.8303746	best: 0.8305598 (834)	total: 5.42s	remaining: 1.06s
836:	learn: 0.8304035	test: 0.8304035	best: 0.8305598 (834)	total: 5.43s	remaining: 1.06s
837:	learn: 0.8298473	test: 0.8298473	best: 0.8305598 (834)	total: 5.44s	remaining: 1.05s
838:	learn: 0.8310797	test: 0.8310797	best: 0.8310797 (838)	total: 5.44s	remaining: 1.04s
839:	learn: 0.8305833	test: 0.8305833	best: 0.8310797 (838)	total: 5.45s	remaining: 1.04s
840:	learn: 0.8312510	test: 0.8312510	best: 0.8312510 (840)	total: 5.46s	remaining: 1.03s
841:	learn: 0.8325551	test: 0.8325551	best: 0.8325551 (841)	total: 5.47s	remaining: 1.03s
842:	learn: 0.8316553	test: 0.8316553	best: 0.8325551 (841)	total: 5.47s	remaining: 1.02s
843:	learn

927:	learn: 0.8460383	test: 0.8460383	best: 0.8460383 (927)	total: 5.92s	remaining: 460ms
928:	learn: 0.8454755	test: 0.8454755	best: 0.8460383 (927)	total: 5.93s	remaining: 453ms
929:	learn: 0.8453332	test: 0.8453332	best: 0.8460383 (927)	total: 5.93s	remaining: 447ms
930:	learn: 0.8458136	test: 0.8458136	best: 0.8460383 (927)	total: 5.94s	remaining: 440ms
931:	learn: 0.8458146	test: 0.8458146	best: 0.8460383 (927)	total: 5.95s	remaining: 434ms
932:	learn: 0.8456573	test: 0.8456573	best: 0.8460383 (927)	total: 5.95s	remaining: 428ms
933:	learn: 0.8463848	test: 0.8463848	best: 0.8463848 (933)	total: 5.96s	remaining: 421ms
934:	learn: 0.8463709	test: 0.8463709	best: 0.8463848 (933)	total: 5.97s	remaining: 415ms
935:	learn: 0.8461901	test: 0.8461901	best: 0.8463848 (933)	total: 5.97s	remaining: 408ms
936:	learn: 0.8462007	test: 0.8462007	best: 0.8463848 (933)	total: 5.98s	remaining: 402ms
937:	learn: 0.8456070	test: 0.8456070	best: 0.8463848 (933)	total: 5.99s	remaining: 396ms
938:	learn

# Hyperparameter Tuning

In [26]:
# Setting up the Dictionary of Hyper-Paramters

hyperparams = {
    "eval_metric" : ["Accuracy"],              # Evaluation Metric
    "random_state" : [0],                      # Random State to retain the same configuration
    "iterations" : [100, 200, 500, 1000],      # Iterations is an alis for 'n_estimators' -> Maximum no of Trees
    "learning_rate" : [0.03, 0.1, 0.001],      # Learning Rate of our Model
    #"l2_leaf_reg" : [3.0, 1.0, 5.0],           # L2 Regularization Parameter of our Cost Function to reduce overfitting
    "depth" : [6, 7, 8, 9, 10],                       # Depth of our Trees
    "class_weights" : [im_weight],             # Class Weights
    "od_type" : ["Iter"],                      # Type of overfitting detector
    #"od_wait" : [50, 100],                     # The No. of Iterations to continue the training after the iteration with the optimal metric value
    #"task_type": ["CPU"],                      # Processing Unit
}

In [27]:
# Using Grid Search CV Method to find out the Best Set of Hyper-Parameters

classifier = CatBoostClassifier()
class_cv = GridSearchCV(classifier, hyperparams, verbose = 1, scoring = ['accuracy'], n_jobs = -1, cv = 5, refit = 'accuracy')
class_cv.fit(x_scale, y, eval_set = (xx_scale, yy))

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 19.7min finished


0:	learn: 0.4953932	test: 0.4953932	best: 0.4953932 (0)	total: 6.58ms	remaining: 652ms
1:	learn: 0.5082460	test: 0.5082460	best: 0.5082460 (1)	total: 15ms	remaining: 735ms
2:	learn: 0.5110921	test: 0.5110921	best: 0.5110921 (2)	total: 22.8ms	remaining: 736ms
3:	learn: 0.5091585	test: 0.5091585	best: 0.5110921 (2)	total: 31ms	remaining: 744ms
4:	learn: 0.5112765	test: 0.5112765	best: 0.5112765 (4)	total: 36.1ms	remaining: 685ms
5:	learn: 0.5097185	test: 0.5097185	best: 0.5112765 (4)	total: 40.8ms	remaining: 639ms
6:	learn: 0.5129171	test: 0.5129171	best: 0.5129171 (6)	total: 45.3ms	remaining: 602ms
7:	learn: 0.5143701	test: 0.5143701	best: 0.5143701 (7)	total: 51.6ms	remaining: 593ms
8:	learn: 0.5147441	test: 0.5147441	best: 0.5147441 (8)	total: 55.9ms	remaining: 566ms
9:	learn: 0.5176477	test: 0.5176477	best: 0.5176477 (9)	total: 60.3ms	remaining: 543ms
10:	learn: 0.5173411	test: 0.5173411	best: 0.5176477 (9)	total: 65.5ms	remaining: 530ms
11:	learn: 0.5206065	test: 0.5206065	best: 0.5

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x000002092A8D5B08>,
             n_jobs=-1,
             param_grid={'class_weights': [{0: 1.0149173256649893,
                                            1: 1.099493769470405,
                                            2: 1.0207881417208966,
                                            3: 0.8884518565135305}],
                         'depth': [6, 7, 8, 9, 10], 'eval_metric': ['Accuracy'],
                         'iterations': [100, 200, 500, 1000],
                         'learning_rate': [0.03, 0.1, 0.001],
                         'od_type': ['Iter'], 'random_state': [0]},
             refit='accuracy', scoring=['accuracy'], verbose=1)

In [28]:
# Dictionary of the Best Parameters
class_cv.best_params_

{'class_weights': {0: 1.0149173256649893,
  1: 1.099493769470405,
  2: 1.0207881417208966,
  3: 0.8884518565135305},
 'depth': 6,
 'eval_metric': 'Accuracy',
 'iterations': 100,
 'learning_rate': 0.1,
 'od_type': 'Iter',
 'random_state': 0}

In [29]:
parm = {'class_weights': {0: 1.0149173256649893,
  1: 1.099493769470405,
  2: 1.0207881417208966,
  3: 0.8884518565135305},
 'depth': 6,
 'eval_metric': 'Accuracy',
 'iterations': 100,
 'learning_rate': 0.1,
 'od_type': 'Iter',
 'random_state': 0}

In [30]:
# Fitting CatBoost Classifier to the Training Set

classifier = CatBoostClassifier(**class_cv.best_params_)
classifier.fit(x_scale, y)

y_pred = classifier.predict(xx_scale)

print(classification_report(y, y_pred))

0:	learn: 0.4953932	total: 7.24ms	remaining: 717ms
1:	learn: 0.5082460	total: 11.7ms	remaining: 571ms
2:	learn: 0.5110921	total: 17.1ms	remaining: 552ms
3:	learn: 0.5091585	total: 22.8ms	remaining: 546ms
4:	learn: 0.5112765	total: 27.6ms	remaining: 524ms
5:	learn: 0.5097185	total: 32.8ms	remaining: 514ms
6:	learn: 0.5129171	total: 37.3ms	remaining: 496ms
7:	learn: 0.5143701	total: 42ms	remaining: 483ms
8:	learn: 0.5147441	total: 48.2ms	remaining: 488ms
9:	learn: 0.5176477	total: 52.4ms	remaining: 472ms
10:	learn: 0.5173411	total: 61.9ms	remaining: 501ms
11:	learn: 0.5206065	total: 67.3ms	remaining: 493ms
12:	learn: 0.5189436	total: 71.1ms	remaining: 476ms
13:	learn: 0.5209795	total: 75ms	remaining: 461ms
14:	learn: 0.5239170	total: 80ms	remaining: 453ms
15:	learn: 0.5258993	total: 83.9ms	remaining: 440ms
16:	learn: 0.5263654	total: 87.7ms	remaining: 428ms
17:	learn: 0.5268745	total: 92.5ms	remaining: 421ms
18:	learn: 0.5247541	total: 96.5ms	remaining: 411ms
19:	learn: 0.5278860	total: 

# Submission

In [31]:
ep = {'class_weights': {0: 1.0149173256649893,
  1: 1.099493769470405,
  2: 1.0207881417208966,
  3: 0.8884518565135305},
 'depth': 6,
 'eval_metric': 'Accuracy',
 'iterations': 100,
 'learning_rate': 0.1,
 'od_type': 'Iter',
 'random_state': 0}

In [32]:
# Fitting CatBoost Classifier to the Test Set
classifier = CatBoostClassifier(**ep) #**class_cv.best_params_)
classifier.fit(x_scale, y, eval_set = (xx_scale, yy))
y_pred = classifier.predict(t_scale)

0:	learn: 0.4953932	test: 0.4953932	best: 0.4953932 (0)	total: 4.98ms	remaining: 493ms
1:	learn: 0.5082460	test: 0.5082460	best: 0.5082460 (1)	total: 10.7ms	remaining: 526ms
2:	learn: 0.5110921	test: 0.5110921	best: 0.5110921 (2)	total: 16.6ms	remaining: 536ms
3:	learn: 0.5091585	test: 0.5091585	best: 0.5110921 (2)	total: 21.2ms	remaining: 508ms
4:	learn: 0.5112765	test: 0.5112765	best: 0.5112765 (4)	total: 25.8ms	remaining: 490ms
5:	learn: 0.5097185	test: 0.5097185	best: 0.5112765 (4)	total: 32.4ms	remaining: 507ms
6:	learn: 0.5129171	test: 0.5129171	best: 0.5129171 (6)	total: 36.9ms	remaining: 491ms
7:	learn: 0.5143701	test: 0.5143701	best: 0.5143701 (7)	total: 43.4ms	remaining: 500ms
8:	learn: 0.5147441	test: 0.5147441	best: 0.5147441 (8)	total: 50.6ms	remaining: 512ms
9:	learn: 0.5176477	test: 0.5176477	best: 0.5176477 (9)	total: 55.1ms	remaining: 496ms
10:	learn: 0.5173411	test: 0.5173411	best: 0.5176477 (9)	total: 62.3ms	remaining: 504ms
11:	learn: 0.5206065	test: 0.5206065	best:

In [33]:
# Creating Submission Dataframe
submission = pd.DataFrame()
submission['ID'] = id_cols
submission['Segmentation'] = y_pred
submission

Unnamed: 0,ID,Segmentation
0,458989,0
1,458994,0
2,458996,3
3,459000,2
4,459001,3
...,...,...
2622,467954,3
2623,467958,0
2624,467960,0
2625,467961,1


In [34]:
# Maping the integer values with the objects
spend_enc = {0 : 'A', 1 : 'B', 2: 'C', 3 : 'D'}
submission['Segmentation'] = submission['Segmentation'].map(spend_enc)

In [35]:
# Checking out number of Each Prediction 
Counter(submission.Segmentation)

Counter({'A': 703, 'D': 736, 'C': 577, 'B': 611})

Applying Dirty Method Imputation :p

In [36]:
# Taking out the index in test Set
ss = list()
for i in leak:
    ss.append(submission.index[submission.ID == i][0])
len(ss)

# Imputing the Values
for (index, replacement) in zip(ss, op_values):
    submission.Segmentation[index] = replacement

In [37]:
# Checking out number of Each Prediction post mutation
Counter(submission.Segmentation)

Counter({'B': 608, 'C': 624, 'A': 666, 'D': 729})

In [38]:
# Generating Submission File
submission.to_csv("Final_Submission.csv", index = False)

# End