In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [136]:
# Lift pandas' display truncation: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [137]:
from utils import (
load_dataset,
    save_dataset
)

In [138]:
# Load the two splits through the project helper, train first, then test.
df_train, df_test = (load_dataset(split) for split in ('train', 'test'))

In [139]:
# Column dtypes and non-null counts of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [140]:
# Column dtypes and non-null counts of the test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               2627 non-null   int64  
 1   Gender           2627 non-null   object 
 2   Ever_Married     2577 non-null   object 
 3   Age              2627 non-null   int64  
 4   Graduated        2603 non-null   object 
 5   Profession       2589 non-null   object 
 6   Work_Experience  2358 non-null   float64
 7   Spending_Score   2627 non-null   object 
 8   Family_Size      2514 non-null   float64
 9   Var_1            2595 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 205.4+ KB


In [141]:
# Number of missing values per column in the training split.
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [142]:
# Number of missing values per column in the test split.
df_test.isnull().sum()

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

In [143]:
# Share of training rows that fall into each target segment.
segment_counts = df_train['Segmentation'].value_counts()
segment_counts / len(df_train)

D    0.281111
A    0.244422
C    0.244175
B    0.230293
Name: Segmentation, dtype: float64

In [144]:
# Tag every test row with the placeholder label 'U'. The combined frame is
# later split back into train/test by filtering on Segmentation == 'U'.
df_test['Segmentation'] = 'U'

In [145]:
# Stack train on top of test and renumber the rows 0..n-1.
df_full = pd.concat((df_train, df_test), axis='index', ignore_index=True)

In [146]:
# Row count of the combined train + test frame.
len(df_full)

10695

In [147]:
# Column dtypes and non-null counts of the combined frame.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10695 entries, 0 to 10694
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               10695 non-null  int64  
 1   Gender           10695 non-null  object 
 2   Ever_Married     10505 non-null  object 
 3   Age              10695 non-null  int64  
 4   Graduated        10593 non-null  object 
 5   Profession       10533 non-null  object 
 6   Work_Experience  9597 non-null   float64
 7   Spending_Score   10695 non-null  object 
 8   Family_Size      10247 non-null  float64
 9   Var_1            10587 non-null  object 
 10  Segmentation     10695 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 919.2+ KB


In [148]:
# Number of missing values per column in the combined frame.
df_full.isnull().sum()

ID                    0
Gender                0
Ever_Married        190
Age                   0
Graduated           102
Profession          162
Work_Experience    1098
Spending_Score        0
Family_Size         448
Var_1               108
Segmentation          0
dtype: int64

In [149]:
# Peek at the first rows of the combined frame.
df_full.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [150]:
# Order the combined frame by customer ID and show the five smallest IDs.
df_full_sorted = df_full.sort_values('ID')
df_full_sorted.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
7238,458982,Male,Yes,61,Yes,Executive,1.0,High,3.0,Cat_6,C
5546,458983,Female,Yes,63,Yes,Executive,0.0,High,5.0,Cat_6,C
4373,458984,Male,Yes,39,Yes,Artist,0.0,Average,3.0,Cat_6,C
4695,458985,Male,No,23,No,Healthcare,1.0,Low,4.0,Cat_6,D
3333,458986,Male,No,18,No,Healthcare,7.0,Low,4.0,Cat_6,D


In [151]:
def encode_values_knn(df_data):
    """Return a copy of ``df_data`` with the categorical columns mapped to numbers.

    The columns ``Gender``, ``Ever_Married``, ``Graduated``, ``Profession``,
    ``Spending_Score`` and ``Var_1`` are replaced by the integer codes listed
    below. Values that are missing, or not present in a column's code table,
    come out as NaN. All other columns are left untouched, and the input frame
    is not modified.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    code_tables = {
        'Gender': {'Male': 1, 'Female': 0},
        'Ever_Married': yes_no_codes,
        'Graduated': yes_no_codes,
        'Profession': {
            'Healthcare': 1,
            'Engineer': 2,
            'Lawyer': 3,
            'Entertainment': 4,
            'Artist': 5,
            'Executive': 6,
            'Doctor': 7,
            'Homemaker': 8,
            'Marketing': 9,
        },
        'Spending_Score': {'Low': 1, 'Average': 2, 'High': 3},
        'Var_1': {
            'Cat_1': 1,
            'Cat_2': 2,
            'Cat_3': 3,
            'Cat_4': 4,
            'Cat_5': 5,
            'Cat_6': 6,
            'Cat_7': 7,
        },
    }

    encoded = df_data.copy()
    for column, codes in code_tables.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

In [152]:
# Numeric encoding of the combined train + test frame (df_full itself is left as is).
df_full_encoded = encode_values_knn(df_full)

In [153]:
from sklearn.impute import KNNImputer

In [154]:
# List the column names available for imputation.
df_full_encoded.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [155]:
# KNN-based imputer using 4 neighbours.
# NOTE(review): the saved output of the following fit cell reports
# n_neighbors=10, so the stored outputs were probably produced with a different
# setting than the one written here — confirm which value the submission used.
imputer = KNNImputer(n_neighbors=4)

In [156]:
# Fit the imputer on the nine encoded feature columns of the combined
# train + test frame; ID and Segmentation are left out.
imputer.fit(df_full_encoded[['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession','Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1']])

KNNImputer(n_neighbors=10)

In [157]:
# Fill the gaps in the nine encoded feature columns with the fitted imputer.
knn_feature_cols = [
    'Gender',
    'Ever_Married',
    'Age',
    'Graduated',
    'Profession',
    'Work_Experience',
    'Spending_Score',
    'Family_Size',
    'Var_1',
]
df_ = imputer.transform(df_full_encoded[knn_feature_cols])

In [158]:
# Wrap the imputed values in a DataFrame with the feature names restored.
# The row index of the encoded source frame is carried over explicitly so that
# the later column-wise concat with df_full aligns rows by label instead of
# relying on both frames happening to share a default 0..n-1 index.
df_ = pd.DataFrame(
    df_,
    index=df_full_encoded.index,
    columns=[
        'Gender',
        'Ever_Married',
        'Age',
        'Graduated',
        'Profession',
        'Work_Experience',
        'Spending_Score',
        'Family_Size',
        'Var_1',
    ],
)

In [159]:
# Peek at the imputed feature values.
df_.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1.0,0.0,22.0,0.0,1.0,1.0,1.0,4.0,4.0
1,0.0,1.0,38.0,1.0,2.0,2.5,2.0,3.0,4.0
2,0.0,1.0,67.0,1.0,2.0,1.0,1.0,1.0,6.0
3,1.0,1.0,67.0,1.0,3.0,0.0,3.0,2.0,6.0
4,0.0,1.0,40.0,1.0,4.0,6.1,3.0,6.0,6.0


In [160]:
# Put ID and Segmentation back next to the imputed features. With
# ignore_index=True on the column axis the column labels become 0..n-1; the
# next cell assigns the real names again.
id_and_label = df_full[['ID', 'Segmentation']]
df_full_treated = pd.concat([id_and_label, df_], axis='columns', ignore_index=True)

In [161]:
# Restore the column names: the two identifier/label columns followed by the
# imputed feature columns in the order they have in df_.
df_full_treated.columns = ['ID', 'Segmentation', *df_.columns]

In [162]:
# Peek at the reassembled frame (ID, label, imputed features).
df_full_treated.head()

Unnamed: 0,ID,Segmentation,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,D,1.0,0.0,22.0,0.0,1.0,1.0,1.0,4.0,4.0
1,462643,A,0.0,1.0,38.0,1.0,2.0,2.5,2.0,3.0,4.0
2,466315,B,0.0,1.0,67.0,1.0,2.0,1.0,1.0,1.0,6.0
3,461735,B,1.0,1.0,67.0,1.0,3.0,0.0,3.0,2.0,6.0
4,462669,A,0.0,1.0,40.0,1.0,4.0,6.1,3.0,6.0,6.0


In [163]:
# Order by ID and look at the 200 largest IDs; rows sharing an ID end up next
# to each other, which makes train/test ID overlaps visible.
df_full_treated_sorted = df_full_treated.sort_values('ID')
df_full_treated_sorted.iloc[-200:]

Unnamed: 0,ID,Segmentation,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
3021,467803,B,0.0,1.0,36.0,1.0,4.0,7.0,2.0,2.0,6.0
7193,467804,C,0.0,1.0,43.0,1.0,5.0,1.0,2.0,2.0,6.0
6742,467806,B,1.0,1.0,62.0,1.0,5.0,1.0,3.0,2.0,6.0
10643,467806,U,1.0,1.0,55.0,1.0,5.0,1.0,3.0,2.0,6.0
4952,467807,B,1.0,1.0,50.0,1.0,6.0,1.0,2.0,3.0,6.0
6762,467808,C,0.0,1.0,63.0,1.0,5.0,0.0,2.0,2.0,6.0
10644,467808,U,0.0,1.0,62.0,1.0,5.0,1.0,2.0,2.0,6.0
7917,467809,C,1.0,1.0,46.0,1.0,7.0,1.0,2.0,2.0,6.0
1476,467810,C,1.0,1.0,42.0,1.0,5.0,0.0,2.0,5.0,6.0
4839,467811,D,1.0,1.0,72.0,1.0,3.0,1.0,1.0,1.0,6.0


In [164]:
def id_features(data):
    """Return a copy of ``data`` with four remainder features derived from ``ID``.

    Adds the columns ``week``, ``month``, ``year`` and ``quarter`` (in that
    order), holding ``ID`` modulo 7, 30, 365 and 90 respectively. The input
    frame is not modified.

    NOTE(review): the column names suggest ID is treated as a day-like running
    counter; nothing in the data shown confirms that — verify.
    """
    moduli = {'week': 7, 'month': 30, 'year': 365, 'quarter': 90}

    enriched = data.copy()
    ids = enriched['ID']
    for feature_name, modulus in moduli.items():
        enriched[feature_name] = ids % modulus
    return enriched

In [165]:
# Split the combined frame back into its two parts using the 'U' placeholder
# label, then add the ID-derived features to each part.
is_unlabelled = df_full_treated['Segmentation'] == 'U'
df_train_clean = id_features(df_full_treated[~is_unlabelled].copy())
df_test_clean = id_features(df_full_treated[is_unlabelled].copy())

In [166]:
# List the columns of the prepared training frame.
df_train_clean.columns

Index(['ID', 'Segmentation', 'Gender', 'Ever_Married', 'Age', 'Graduated',
       'Profession', 'Work_Experience', 'Spending_Score', 'Family_Size',
       'Var_1', 'week', 'month', 'year', 'quarter'],
      dtype='object')

In [167]:
# Model inputs: every prepared column except the identifier and the target.
train_cols = [
    column
    for column in df_train_clean.columns
    if column not in ('ID', 'Segmentation')
]

In [168]:
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from sklearn.neighbors import KNeighborsClassifier

In [169]:
# XGBoost classifier with a fixed random_state, max_depth=10 and 150 estimators.
model = XGBClassifier(random_state=0,max_depth=10,n_estimators=150,n_jobs=-1)

In [170]:
# Train on all labelled rows; the target is the segment letter. No rows are
# held out for validation here.
model.fit(df_train_clean[train_cols],df_train_clean['Segmentation'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [171]:
# Predictions for the same rows the model was fitted on.
train_pred = model.predict(df_train_clean[train_cols])

In [172]:
# accuracy_score takes (y_true, y_pred), so the true labels go first.
# This scores the rows the model was trained on, so it measures how well the
# training data was fitted and is not an estimate of accuracy on unseen data.
accuracy_score(df_train_clean['Segmentation'], train_pred)

1.0

In [173]:
# Replace the 'U' placeholder in the test rows with the model's predicted segment.
df_test_clean['Segmentation'] = model.predict(df_test_clean[train_cols])

In [174]:
# Submission frame: only the ID and the predicted segment of each test row.
df_final = df_test_clean.loc[:, ['ID', 'Segmentation']].copy()

In [175]:
# Lookup table ID -> segment label taken from the labelled training rows.
# Built in one non-mutating chain, so `maps` is only ever a dict and no
# intermediate frame is modified in place. If an ID occurs more than once, the
# last occurrence wins.
maps = df_train_clean.set_index('ID')['Segmentation'].to_dict()

In [176]:
# Look up each test ID in the training labels; IDs not seen in training give NaN.
df_final['MapSeg'] = df_final['ID'].map(maps)
# Fraction of test IDs that have no label in the training data.
df_final['MapSeg'].isna().mean()

0.11229539398553483

In [177]:
# Where a test ID also appears in the training data, use its training label;
# otherwise keep the model's prediction. Then drop the helper column.
has_train_label = df_final['MapSeg'].notnull()
df_final['Segmentation'] = np.where(
    has_train_label,
    df_final['MapSeg'],
    df_final['Segmentation'],
)
df_final = df_final[['ID', 'Segmentation']].copy()

In [178]:
# Hand the ID/Segmentation frame to the project's save helper under this name.
# NOTE(review): the output location and file format are decided inside
# utils.save_dataset, which is not visible here.
file_name = 'BASE_KNN_IMP_FULL_ADD_FET'
save_dataset(df_final,file_name)