In [1]:
#!conda install seaborn --yes


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import re
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

In [4]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier

In [5]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)

In [6]:
from utils import (load_dataset, save_dataset)

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [8]:
# Load both splits through the project-local helper imported from utils.
# NOTE(review): how load_dataset resolves 'train'/'test' to a file is not visible here.
df_train = load_dataset('train')
df_test = load_dataset('test')

In [9]:
# List the columns of the raw training frame.
df_train.columns

Index(['ID', 'Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code',
       'Vintage', 'Credit_Product', 'Avg_Account_Balance', 'Is_Active',
       'Is_Lead'],
      dtype='object')

In [10]:
# Dtypes and non-null counts per column of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [11]:
# Dtypes and non-null counts per column of the test frame (no Is_Lead column yet).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105312 entries, 0 to 105311
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   105312 non-null  object
 1   Gender               105312 non-null  object
 2   Age                  105312 non-null  int64 
 3   Region_Code          105312 non-null  object
 4   Occupation           105312 non-null  object
 5   Channel_Code         105312 non-null  object
 6   Vintage              105312 non-null  int64 
 7   Credit_Product       92790 non-null   object
 8   Avg_Account_Balance  105312 non-null  int64 
 9   Is_Active            105312 non-null  object
dtypes: int64(3), object(7)
memory usage: 8.0+ MB


In [12]:
# Number of missing values per column in the training frame.
df_train.isnull().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [13]:
# Number of missing values per column in the test frame.
df_test.isnull().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64

In [14]:
# Give the test rows a sentinel target (-1) so train and test can be stacked,
# processed together, and separated again later by filtering on Is_Lead.
# Note: this adds the Is_Lead column to df_test in place.
df_test['Is_Lead'] = -1
df_full = pd.concat([df_train,df_test],axis=0,ignore_index=True)

In [15]:
# Number of missing values per column in the stacked train+test frame.
df_full.isnull().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         41847
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [16]:
def get_label_mean_encode_maps(df_inp,df_train_inp,colname,label):
    """Build two ordinal lookup tables for one categorical column.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Frame whose ``colname`` value frequencies drive the label encoding.
    df_train_inp : pandas.DataFrame
        Frame holding both ``colname`` and ``label``; used for the mean encoding.
    colname : str
        Categorical column to encode.
    label : str
        Numeric column averaged per category of ``colname``.

    Returns
    -------
    tuple of (dict, dict)
        ``label_encode_dict`` maps each category to its frequency rank in
        ``df_inp`` (1 = most frequent).  ``mean_encode_dict`` maps each
        category to the rank of its mean ``label`` in ``df_train_inp``
        (1 = lowest mean).  Both maps store ranks, not counts or means.
    """
    # Categories ordered from most to least frequent.
    by_frequency = df_inp[colname].value_counts().sort_values(ascending=False)
    label_encode_dict = {category: rank for rank, category in enumerate(by_frequency.index, start=1)}

    # Categories ordered from lowest to highest mean of the label.
    by_label_mean = df_train_inp.groupby(colname)[label].mean().sort_values(ascending=True)
    mean_encode_dict = {category: rank for rank, category in enumerate(by_label_mean.index, start=1)}

    return label_encode_dict, mean_encode_dict

In [17]:
def feature_engineering_one(df_inp,df_train_inp):
    """Add the first round of encoded and log-transformed features.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Frame to enrich.  Must contain Gender, Age, Region_Code, Occupation,
        Channel_Code, Credit_Product, Avg_Account_Balance and Is_Active.
        It is not modified; a copy is returned.
    df_train_inp : pandas.DataFrame
        Labelled frame (needs the categorical columns and Is_Lead) from which
        the per-category target ranks are derived.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_inp`` with these columns added: GenderEncoded, AgeLog,
        <Col>LabelEncode / <Col>MeanEncode for Region_Code, Occupation and
        Channel_Code (underscore dropped from the prefix), CreditProductEncoded,
        BalanceLog and ActiveEncoded.  Credit_Product itself has its missing
        values replaced by the string 'Missing'.
    """
    df = df_inp.copy()

    df['GenderEncoded'] = np.where(df['Gender']=='Male',1,0)
    df['AgeLog'] = np.log(df['Age'])

    # Every categorical column gets the same pair of encodings: a frequency
    # rank computed on df_inp and a target-mean rank computed on df_train_inp.
    # Categories that never occur in df_train_inp come out as NaN in the
    # MeanEncode column because they are absent from the lookup.
    for colname in ('Region_Code','Occupation','Channel_Code'):
        label_encode_dict, mean_encode_dict = get_label_mean_encode_maps(df_inp,df_train_inp,colname,'Is_Lead')
        prefix = colname.replace('_','')  # 'Region_Code' -> 'RegionCode'
        df[prefix+'LabelEncode'] = df[colname].map(label_encode_dict)
        df[prefix+'MeanEncode'] = df[colname].map(mean_encode_dict)

    # Missing Credit_Product is kept as its own category rather than imputed.
    df['Credit_Product'] = np.where(df['Credit_Product'].isnull(),'Missing',df['Credit_Product'])
    df['CreditProductEncoded'] = df['Credit_Product'].map({"Missing":3,"Yes":2,"No":1})

    df['BalanceLog'] = np.log(df['Avg_Account_Balance'])

    df['ActiveEncoded'] = np.where(df['Is_Active']=='Yes',1,0)

    return df

In [18]:
def feature_engineering_two(df_inp,df_train_inp):
    """Add the second round of hand-crafted features.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Frame to enrich.  Must contain Occupation, Age, Vintage,
        Credit_Product and Is_Active.  It is not modified; a copy is returned.
    df_train_inp : object
        Unused.  Kept only so the signature matches feature_engineering_one
        and existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_inp`` with these columns added:

        * Entrepreneur    - 1 where Occupation == 'Entrepreneur', else 0
        * AgeOverVintage  - Age minus Vintage (a difference, despite the name)
        * VintagePerc     - AgeOverVintage divided by Age
        * ActiveAndCredit - 1 where Is_Active == 'Yes' and Credit_Product is
          'Yes' or 'Missing', else 0
    """
    df = df_inp.copy()

    df['Entrepreneur'] = np.where(df['Occupation']=='Entrepreneur',1,0)

    df['AgeOverVintage'] = df['Age'] - df['Vintage']
    df['VintagePerc'] = df['AgeOverVintage']/df['Age']

    # Only the literal string 'Missing' matches here, not NaN, so missing
    # Credit_Product values must already have been replaced by 'Missing'
    # (feature_engineering_one does that).
    has_or_unknown_credit = df['Credit_Product'].isin(['Yes','Missing'])
    is_active = df['Is_Active'] == 'Yes'
    df['ActiveAndCredit'] = np.where(has_or_unknown_credit & is_active,1,0)

    return df

In [19]:
def scale_features(df_inp,features_to_scale,scaler_type='std'):
    """Append rescaled copies of the given numeric columns.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Source frame; it is not modified.
    features_to_scale : list of str
        Columns of ``df_inp`` to rescale.
    scaler_type : str, default 'std'
        'std' -> StandardScaler, 'min_max' -> MinMaxScaler,
        'robust' -> RobustScaler.  Any other value falls back to
        StandardScaler.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_inp`` with one extra column per scaled feature, named
        'Scaled' + <original name>.  The scaler is fitted on all rows of
        ``df_inp``.
    """
    df = df_inp.copy()

    scaler_classes = {
        'std': StandardScaler,
        'min_max': MinMaxScaler,
        'robust': RobustScaler,
    }
    # The requested scaler is actually used; previously it was always
    # overwritten by a StandardScaler just before fitting.
    scaler = scaler_classes.get(scaler_type, StandardScaler)()

    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[features_to_scale]),
        columns=['Scaled'+x for x in features_to_scale],
        # Reuse the input index so the column-wise concat lines rows up even
        # when df does not carry a default 0..n-1 index.
        index=df.index,
    )
    df_final_data = pd.concat([df,df_scaled],axis=1)

    return df_final_data

In [20]:
# Numeric columns that scale_features turns into 'Scaled'-prefixed twins.
# The order here is the order of the generated columns.
features_to_scale = [
    'Age', 'Vintage', 'AgeLog',           # age and vintage
    'Avg_Account_Balance', 'BalanceLog',  # account balance
    'AgeOverVintage', 'VintagePerc',      # derived from Age and Vintage
]

In [21]:
# Run the feature pipeline on the stacked train+test frame. df_train is passed
# separately as the labelled frame for the target-based encodings.
# Step one must run before step two: step two matches Credit_Product against
# the 'Missing' string that step one fills in.
# Scaling uses the default scaler_type and is fitted on the stacked frame.
df_process = feature_engineering_one(df_full,df_train)
df_process = feature_engineering_two(df_process,df_train)
df_scaled = scale_features(df_process,features_to_scale)

In [22]:
# Peek at the first rows of the fully processed frame.
df_scaled.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,GenderEncoded,AgeLog,RegionCodeLabelEncode,RegionCodeMeanEncode,OccupationLabelEncode,OccupationMeanEncode,ChannelCodeLabelEncode,ChannelCodeMeanEncode,CreditProductEncoded,BalanceLog,ActiveEncoded,Entrepreneur,AgeOverVintage,VintagePerc,ActiveAndCredit,ScaledAge,ScaledVintage,ScaledAgeLog,ScaledAvg_Account_Balance,ScaledBalanceLog,ScaledAgeOverVintage,ScaledVintagePerc
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0,0,4.290459,1,33,3,2,2,4,1,13.860193,0,0,30,0.410959,0,1.963311,-0.121384,1.696878,-0.098541,0.202963,1.284795,0.821363
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0,0,3.401197,5,25,2,1,1,1,1,13.274205,0,0,-2,-0.066667,0,-0.93389,-0.461633,-0.972819,-0.639654,-0.741453,0.041327,-0.088859
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0,0,4.025352,1,33,1,3,2,4,1,14.210464,1,0,30,0.535714,0,0.817906,-0.647223,0.900985,0.413296,0.767482,1.284795,1.059113
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0,1,3.526361,7,5,2,1,1,1,1,13.061453,0,0,15,0.441176,0,-0.664383,-0.863745,-0.597061,-0.769806,-1.084337,0.701919,0.87895
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0,0,3.401197,12,15,2,1,1,1,1,13.69536,0,0,-3,-0.1,0,-0.93389,-0.430701,-0.972819,-0.283976,-0.062692,0.002469,-0.152383


In [23]:
# Dtypes and non-null counts per column after feature engineering and scaling.
df_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351037 entries, 0 to 351036
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         351037 non-null  object 
 1   Gender                     351037 non-null  object 
 2   Age                        351037 non-null  int64  
 3   Region_Code                351037 non-null  object 
 4   Occupation                 351037 non-null  object 
 5   Channel_Code               351037 non-null  object 
 6   Vintage                    351037 non-null  int64  
 7   Credit_Product             351037 non-null  object 
 8   Avg_Account_Balance        351037 non-null  int64  
 9   Is_Active                  351037 non-null  object 
 10  Is_Lead                    351037 non-null  int64  
 11  GenderEncoded              351037 non-null  int32  
 12  AgeLog                     351037 non-null  float64
 13  RegionCodeLabelEncode      35

In [24]:
# Full list of available columns, used to pick the model features below.
df_scaled.columns

Index(['ID', 'Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code',
       'Vintage', 'Credit_Product', 'Avg_Account_Balance', 'Is_Active',
       'Is_Lead', 'GenderEncoded', 'AgeLog', 'RegionCodeLabelEncode',
       'RegionCodeMeanEncode', 'OccupationLabelEncode', 'OccupationMeanEncode',
       'ChannelCodeLabelEncode', 'ChannelCodeMeanEncode',
       'CreditProductEncoded', 'BalanceLog', 'ActiveEncoded', 'Entrepreneur',
       'AgeOverVintage', 'VintagePerc', 'ActiveAndCredit', 'ScaledAge',
       'ScaledVintage', 'ScaledAgeLog', 'ScaledAvg_Account_Balance',
       'ScaledBalanceLog', 'ScaledAgeOverVintage', 'ScaledVintagePerc'],
      dtype='object')

In [25]:
# Features selected as model inputs.
# Currently excluded candidates from df_scaled: the raw columns (ID, Gender,
# Age, Region_Code, Occupation, Channel_Code, Vintage, Credit_Product,
# Avg_Account_Balance, Is_Active), the target Is_Lead, AgeLog, BalanceLog,
# AgeOverVintage, VintagePerc, the three *LabelEncode columns, ScaledAge and
# ScaledAvg_Account_Balance.
training_cols = [
    'GenderEncoded',
    'RegionCodeMeanEncode',
    'OccupationMeanEncode',
    'ChannelCodeMeanEncode',
    'CreditProductEncoded',
    'ActiveEncoded',
    'Entrepreneur',
    'ActiveAndCredit',
    'ScaledVintage',
    'ScaledAgeLog',
    'ScaledBalanceLog',
    'ScaledAgeOverVintage',
    'ScaledVintagePerc',
]
# The list was originally defined under this misspelled name; keep it as an
# alias so anything that still refers to it continues to work.
tarining_cols = training_cols
tarining_cols

['GenderEncoded',
 'RegionCodeMeanEncode',
 'OccupationMeanEncode',
 'ChannelCodeMeanEncode',
 'CreditProductEncoded',
 'ActiveEncoded',
 'Entrepreneur',
 'ActiveAndCredit',
 'ScaledVintage',
 'ScaledAgeLog',
 'ScaledBalanceLog',
 'ScaledAgeOverVintage',
 'ScaledVintagePerc']

In [26]:
# Columns of the raw training frame, shown again for reference.
df_train.columns

Index(['ID', 'Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code',
       'Vintage', 'Credit_Product', 'Avg_Account_Balance', 'Is_Active',
       'Is_Lead'],
      dtype='object')

In [27]:
# Undo the stacking: rows carrying the -1 sentinel in Is_Lead are the test
# set, everything else is training data. .copy() detaches both from df_scaled.
df_train_treated = df_scaled[df_scaled['Is_Lead']!=-1].copy()
df_test_treated = df_scaled[df_scaled['Is_Lead']==-1].copy()

In [28]:
# Columns written to the 'gluon_train' dataset: the model features followed by
# the target. 'ID' is not exported.
gluon_cols_train = [
    'GenderEncoded',
    'RegionCodeMeanEncode',
    'OccupationMeanEncode',
    'ChannelCodeMeanEncode',
    'CreditProductEncoded',
    'ActiveEncoded',
    'Entrepreneur',
    'ActiveAndCredit',
    'ScaledVintage',
    'ScaledAgeLog',
    'ScaledBalanceLog',
    'ScaledAgeOverVintage',
    'ScaledVintagePerc',
] + ['Is_Lead']  # target goes last

In [29]:
# Same columns as the training export, minus the target.
gluon_cols_test = [column for column in gluon_cols_train if column != 'Is_Lead']

In [30]:
# Persist the selected columns through the project-local save_dataset helper
# imported from utils. The test export carries no Is_Lead column.
# NOTE(review): where and in what format save_dataset writes is not visible here.
save_dataset(df_train_treated[gluon_cols_train],name='gluon_train')
save_dataset(df_test_treated[gluon_cols_test],name='gluon_test')