In [231]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [232]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv')

In [233]:
train.head(5)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [234]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [235]:
## Checking for null values
train.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [236]:
train.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [237]:
train.Credit_Product.value_counts()

No     144357
Yes     72043
Name: Credit_Product, dtype: int64

In [238]:
## Change to label encoding 
# Gender, channel-code, age in bins

print(train.Gender.value_counts())
print(train.Region_Code.value_counts())
print(train.Occupation.value_counts())
print(train.Channel_Code.value_counts())
print(train.Credit_Product.value_counts())
print(train.Is_Active.value_counts())
print(train.Is_Lead.value_counts())

Male      134197
Female    111528
Name: Gender, dtype: int64
RG268    35934
RG283    29416
RG254    26840
RG284    19320
RG277    12826
RG280    12775
RG269     7863
RG270     7720
RG261     7633
RG257     6101
RG251     5950
RG282     5829
RG274     5286
RG272     5252
RG281     5093
RG273     4497
RG252     4286
RG279     3976
RG263     3687
RG275     3245
RG260     3110
RG256     2847
RG264     2793
RG276     2764
RG259     2586
RG250     2496
RG255     2018
RG258     1951
RG253     1858
RG278     1822
RG262     1788
RG266     1578
RG265     1546
RG271     1542
RG267     1497
Name: Region_Code, dtype: int64
Self_Employed    100886
Salaried          71999
Other             70173
Entrepreneur       2667
Name: Occupation, dtype: int64
X1    103718
X3     68712
X2     67726
X4      5569
Name: Channel_Code, dtype: int64
No     144357
Yes     72043
Name: Credit_Product, dtype: int64
No     150290
Yes     95435
Name: Is_Active, dtype: int64
0    187437
1     58288
Name: Is_Lead, dtype: int

In [239]:
bin_labels_5 = ['20-30', '30-40', '40-50', '50-60','60+']
train['age_group'] = pd.cut(x= train['Age'],bins=[22, 29, 39, 49,59 ,86],labels=bin_labels_5)
test['age_group'] = pd.cut(x= test['Age'],bins=[22, 29, 39, 49,59 ,86],labels=bin_labels_5)

In [240]:
train.age_group.value_counts()
test.age_group.value_counts()

20-30    24618
40-50    22328
30-40    22140
50-60    20001
60+      16225
Name: age_group, dtype: int64

In [241]:
print(100*train.age_group.value_counts()/train.age_group.count())
print(train.groupby('age_group')['Age'].min())
print(train.groupby('age_group')['Age'].max())

print(test.age_group.value_counts())
print(100*test.age_group.value_counts()/test.age_group.count())
print(test.groupby('age_group')['Age'].min())
print(test.groupby('age_group')['Age'].max())

20-30    23.226778
30-40    21.280293
40-50    20.889612
50-60    19.255265
60+      15.348052
Name: age_group, dtype: float64
age_group
20-30    23
30-40    30
40-50    40
50-60    50
60+      60
Name: Age, dtype: int64
age_group
20-30    29
30-40    39
40-50    49
50-60    59
60+      85
Name: Age, dtype: int64
20-30    24618
40-50    22328
30-40    22140
50-60    20001
60+      16225
Name: age_group, dtype: int64
20-30    23.376253
40-50    21.201762
30-40    21.023245
50-60    18.992138
60+      15.406601
Name: age_group, dtype: float64
age_group
20-30    24
30-40    30
40-50    40
50-60    50
60+      60
Name: Age, dtype: int64
age_group
20-30    29
30-40    39
40-50    49
50-60    59
60+      85
Name: Age, dtype: int64


In [242]:
train["Credit_Product"].fillna("Unknown", inplace = True)
test["Credit_Product"].fillna("Unknown", inplace = True)

In [261]:
train['Vintage_yrs'] = round(train.Vintage / 12,0)
test['Vintage_yrs'] = round(test.Vintage / 12,0)

In [243]:
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

In [244]:
##Gender
print(train.Gender.value_counts())
print(test.Gender.value_counts())
train["Gender"] = label_encoder.fit_transform(train["Gender"])
test["Gender"] = label_encoder.transform(test["Gender"])
print(train.Gender.value_counts())
print(test.Gender.value_counts())

Male      134197
Female    111528
Name: Gender, dtype: int64
Male      57705
Female    47607
Name: Gender, dtype: int64
1    134197
0    111528
Name: Gender, dtype: int64
1    57705
0    47607
Name: Gender, dtype: int64


In [245]:
##Credit_Product
print(train.Credit_Product.value_counts())
print(test.Credit_Product.value_counts())
train["Credit_Product"] = label_encoder.fit_transform(train["Credit_Product"])
test["Credit_Product"] = label_encoder.transform(test["Credit_Product"])
print(train.Credit_Product.value_counts())
print(test.Credit_Product.value_counts())

No         144357
Yes         72043
Unknown     29325
Name: Credit_Product, dtype: int64
No         61608
Yes        31182
Unknown    12522
Name: Credit_Product, dtype: int64
0    144357
2     72043
1     29325
Name: Credit_Product, dtype: int64
0    61608
2    31182
1    12522
Name: Credit_Product, dtype: int64


In [246]:
##Region_Code
print(train.Region_Code.value_counts())
print(test.Region_Code.value_counts())
train["Region_Code_enc"] = label_encoder.fit_transform(train["Region_Code"])
test["Region_Code_enc"] = label_encoder.transform(test["Region_Code"])
print(train.Region_Code_enc.value_counts())
print(test.Region_Code.value_counts())

RG268    35934
RG283    29416
RG254    26840
RG284    19320
RG277    12826
RG280    12775
RG269     7863
RG270     7720
RG261     7633
RG257     6101
RG251     5950
RG282     5829
RG274     5286
RG272     5252
RG281     5093
RG273     4497
RG252     4286
RG279     3976
RG263     3687
RG275     3245
RG260     3110
RG256     2847
RG264     2793
RG276     2764
RG259     2586
RG250     2496
RG255     2018
RG258     1951
RG253     1858
RG278     1822
RG262     1788
RG266     1578
RG265     1546
RG271     1542
RG267     1497
Name: Region_Code, dtype: int64
RG268    15125
RG283    12881
RG254    11737
RG284     8173
RG277     5596
RG280     5529
RG270     3440
RG269     3288
RG261     3286
RG251     2678
RG257     2588
RG272     2338
RG282     2337
RG274     2208
RG281     2144
RG252     1811
RG273     1803
RG263     1615
RG279     1600
RG275     1383
RG260     1300
RG264     1241
RG256     1194
RG276     1185
RG259     1102
RG250     1048
RG255      891
RG278      824
RG258      812
RG253   

In [247]:
##Is_Active
print(train.Is_Active.value_counts())
print(test.Is_Active.value_counts())
train["Is_Active"] = label_encoder.fit_transform(train["Is_Active"])
test["Is_Active"] = label_encoder.transform(test["Is_Active"])
print(train.Is_Active.value_counts())
print(test.Is_Active.value_counts())

No     150290
Yes     95435
Name: Is_Active, dtype: int64
No     63797
Yes    41515
Name: Is_Active, dtype: int64
0    150290
1     95435
Name: Is_Active, dtype: int64
0    63797
1    41515
Name: Is_Active, dtype: int64


In [248]:
##Age group
print(train.age_group.value_counts())
print(test.age_group.value_counts())
train["age_group"] = label_encoder.fit_transform(train["age_group"])
test["age_group"] = label_encoder.transform(test["age_group"])
print(train.age_group.value_counts())
print(test.age_group.value_counts())

20-30    57074
30-40    52291
40-50    51331
50-60    47315
60+      37714
Name: age_group, dtype: int64
20-30    24618
40-50    22328
30-40    22140
50-60    20001
60+      16225
Name: age_group, dtype: int64
0    57074
1    52291
2    51331
3    47315
4    37714
Name: age_group, dtype: int64
0    24618
2    22328
1    22140
3    20001
4    16225
Name: age_group, dtype: int64


In [249]:
train = pd.get_dummies(train, columns=["Occupation"], prefix=["Occ"] )
train = pd.get_dummies(train, columns=["Channel_Code"], prefix=["Channel_Code"] )
train = pd.get_dummies(train, columns=["Region_Code"] )
train.drop('Occ_Other',axis=1,inplace=True)

train.head(5)

Unnamed: 0,ID,Gender,Age,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,age_group,Region_Code_enc,...,Region_Code_RG275,Region_Code_RG276,Region_Code_RG277,Region_Code_RG278,Region_Code_RG279,Region_Code_RG280,Region_Code_RG281,Region_Code_RG282,Region_Code_RG283,Region_Code_RG284
0,NNVBBKZB,0,73,43,0,1045696,0,0,4,18,...,0,0,0,0,0,0,0,0,0,0
1,IDD62UNG,0,30,32,0,581988,0,0,1,27,...,0,0,1,0,0,0,0,0,0,0
2,HD3DSEMC,0,56,26,0,1484315,1,0,3,18,...,0,0,0,0,0,0,0,0,0,0
3,BF3NC7KV,1,34,19,0,470454,0,0,1,20,...,0,0,0,0,0,0,0,0,0,0
4,TEASRWXV,0,30,33,0,886787,0,0,1,32,...,0,0,0,0,0,0,0,1,0,0


In [250]:
test = pd.get_dummies(test, columns=["Occupation"], prefix=["Occ"] )
test = pd.get_dummies(test, columns=["Channel_Code"], prefix=["Channel_Code"] )
test = pd.get_dummies(test, columns=["Region_Code"] )
test.drop('Occ_Other',axis=1,inplace=True)

test.head(5)

Unnamed: 0,ID,Gender,Age,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,age_group,Region_Code_enc,Occ_Entrepreneur,...,Region_Code_RG275,Region_Code_RG276,Region_Code_RG277,Region_Code_RG278,Region_Code_RG279,Region_Code_RG280,Region_Code_RG281,Region_Code_RG282,Region_Code_RG283,Region_Code_RG284
0,VBENBARO,1,29,25,2,742366,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
1,CCMEWNKY,1,43,49,1,925537,0,2,18,0,...,0,0,0,0,0,0,0,0,0,0
2,VK3KGA9M,1,31,14,0,215949,0,1,20,0,...,0,0,0,0,0,0,0,0,0,0
3,TT8RPZVC,1,29,33,0,868070,0,0,22,0,...,0,0,0,0,0,0,0,0,0,0
4,SHQZEYTZ,0,29,19,0,657087,0,0,20,0,...,0,0,0,0,0,0,0,0,0,0


In [262]:
##Saving Clean Files
train.to_csv('../datasets/train_cleaned.csv',index = False)
test.to_csv('../datasets/test_cleaned.csv',index = False)