In [129]:
# Importing the libraries
import numpy as np 
import pandas as pd 

In [130]:
# Load the training data from a path relative to the working directory
# and preview the first rows.
df_train = pd.read_csv('train_s3TEQDk.csv')
df_train.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [131]:
# Shuffle all rows once and rebuild a clean 0..n-1 index.
# random_state is pinned so the shuffled order (and the preview shown by
# head()) is identical after every Restart & Run All.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,3JXGRDYH,Male,48,RG284,Other,X2,55,Yes,861546,No,1
1,AKSMUS2B,Female,58,RG268,Salaried,X3,87,No,1519683,No,1
2,HGKRYAX2,Female,37,RG284,Salaried,X2,14,Yes,1744311,No,1
3,JXLMD2CD,Male,64,RG283,Other,X3,25,,1406244,No,1
4,U5HYCDAV,Female,63,RG268,Other,X3,45,No,1262393,No,0


In [132]:
# Keep the row identifiers as a NumPy array, separate from the model inputs.
train_id = df_train['ID'].to_numpy()

In [133]:
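# NOTE: dropping ID is intentionally disabled; the column stays in df_train and
# is excluded from the model inputs when the `features` list is built.
# df_train.drop(['ID'],axis=1,inplace=True)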

In [134]:
# Frequency of each Gender value.
df_train.Gender.value_counts()

Male      134197
Female    111528
Name: Gender, dtype: int64

In [135]:
# Smallest and largest Age in the training data.
print('The range of Age is : {} to {}'.format(df_train['Age'].min(), df_train['Age'].max()))

The range of Age is : 23 to 85


In [136]:
# Frequency of each Region_Code value, most common first.
df_train.Region_Code.value_counts()

RG268    35934
RG283    29416
RG254    26840
RG284    19320
RG277    12826
RG280    12775
RG269     7863
RG270     7720
RG261     7633
RG257     6101
RG251     5950
RG282     5829
RG274     5286
RG272     5252
RG281     5093
RG273     4497
RG252     4286
RG279     3976
RG263     3687
RG275     3245
RG260     3110
RG256     2847
RG264     2793
RG276     2764
RG259     2586
RG250     2496
RG255     2018
RG258     1951
RG253     1858
RG278     1822
RG262     1788
RG266     1578
RG265     1546
RG271     1542
RG267     1497
Name: Region_Code, dtype: int64

In [137]:
# Frequency of each Occupation value.
df_train.Occupation.value_counts()

Self_Employed    100886
Salaried          71999
Other             70173
Entrepreneur       2667
Name: Occupation, dtype: int64

In [138]:
# Frequency of each Channel_Code value.
df_train.Channel_Code.value_counts()

X1    103718
X3     68712
X2     67726
X4      5569
Name: Channel_Code, dtype: int64

In [139]:
# Smallest and largest Vintage in the training data.
print('The range of Vintage is : {} to {}'.format(df_train['Vintage'].min(), df_train['Vintage'].max()))

The range of Vintage is : 7 to 135


In [140]:
# Frequency of each Credit_Product value; dropna=False keeps missing entries
# in the tally so their volume is visible.
df_train.Credit_Product.value_counts(dropna=False)

No     144357
Yes     72043
NaN     29325
Name: Credit_Product, dtype: int64

In [141]:
# Smallest and largest Avg_Account_Balance in the training data.
print('The range of Account Balance is : {} to {}'.format(
    df_train['Avg_Account_Balance'].min(),
    df_train['Avg_Account_Balance'].max(),
))

The range of Account Balance is : 20790 to 10352009


In [142]:
# Frequency of each Is_Active value.
df_train.Is_Active.value_counts()

No     150290
Yes     95435
Name: Is_Active, dtype: int64

In [143]:
# Treat a missing Credit_Product as its own category rather than dropping rows.
df_train['Credit_Product'] = df_train['Credit_Product'].fillna('NoInformation')

# Confirm that no missing values remain after the fill.
df_train.Credit_Product.value_counts()

No               144357
Yes               72043
NoInformation     29325
Name: Credit_Product, dtype: int64

In [144]:
# Inspect every column name. `features` is rebuilt as a filtered list in the
# following cell, so this assignment is only used for display.
features = df_train.columns
features

Index(['ID', 'Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code',
       'Vintage', 'Credit_Product', 'Avg_Account_Balance', 'Is_Active',
       'Is_Lead'],
      dtype='object')

In [145]:
# Model inputs are every column except the row identifier and the target.
excluded = ('ID', 'Is_Lead')
features = [col for col in df_train.columns if col not in excluded]

In [146]:
# (rows, columns) of the full training frame, before the train/validation split.
df_train.shape

(245725, 11)

In [147]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split on the target, so both parts keep the same
# Is_Lead ratio. random_state is pinned so the split -- and every metric
# computed on df_valid -- is reproducible.
# NOTE: this rebinds df_train to the 80% training part; re-running this cell
# without reloading the data would split the already-reduced frame again.
df_train, df_valid = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train['Is_Lead'],
    random_state=42,
)

In [148]:
# Report the size of each split.
train_shape, valid_shape = df_train.shape, df_valid.shape
print('Training Shape: {}'.format(train_shape))
print('Validating Shape: {}'.format(valid_shape))

Training Shape: (196580, 11)
Validating Shape: (49145, 11)


In [149]:
# Model inputs: the feature columns of each split.
xtrain = df_train.loc[:, features]
xvalid = df_valid.loc[:, features]

# Targets as plain NumPy arrays.
ytrain = df_train['Is_Lead'].values
yvalid = df_valid['Is_Lead'].values

In [150]:
# Features stored with object dtype are treated as categorical.
categorial_columns = [col for col in features if df_train[col].dtype == object]
print(categorial_columns)

['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']


In [151]:
# Every feature that is not object dtype is treated as numerical.
numerical_columns = [col for col in features if df_train[col].dtype != object]
print(numerical_columns)

['Age', 'Vintage', 'Avg_Account_Balance']


In [152]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline 
from sklearn.linear_model import LogisticRegression

In [153]:
# Standardise the numerical columns and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at predict time (for example a Region_Code that
# occurs only in the validation or test rows).
transformer = make_column_transformer(
    (StandardScaler(), numerical_columns),
    (OneHotEncoder(handle_unknown='ignore'), categorial_columns),
)

In [154]:
# Logistic regression classifier, constructed with the library's default hyperparameters.
model = LogisticRegression()

In [155]:
# Chain preprocessing and the classifier so both are fitted and applied together.
clf = make_pipeline(transformer,model)

In [156]:
# Fit the whole pipeline (preprocessing + classifier) on the training split only.
clf.fit(xtrain,ytrain)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'Vintage',
                                                   'Avg_Account_Balance']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Gender', 'Region_Code',
                                                   'Occupation', 'Channel_Code',
                                                   'Credit_Product',
                                                   'Is_Active'])])),
                ('logisticregression', LogisticRegression())])

In [157]:
# Predicted class labels for the hold-out validation rows.
ypred = clf.predict(xvalid)

In [158]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [159]:
# Overall accuracy on the validation split.
# NOTE(review): Is_Lead is imbalanced, so accuracy alone can hide weak
# performance on the positive class -- read it together with the confusion matrix.
print(accuracy_score(yvalid,ypred))

0.8542883304507071


In [160]:
# Confusion matrix on the validation split: rows are true classes,
# columns are predicted classes.
cm = confusion_matrix(y_true=yvalid, y_pred=ypred)
print(cm)

[[36271  1216]
 [ 5945  5713]]


In [165]:
# Load the test data from a path relative to the working directory
# and preview the first rows.
df_test = pd.read_csv('test_mSzZ8RL.csv')
df_test.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No


In [166]:
# Apply the same missing-value treatment that was used on the training data.
df_test['Credit_Product'] = df_test['Credit_Product'].fillna('NoInformation')

# Confirm that no missing values remain after the fill.
df_test.Credit_Product.value_counts()

No               61608
Yes              31182
NoInformation    12522
Name: Credit_Product, dtype: int64

In [167]:
# Select the same feature columns the model was trained on.
xtest = df_test.loc[:, features]

In [168]:
# Row identifiers of the test set, kept to label the predictions in the submission.
test_id=df_test['ID']

In [169]:
# Predicted class labels for the test rows.
# NOTE(review): predict() returns hard labels; if the submission is scored on
# a ranking metric such as ROC-AUC, predict_proba would be needed -- confirm.
ytest = clf.predict(xtest)

In [170]:
# Pair each test ID with its predicted label.
data = dict(ID=test_id, Is_Lead=ytest)
df_submission = pd.DataFrame(data)

In [171]:
# Write the submission to the working directory; index=False keeps the row
# index out of the CSV so only ID and Is_Lead are written.
df_submission.to_csv('submission.csv',index=False)