##Kaggle Competition

##Setup

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy import stats
import yaml, time, sys, os, glob

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

DATASET = "Kaggle_Competition"
SPLIT_TRAINING = True
DEBUG = False
SEED = 42

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

In [49]:
if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(d): os.makedirs(d)
  if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['doc','orig','data','output']: makedirs(d)

##Dataset

In [50]:
df = pd.read_pickle(f"{ROOT}/data/df_train_clean.pkl")
print(df.shape)
df.head()

(6556, 26)


Unnamed: 0,Customer,Churn,Card,Start_Date,Customer_Service_Calls,Credit_Limit,Total_Revolving_Balance,Average_Open_To_Buy,Average_Utilisation_Ratio,Age,Gender,Education,Marital_Status,Dependents,Income,HasCreditCard,HasCurrent,HasOnDemandDeposit,HasDeposit,HasInvestment,HasMortgage,HasLoan,HasJoint,Count,Sum,Mean
0,797197508,No,Silver,2020-06-01,1,2315.0,1565,750.0,0.676,48,F,Graduate,Married,2,Less than €30K,True,True,False,True,True,True,False,False,90,4827.96,53.644
1,812854728,No,Silver,2020-01-01,1,7645.0,2076,5569.0,0.272,58,F,Second level,Single,5,Less than €30K,True,False,False,False,False,False,False,False,75,7709.97,102.7996
2,768000743,No,Silver,2020-01-01,2,6394.0,0,6394.0,0.0,55,F,Second level,Single,2,Less than €30K,True,True,False,False,False,False,False,False,69,3470.06,50.290725
3,722161439,No,Silver,2020-08-01,3,4663.0,0,4663.0,0.0,41,F,Graduate,,2,€30K - €50K,True,True,False,True,True,True,True,False,81,3831.01,47.29642
4,759029725,No,Silver,2021-02-01,4,1879.0,1486,393.0,0.791,37,F,Second level,Single,1,Less than €30K,True,True,False,True,True,True,False,False,88,5202.04,59.114091


In [51]:
df_test = pd.read_pickle(f"{ROOT}/data/df_test_clean.pkl")
print(df_test.shape)
df_test.head()

(3542, 25)


Unnamed: 0,Customer,Card,Start_Date,Customer_Service_Calls,Credit_Limit,Total_Revolving_Balance,Average_Open_To_Buy,Average_Utilisation_Ratio,Age,Gender,Education,Marital_Status,Dependents,Income,HasCreditCard,HasCurrent,HasOnDemandDeposit,HasDeposit,HasInvestment,HasMortgage,HasLoan,HasJoint,Count,Sum,Mean
0,774663629,Gold,2020-01-01,2,29663.0,1743,27920.0,0.059,51,M,Graduate,Single,3,€50K - €80K,True,True,False,True,False,False,False,False,93,14638.0,157.397849
1,720420396,Silver,2020-11-01,2,2032.0,1195,837.0,0.588,45,F,Unknown,Divorced,3,€30K - €50K,True,False,False,False,False,False,False,False,80,4478.97,55.987125
2,815283379,Silver,2020-01-01,1,17268.0,1197,16071.0,0.069,57,M,Graduate,Single,3,€100K +,True,True,False,False,False,False,False,False,90,4077.01,45.300111
3,764861610,Silver,2020-05-01,3,2264.0,0,2264.0,0.0,38,F,Primary level,Single,2,Less than €30K,True,True,False,True,True,False,True,False,28,1204.02,43.000714
4,742798818,Silver,2020-01-01,1,2192.0,1464,728.0,0.668,45,F,MSc,Unknown,4,Less than €30K,True,False,True,True,False,True,True,True,89,4905.03,55.112697


##Model Building 

In [52]:
target = "Churn"

cat_features = [c for c in df.select_dtypes("category").columns if c not in target]
num_features = [c for c in df.select_dtypes(["int","float"]).columns if c not in target]
features = cat_features + num_features

print(f"Traget: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")
print(f"Number of Features: {len(features)}")

Traget: Churn
Categorical Features: ['Card', 'Start_Date', 'Gender', 'Education', 'Marital_Status', 'Income', 'HasCreditCard', 'HasCurrent', 'HasOnDemandDeposit', 'HasDeposit', 'HasInvestment', 'HasMortgage', 'HasLoan', 'HasJoint']
Numerical Features: ['Customer', 'Customer_Service_Calls', 'Credit_Limit', 'Total_Revolving_Balance', 'Average_Open_To_Buy', 'Average_Utilisation_Ratio', 'Age', 'Dependents', 'Count', 'Sum', 'Mean']
Number of Features: 25


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import chi2, SelectPercentile

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, recall_score

In [54]:
cat_preprocessor = Pipeline(steps=[
    ('imput', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
    ('select', SelectPercentile(chi2, percentile=80)),
])

num_preprocessor = Pipeline(steps=[
    ('imput', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_preprocessor, cat_features),
    ('num', num_preprocessor, num_features),
])

model = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', XGBClassifier()),
])

In [55]:
x_train = df[features] # x
y_train = df[target].replace({"No":0,"Yes":1}) # y

In [56]:
model.fit(x_train,y_train)

In [57]:
y_pred = model.predict(x_train)

In [58]:
print(classification_report(y_train,y_pred,digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      5522
           1     1.0000    1.0000    1.0000      1034

    accuracy                         1.0000      6556
   macro avg     1.0000    1.0000    1.0000      6556
weighted avg     1.0000    1.0000    1.0000      6556



In [59]:
y_pred_test = model.predict(df_test)

In [60]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [61]:
df_test['Churn'] = y_pred_test
df_test['Churn'] = df_test['Churn'].replace({0:"No", 1:"Yes"})

In [62]:
df_results = df_test[['Customer','Churn']]

In [63]:
df_results.head()

Unnamed: 0,Customer,Churn
0,774663629,No
1,720420396,No
2,815283379,No
3,764861610,No
4,742798818,No


In [64]:
df_results.to_csv(f"{ROOT}/output/submission.csv", index=False)

Score: 0.94006