In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the raw customer table; the path is relative to the notebook's working directory.
df = pd.read_csv("Default.csv")

In [5]:
# Preview the first five rows to sanity-check the columns and value formats.
df.head(5)

Unnamed: 0,Customer ID,Country,State,Postal Code,Gender,Senior Citizen,Partner,Dependents,tenure,Phone Service,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Default
0,7590-VHVEA,Australia,New South Wales,2000,Female,0,Yes,No,1,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,7590-VHVEG,Australia,New South Wales,2035,Female,0,Yes,No,1,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
2,5575-GNVDE,Australia,New South Wales,2014,Male,0,No,No,34,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3,3668-QPYBK,Australia,New South Wales,2041,Male,0,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
4,7795-CFOCW,Australia,New South Wales,2050,Male,0,No,No,45,No,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No


In [None]:
# Print column dtypes and non-null counts to spot missing values and mis-typed columns.
df.info()

In [None]:
# Summary statistics for the numeric columns. The parentheses matter: a bare
# `df.describe` only displays the bound method's repr instead of computing anything.
df.describe()

# Encode Yes/No columns as 1/0

In [None]:
# Columns recoded from "Yes"/other to 1/0 (the target 'Default' is included).
one_hot_col = [
    'Default', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service',
    'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
    'Streaming Movies', 'Paperless Billing',
]

# "Yes" becomes 1 and every other value becomes 0, in one vectorised step.
# Also accepting an existing 1 keeps the cell idempotent: re-running it after the
# columns are already encoded leaves them unchanged instead of zeroing every flag.
# NOTE(review): any value other than "Yes" (e.g. a third category) collapses to 0 —
# confirm that every listed column is strictly Yes/No.
df[one_hot_col] = df[one_hot_col].isin(["Yes", 1]).astype("int64")

# Create X and y variables

In [None]:
# Build the feature matrix X and the target vector y.
# The identifier and the target are excluded from the features.
features = df.drop(columns=['Customer ID', 'Default'])

# Fill missing totals BEFORE encoding, and assign the result back.
# `fillna(..., inplace=True)` returns None, so binding its return value to X
# would discard the feature matrix entirely.
# NOTE(review): if 'Total Charges' was read as text rather than numbers, convert it
# first (e.g. with pd.to_numeric) — confirm the dtype in the df.info() output.
features['Total Charges'] = features['Total Charges'].fillna(0)

# One-hot encode the remaining categorical columns.
X = pd.get_dummies(features)
y = df['Default']


# Create train/test split

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)


#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

   # Build Pipelines

In [None]:
# One scaler + classifier pipeline per algorithm, keyed by a short name.
# The same keys are used to look up each pipeline's hyperparameter grid.
pipelines = {
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=1234)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1234)),
}

   # Setup hyperparameters

In [None]:
# Grid-search candidates per algorithm, keyed like `pipelines`.
# Parameter names use the "<pipeline step>__<parameter>" form, where make_pipeline
# names each step after the lowercased estimator class.
hyperparams = {
    "rf": {'randomforestclassifier__n_estimators': [100, 200, 300]},
    'gb': {'gradientboostingclassifier__n_estimators': [100, 200, 300]},
}

# Fit Model

In [None]:
# Tune each pipeline with a 10-fold cross-validated grid search and keep the
# fitted search object under the pipeline's short name.
fit_models = {}

for algo in pipelines:
    pipeline = pipelines[algo]
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=hyperparams[algo],
        cv=10,
        n_jobs=-1,  # use all available cores
    )
    # fit() returns the search object itself, so the fitted model can be stored directly.
    fit_models[algo] = model.fit(X_train, y_train)

# Evaluate The Model

In [None]:
# Metric helpers for the evaluation cells. The F1 function is spelled `f1_score`
# (all lowercase); `F1_score` does not exist and raises ImportError.
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve

In [None]:
# Report F1, precision and recall on the held-out test set for every tuned model.
for algorithm, model in fit_models.items():
    yhat = model.predict(X_test)
    # The template has four placeholders, so four values are needed:
    # the model name followed by F1, precision and recall, in that order.
    print("{} Metrics - F1: {}, Precision: {}, Recall: {}".format(
        algorithm,
        f1_score(y_test, yhat),
        precision_score(y_test, yhat),
        recall_score(y_test, yhat),
    ))

In [None]:
from matplotlib import pyplot as plt

# Plot one ROC curve per tuned model on the held-out test set.
fig, ax = plt.subplots()

for key, label in (('rf', 'Random Forest'), ('gb', 'Gradient Boosted')):
    # roc_curve sweeps a decision threshold, so it needs a continuous score.
    # Use the predicted probability of the positive class (column 1) rather than
    # the hard 0/1 labels from predict(), which collapse the curve to one point.
    scores = fit_models[key].predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test.values, scores)
    ax.plot(fpr, tpr, marker='.', label=label)

# axis labels
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curves on the test set')
# show the legend
ax.legend()
# show the plot
plt.show()