### Import libraries

In [None]:
# Import required packages

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


## Load the Data

In [None]:
# Read the training set into a DataFrame.
# NOTE(review): "path" looks like a placeholder -- point it at the real
# training CSV before running.
train_df = pd.read_csv("path")
print(f"train_df Shape: {train_df.shape}")
# Preview the first rows (rendered as the cell output in a notebook).
train_df.head()

In [None]:
# Read the test set into a DataFrame.
# NOTE(review): "path" looks like a placeholder (and is the same literal used
# for the training set) -- point it at the real test CSV before running.
test_df = pd.read_csv("path")
print(f"test_df Shape: {test_df.shape}")
# Preview the first rows (rendered as the cell output in a notebook).
test_df.head()

## Explore, Clean, Validate, and Visualize the Data


In [None]:
# Structural overview of the training data (dtypes and non-null counts).
train_df.info()
# Missing values per column. Printed explicitly: a notebook cell only renders
# its *last* expression, so a bare `train_df.isnull().sum()` in the middle of
# the cell would be computed and silently thrown away.
print(train_df.isnull().sum())
# Preview of the first rows (rendered as the cell output).
train_df.head()


In [None]:
# Structural overview of the test data (dtypes and non-null counts).
test_df.info()
# Missing values per column. Printed explicitly: a notebook cell only renders
# its *last* expression, so a bare `test_df.isnull().sum()` in the middle of
# the cell would be computed and silently thrown away.
print(test_df.isnull().sum())
# Preview of the last rows (rendered as the cell output).
test_df.tail()


In [None]:
# Show the distinct values of each categorical feature in the training set,
# one printed array per column, in the order listed.
for column_name in (
    'Gender',
    'PaperlessBilling',
    'MultiDeviceAccess',
    'ParentalControl',
    'SubtitlesEnabled',
    'DeviceRegistered',
    'ContentType',
    'SubscriptionType',
    'PaymentMethod',
    'GenrePreference',
):
    print(train_df[column_name].unique())


In [None]:
# Show how often each category occurs for every categorical feature in the
# training set, one printed frequency table per column, in the order listed.
for column_name in (
    'Gender',
    'PaperlessBilling',
    'MultiDeviceAccess',
    'ParentalControl',
    'SubtitlesEnabled',
    'DeviceRegistered',
    'ContentType',
    'SubscriptionType',
    'PaymentMethod',
    'GenrePreference',
):
    print(train_df[column_name].value_counts())

In [None]:
plt.figure(figsize=(10, 10))

# Correlation of every numeric feature with the target, strongest first.
# `numeric_only=True` is required: train_df still holds the raw string-valued
# categorical columns, and pandas >= 2.0 raises on them instead of silently
# skipping them as older versions did.
churn_correlation = (
    train_df.corr(numeric_only=True)['Churn']
    .sort_values(ascending=False)
    .to_frame()
)
sns.heatmap(churn_correlation, annot=True)

plt.title('Correlation of Features with Churn', fontsize=16, fontweight='bold', pad=20)
plt.show()

In [None]:
# One-hot encode the categorical features of the training set; every listed
# column is replaced by one indicator column per category.
categorical_columns = [
    'Gender',
    'PaperlessBilling',
    'MultiDeviceAccess',
    'ParentalControl',
    'SubtitlesEnabled',
    'DeviceRegistered',
    'ContentType',
    'SubscriptionType',
    'PaymentMethod',
    'GenrePreference',
]
one_hot_encoded_data = pd.get_dummies(train_df, columns=categorical_columns)
print(one_hot_encoded_data)

In [None]:
# Replace the spaces in the generated indicator-column names with underscores
# for readability. All renames are applied in a single call.
readable_column_names = {
    'PaymentMethod_Bank transfer': 'PaymentMethod_Bank_transfer',
    'PaymentMethod_Credit card': 'PaymentMethod_Credit_card',
    'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check',
    'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check',
    'ContentType_TV Shows': 'ContentType_TV_Shows',
}
one_hot_encoded_data.rename(columns=readable_column_names, inplace=True)

# Verify the resulting columns and dtypes.
one_hot_encoded_data.info()


In [None]:
# Sanity check of the encoded training data: print the distinct values found
# in each listed indicator column, in the order listed.
for indicator_column in (
    'Gender_Female',
    'Gender_Male',
    'PaperlessBilling_No',
    'PaperlessBilling_Yes',
    'ParentalControl_No',
    'ParentalControl_Yes',
    'SubtitlesEnabled_No',
    'SubtitlesEnabled_Yes',
    'DeviceRegistered_Computer',
    'DeviceRegistered_Mobile',
    'DeviceRegistered_TV',
    'DeviceRegistered_Tablet',
    'ContentType_Both',
    'ContentType_Movies',
    'ContentType_TV_Shows',
    'SubscriptionType_Basic',
    'SubscriptionType_Premium',
    'SubscriptionType_Standard',
    'PaymentMethod_Bank_transfer',
    'PaymentMethod_Credit_card',
    'PaymentMethod_Electronic_check',
    'PaymentMethod_Mailed_check',
    'GenrePreference_Action',
    'GenrePreference_Comedy',
    'GenrePreference_Drama',
    'GenrePreference_Fantasy',
    'GenrePreference_Sci-Fi',
):
    print(one_hot_encoded_data[indicator_column].unique())


In [None]:
# One-hot encode the categorical features of the test set, using the same
# column list as for the training set.
test_categorical_columns = [
    'Gender',
    'PaperlessBilling',
    'MultiDeviceAccess',
    'ParentalControl',
    'SubtitlesEnabled',
    'DeviceRegistered',
    'ContentType',
    'SubscriptionType',
    'PaymentMethod',
    'GenrePreference',
]
one_hot_encoded_test_data = pd.get_dummies(test_df, columns=test_categorical_columns)
print(one_hot_encoded_test_data)

In [None]:
# Replace the spaces in the generated indicator-column names with underscores
# so the test columns are spelled like the training columns. All renames are
# applied in a single call.
readable_test_column_names = {
    'PaymentMethod_Bank transfer': 'PaymentMethod_Bank_transfer',
    'PaymentMethod_Credit card': 'PaymentMethod_Credit_card',
    'PaymentMethod_Electronic check': 'PaymentMethod_Electronic_check',
    'PaymentMethod_Mailed check': 'PaymentMethod_Mailed_check',
    'ContentType_TV Shows': 'ContentType_TV_Shows',
}
one_hot_encoded_test_data.rename(columns=readable_test_column_names, inplace=True)

# Verify the resulting columns and dtypes.
one_hot_encoded_test_data.info()

In [None]:
# Distribution of MonthlyCharges for each Churn class.
# The title previously read "MonthlyChanges"; the plotted column is
# MonthlyCharges, so the label is corrected to match.
sns.boxplot(x='Churn', y='MonthlyCharges', data=one_hot_encoded_data).set(title='Churn vs. MonthlyCharges')


In [None]:
plt.figure(figsize=(20, 20))

# Correlation of every encoded feature with the target, strongest first.
# `numeric_only=True` keeps int, float and boolean (indicator) columns and
# skips the string-valued CustomerID column, which makes a plain `.corr()`
# raise on pandas >= 2.0.
encoded_churn_correlation = (
    one_hot_encoded_data.corr(numeric_only=True)['Churn']
    .sort_values(ascending=False)
    .to_frame()
)
sns.heatmap(encoded_churn_correlation, annot=True, cmap="YlGnBu")

plt.title('Correlation of Features with Churn', fontsize=16, fontweight='bold', pad=20)
plt.show()

In [None]:
plt.figure(figsize=(25, 25))

# Pairwise correlation heatmap of all encoded features.
# `numeric_only=True` keeps int, float and boolean (indicator) columns and
# skips the string-valued CustomerID column, which makes a plain `.corr()`
# raise on pandas >= 2.0.
dataplot = sns.heatmap(one_hot_encoded_data.corr(numeric_only=True), annot=True, linewidth=2)

# Display the heatmap.
plt.show()

## Make predictions 


In [None]:
# Columns removed before modelling.
# NOTE(review): the reason for excluding TotalCharges and the Gender
# indicators is not recorded here -- confirm it is intentional.
columns_to_discard = ['CustomerID', 'TotalCharges', 'Gender_Male', 'Gender_Female']
one_hot_encoded_data = one_hot_encoded_data.drop(columns=columns_to_discard)

# Target vector and feature matrix (everything except the target).
y = one_hot_encoded_data['Churn']
X = one_hot_encoded_data.drop(columns=['Churn'])


In [None]:
#one_hot_encoded_data = preprocessing.normalize(one_hot_encoded_data)

In [None]:
# Hold out 33% of the labelled data for a quick accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Candidate models. The trailing figures are the author's previously recorded
# results (presumably hold-out accuracy % and cross-validated ROC AUC % --
# TODO confirm).
rf = RandomForestClassifier(random_state=46)   # 82-73
dt = DecisionTreeClassifier(random_state=46)   # 72-56
kn = KNeighborsClassifier()                    # 79-63
ab = AdaBoostClassifier(random_state=46)       # 82-74,28
lg = LogisticRegression(random_state=46)       # 82-74,77

# (printed label, estimator) pairs, evaluated in this order. Logistic
# regression must stay last: after the loop `dummy_clf`, `preds` and `score`
# refer to the last model evaluated, and later cells plot and predict with
# `dummy_clf`.
candidate_models = [
    ('Random forest acc : ', rf),
    ('Decision tree : ', dt),
    ('KNeighbors Classifier : ', kn),
    ('Ada Boost : ', ab),
    ('Logistic regression : ', lg),
]

for label, model in candidate_models:
    # `fit` returns the estimator itself, so `dummy_clf` is simply an alias
    # for the current (fitted) model -- it is not a DummyClassifier.
    dummy_clf = model.fit(X_train, y_train)

    # Hold-out accuracy; arguments follow the documented (y_true, y_pred) order.
    preds = model.predict(X_test)
    print(label)
    print(accuracy_score(y_test, preds))

    # Mean ROC AUC over 10-fold cross-validation on the full labelled data
    # (cross_val_score clones and refits the estimator for each fold).
    score = cross_val_score(dummy_clf, X, y, cv=10, scoring='roc_auc').mean()
    print(score)


In [None]:
# Plotting the ROC curve of the last fitted model (`dummy_clf`) on the
# hold-out split.
#
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2; its
# replacement is `RocCurveDisplay.from_estimator`. Use the replacement when
# the installed version provides it, and fall back to the legacy helper
# otherwise.
# NOTE(review): the top-of-file `from sklearn.metrics import plot_roc_curve`
# must also be dropped for this notebook to import on scikit-learn >= 1.2.
from sklearn.metrics import RocCurveDisplay

if hasattr(RocCurveDisplay, 'from_estimator'):
    RocCurveDisplay.from_estimator(dummy_clf, X_test, y_test)
else:
    plot_roc_curve(dummy_clf, X_test, y_test)
plt.show()

In [None]:
# Score the test set on exactly the features the model was trained on.
# The training matrix X had CustomerID, TotalCharges and the Gender indicators
# removed, so dropping only CustomerID here would hand the model extra columns
# and make `predict_proba` fail. Re-indexing on X.columns selects the training
# features in training order; a training column absent from the test frame
# (e.g. a category that never occurs there) is added as all zeros.
test_features = one_hot_encoded_test_data.reindex(columns=X.columns, fill_value=0)

# Probability of the positive class (second column of predict_proba).
predicted_probability = dummy_clf.predict_proba(test_features)[:, 1]

In [None]:
# Assemble the output table: one row per test customer with the model's
# predicted probability.
customer_ids = one_hot_encoded_test_data[['CustomerID']].values[:, 0]
prediction_columns = {
    'CustomerID': customer_ids,
    'predicted_probability': predicted_probability,
}
prediction_df = pd.DataFrame(prediction_columns)

In [None]:
# Final sanity check of the output table: its dimensions, then a preview of
# the first ten rows (rendered as the cell output in a notebook).
print(prediction_df.shape)
prediction_df.head(n=10)