## Importing Necessary Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.target import FeatureCorrelation

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score, confusion_matrix


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Datasets

In [None]:
# Read the competition training set into `data`; later cells transform this frame step by step.
data = pd.read_csv('/kaggle/input/customer-churn-prediction-2020/train.csv')
# Preview the first rows to check that the file parsed as expected.
data.head()

## Eliminating unnecessary features

In [None]:
# Split column names by dtype: everything that is not object-typed counts as numeric.
num_cols = [name for name, dtype in data.dtypes.items() if dtype != 'O']
# The remaining (object-typed) columns are categorical, except the target `churn`.
cat_cols = [name for name in data.columns if name != 'churn' and name not in num_cols]

In [None]:
# Numeric features only; `X` and `y` are rebuilt later, once the frame has been encoded.
X = data[num_cols]
# Encode the yes/no target as 1/0 so it can be correlated with the numeric features.
y = data['churn'].map({'yes': 1, 'no': 0})
# Yellowbrick visualizer: fit on the features and target, then render the per-feature correlation chart.
visualizer = FeatureCorrelation(labels=num_cols)
visualizer.fit(X, y)
visualizer.show()

In [None]:
# Compute the correlation matrix on the numeric columns only.
# `data` still contains string columns (state, area_code, yes/no flags) at this point,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr = data.select_dtypes(include='number').corr()

# Mask the upper triangle: the matrix is symmetric, so only the lower half is informative.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio on the axes created above.
# vmax=.3 saturates the colour scale, so every correlation >= 0.3 shares the strongest colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Pairwise correlation of numeric features');

From the plots above, we can drop **total_day_minutes, total_night_minutes, total_eve_minutes and total_intl_minutes** to avoid a **multicollinearity** problem, and **total_day_calls, total_night_calls, total_eve_calls** because they are only very **weakly correlated** with the **dependent feature**.

In [None]:
# Remove the features judged redundant from the correlation plots:
# the four *_minutes columns and the three *_calls columns.
redundant_cols = [
    'total_day_minutes', 'total_night_minutes', 'total_eve_minutes', 'total_intl_minutes',
    'total_day_calls', 'total_night_calls', 'total_eve_calls',
]
data = data.drop(columns=redundant_cols)
data.columns

## Handling Categorical Data

In [None]:
# Display the categorical (object-typed, non-target) column names computed earlier.
cat_cols

In [None]:
# One-hot encode the two multi-valued categoricals; every level gets its own indicator column
# (no drop_first), named `state_<value>` / `area_code_<value>` and appended after the other columns.
data = pd.get_dummies(data, columns=['state','area_code'])

In [None]:
# Encode the three yes/no columns (two plan flags and the target) as 1/0.
# Not idempotent: running this a second time maps the already-encoded 1/0 values to NaN.
yes_no = {'yes': 1, 'no': 0}
for binary_col in ('international_plan', 'voice_mail_plan', 'churn'):
    data[binary_col] = data[binary_col].map(yes_no)

## Feature Scaling

In [None]:
# Target vector as a NumPy array; feature matrix is every remaining column, in frame order.
y = data['churn'].values
X = data.drop(columns='churn')

In [None]:
# Standardize every feature; `scaler` is kept because the submission cell reuses it on the test set.
# NOTE(review): the scaler is fit on the full training frame before the train/validation split,
# so validation scores are computed on data whose statistics the scaler has already seen.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Handling Imbalanced Dataset

In [None]:
# Oversample the minority class with SMOTE so both classes end up with the same number of rows.
# - `n_jobs` is no longer passed: it is deprecated/removed in recent imbalanced-learn releases
#   and the value used before (1) was the default behaviour anyway.
# - `random_state` is fixed so the synthetic samples (and every downstream score) are reproducible.
# NOTE(review): resampling happens before train_test_split, so synthetic points interpolated from
# rows that land in the training fold can end up in the validation fold; validation metrics
# are therefore optimistic. Resampling only the training fold would avoid this.
sm = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
X, y = sm.fit_resample(X, y)

In [None]:
# Check the class balance after resampling.
# The vector is passed as the keyword `x`: on seaborn >= 0.12 the first positional argument
# is `data`, so `sns.countplot(y)` no longer counts the label values.
ax = sns.countplot(x=y)
ax.set(xlabel='churn (0 = no, 1 = yes)', ylabel='count', title='Class balance after SMOTE');

## Basic Modelling

In [None]:
# Hold out 20% for validation, stratified on the target; the seed makes the split reproducible
# so that metrics can be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
def create_classification_report(Y_test, Y_pred):
    """Print headline binary-classification metrics and plot the confusion matrix.

    Parameters
    ----------
    Y_test : array-like of {0, 1}
        Ground-truth labels (1 = churned, 0 = did not churn).
    Y_pred : array-like of {0, 1}
        Hard class predictions, same length as ``Y_test``.

    Returns
    -------
    pandas.DataFrame
        A single ``score`` column indexed by metric name
        (accuracy, f1-score, precision, recall, roc auc score).

    Notes
    -----
    The ROC AUC is computed from hard predictions, not probabilities, so it
    reflects a single operating point rather than the full ROC curve.
    """
    print('--------Classification Report---------\n')
    scores = pd.DataFrame(
        {
            'score': [
                accuracy_score(Y_test, Y_pred),
                f1_score(Y_test, Y_pred),
                precision_score(Y_test, Y_pred),
                recall_score(Y_test, Y_pred),
                roc_auc_score(Y_test, Y_pred),
            ]
        },
        index=['accuracy', 'f1-score', 'precision', 'recall', 'roc auc score'],
    )
    print(scores)
    print('\n--------Plotting Confusion Matrix---------')
    # Pin the class order to [0, 1] so row/column 0 is always "didn't churn" and
    # row/column 1 is always "churned"; the tick labels must follow that order.
    labels = ["didn't churn", 'churned']
    matrix = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
    # fmt='d' prints integer counts instead of scientific notation.
    ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='RdYlGn', annot_kws={'size': 16}, xticklabels=labels, yticklabels=labels)
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    ax.set(xlabel='Predicted', ylabel='Actual')
    return scores

In [None]:
# Baseline random forest with library defaults, spelled out only where they matter.
# - `min_impurity_split` was removed in scikit-learn 1.0 and `max_features='auto'` in 1.3;
#   passing either makes this cell fail on current releases. For classifiers 'auto' meant 'sqrt'.
# - `verbose` is left at its default (silent) to avoid flooding the cell output with fit logs.
# - `random_state` is fixed so the baseline score is reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    bootstrap=True,
    n_jobs=2,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Score the baseline on the hold-out split.
# The report expects (ground truth, predictions) in that order; swapping them exchanges
# precision with recall and transposes the confusion matrix.
y_pred = model.predict(X_test)
create_classification_report(y_test, y_pred)

## Cross Validation and Hyperparameter Tuning

In [None]:
# Exhaustive 10-fold grid search over forest size, per-split feature budget and tree depth.
# 'auto' is not listed for max_features: it was an alias of 'sqrt' for classifiers (so it only
# duplicated work) and is rejected by scikit-learn >= 1.3.
param_grid = {
    'n_estimators': [200, 400],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [6, 7, 8, 10, None],
    'criterion': ['gini'],
}
# A fixed seed on the estimator makes the selected parameters reproducible between runs.
gsc = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv=10, scoring='accuracy', verbose=0)
gsc.fit(X_train, y_train)
gsc.best_params_

In [None]:
# Score the tuned model (GridSearchCV predicts with its refit best estimator) on the hold-out split.
# Ground truth goes first, predictions second, matching the report's signature.
y_pred = gsc.predict(X_test)
report = create_classification_report(y_test, y_pred)

## Submission

In [None]:
# Show the hyperparameters selected by the grid search, for reference when fitting the final model.
gsc.best_params_

In [None]:
# Refit on the full (resampled) training data using the hyperparameters the grid search selected.
# Reading them from `gsc.best_params_` keeps this cell in sync with the search instead of
# relying on values copied by hand from an earlier run.
model = RandomForestClassifier(**gsc.best_params_, random_state=42)
model.fit(X, y)

In [None]:
test_data = pd.read_csv('/kaggle/input/customer-churn-prediction-2020/test.csv')

# Apply the same column pruning as on the training frame; `test_data` is left untouched
# because its `id` column is needed for the submission file.
test = test_data.drop(
    columns=['total_day_minutes', 'total_night_minutes', 'total_eve_minutes', 'total_intl_minutes', 'total_day_calls', 'total_night_calls', 'total_eve_calls']
)

test['international_plan'] = test['international_plan'].map({'yes': 1, 'no': 0})
test['voice_mail_plan'] = test['voice_mail_plan'].map({'yes': 1, 'no': 0})

# One-hot encode exactly as for training (no drop_first), then align to the training feature
# columns: same names, same order. Levels absent from the test file become all-zero columns,
# and columns the model never saw (such as `id`) are dropped. The model was fit on a plain
# array, so any column-order mismatch would be silently scored against the wrong features.
test_encoded = pd.get_dummies(test, columns=['state','area_code'])
feature_cols = [col for col in data.columns if col != 'churn']
test_encoded = test_encoded.reindex(columns=feature_cols, fill_value=0)

# Reuse the scaler fitted on the training data; refitting on the test set would standardize
# it with different means/variances than the ones the model was trained with.
# Note: this rebinds `X_test`, which previously held the validation split.
X_test = scaler.transform(test_encoded)

In [None]:
# Predict churn (0/1) for the competition test rows; `X_test` here is the matrix built in the previous cell.
pred = model.predict(X_test)

In [None]:
# Pair each test id with its prediction, then convert 0/1 back to the 'no'/'yes' labels.
result = pd.DataFrame({'id': test_data['id'], 'churn': pred})
result['churn'] = result['churn'].map({0: 'no', 1: 'yes'})
result

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
result.to_csv('submission.csv', index=False)

In [None]:
# Sanity check: how many test rows were predicted as 'yes' and as 'no'.
result['churn'].value_counts()

In [None]:
# Sanity check: (rows, columns) of the submission frame.
result.shape