<a href="https://colab.research.google.com/github/cfreeman22/data_mining_1_D209/blob/main/D209_data_mining1_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Importing GridSearchCV and Pipeline from the sklearn library
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



# Library for scaling the data
from sklearn.preprocessing import StandardScaler
# Library to split data
from sklearn.model_selection import train_test_split

# importing the kneighbors classifier
from sklearn.neighbors import KNeighborsClassifier

# Importing metrics to evaluate our model

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
    precision_recall_curve,
    roc_curve, classification_report, 
)


In [None]:
# Custom functions
# defining a function to compute different metrics to check performance of a classification model built using sklearn

def model_performance_classification_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    Score a fitted classifier at a custom decision threshold.

    The second column of ``model.predict_proba(predictors)`` is compared with
    ``threshold``; observations strictly above it are labelled 1, the rest 0.
    Accuracy, recall, precision and F1 are then computed against ``target``.

    model: classifier exposing ``predict_proba``
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1

    Returns a one-row DataFrame (index 0) with the columns
    "Accuracy", "Recall", "Precision" and "F1".
    """
    # thresholded predictions from the second predict_proba column
    positive_proba = model.predict_proba(predictors)[:, 1]
    pred = np.round(positive_proba > threshold)

    # metric name -> scoring function, in the column order of the result
    scorers = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }
    scores = {name: scorer(target, pred) for name, scorer in scorers.items()}

    return pd.DataFrame(scores, index=[0])

In [None]:
# defining a function to plot the confusion_matrix of a classification model built using sklearn
def confusion_matrix_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    To plot the confusion_matrix, based on the threshold specified, with percentages

    The second column of ``model.predict_proba(predictors)`` is compared with
    ``threshold`` to obtain the predicted labels. Each heatmap cell is
    annotated with its count and its share of all observations.

    model: classifier exposing ``predict_proba``
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1

    Returns None; the heatmap is drawn on a new matplotlib figure.
    """
    pred_prob = model.predict_proba(predictors)[:, 1]
    y_pred = np.round(pred_prob > threshold)

    cm = confusion_matrix(target, y_pred)
    # total number of observations, computed once instead of once per cell
    total = cm.sum()
    # Reshape to the matrix's own shape rather than a hard-coded (2, 2):
    # confusion_matrix is smaller than 2x2 when only one class occurs in both
    # target and predictions, and a fixed reshape would raise ValueError.
    labels = np.asarray(
        [
            "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
            for item in cm.flatten()
        ]
    ).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    # fmt="" because the annotations are already formatted strings
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

In [None]:
# Mounting Google Drive at /content/drive in the Colab runtime so that files
# stored on the drive can be read by path
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Reading the churn data set from the mounted Google Drive into a DataFrame.
# NOTE(review): current pandas documentation lists the literal string "None"
# among read_csv's default NaN markers, and a later cell renames an
# InternetService category called "None" -- confirm that value is not being
# parsed as missing with the pandas version in use (see keep_default_na /
# na_values).
df = pd.read_csv('/content/drive/My Drive/churn_clean.csv')

## Data Preparation steps

In [None]:
# Target variable: show the class balance, then encode Yes/No as 1/0.
# The counts are printed explicitly because a notebook cell only auto-displays
# its last expression, and here the encoding statement comes after them.
print(df['Churn'].value_counts())

# Values other than 'Yes'/'No' would map to NaN and make astype('int') raise.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0}).astype('int')
  


In [None]:
# Checking for duplicates: number of rows that DataFrame.duplicated() flags
df.duplicated().sum()

In [None]:
# Checking for nulls: True if any cell in the DataFrame is missing
df.isna().to_numpy().any()

In [None]:
# Columns treated as unnecessary for the analysis
cols_to_drop = [
    'CaseOrder', 'Customer_id', 'Interaction', 'UID',
    'Zip', 'Lat', 'Lng', 'TimeZone', 'City', 'State', 'County',
    'Job', 'Population',
]
df = df.drop(columns=cols_to_drop)

# Giving the eight survey item columns descriptive names
df = df.rename(
    columns={
        'Item1': 'TimelyResponse',
        'Item2': 'TimelyFixes',
        'Item3': 'TimelyReplacements',
        'Item4': 'Reliability',
        'Item5': 'Options',
        'Item6': 'RespectfulResponse',
        'Item7': 'CourteousExchange',
        'Item8': 'ActiveListening',
    }
)

# Confirming that the columns were dropped and the survey columns renamed
df.info()

In [None]:
# Names of the object-dtype columns, which are treated as categorical variables
cat_vars = df.select_dtypes(include='object').columns.tolist()
cat_vars

In [None]:
# Listing the remaining variables: every column name that is not in cat_vars
for x in df.columns:
    if x in cat_vars:
        continue
    print(x)

In [None]:
# Converting every column listed in cat_vars to the pandas 'category' dtype
df = df.astype(dict.fromkeys(cat_vars, 'category'))

In [None]:
# Renaming categories to labels without spaces or punctuation for easier
# processing (the labels later feed into column names via get_dummies)
df['Marital'] = df['Marital'].cat.rename_categories(
    {
        "Divorced": "Divorced",
        "Widowed": "Widowed",
        "Separated": "Separated",
        "Never Married": "NeverMarried",
        "Married": "Married",
    }
)
df['Contract'] = df['Contract'].cat.rename_categories(
    {
        "Month-to-month": "month_to_month",
        "Two Year": "TwoYear",
        "One year": "OneYear",
    }
)
df['InternetService'] = df['InternetService'].cat.rename_categories(
    {
        "Fiber Optic": "Fiber",
        "DSL": "DSL",
        "None": "NoService",
    }
)
df['PaymentMethod'] = df['PaymentMethod'].cat.rename_categories(
    {
        "Electronic Check": "ElectronicCheck",
        "Mailed Check": "MailedCheck",
        "Bank Transfer(automatic)": "BankTransfer",
        "Credit Card (automatic)": "CreditCard",
    }
)

In [None]:
# Tenure and Bandwidth_GB_Year were highly correlated in a previous analysis,
# so only one of the two is kept: Bandwidth_GB_Year is dropped here.
df = df.drop(columns='Bandwidth_GB_Year')

## Saving the cleaned data set

In [None]:

# Making a copy of the prepared dataset; churn_df is the copy meant for export/submission

churn_df =  df.copy()

In [None]:
# cleaned and prepared data
#churn_df.to_csv('cleaned_prepared_churn2.csv')

## Test Train Split and Scaling

In [None]:
# Separating the target from the predictors.
# pop() removes 'Churn' from churn_df in place, so churn_df itself no longer
# holds the target column after this cell (and re-running the cell raises
# KeyError).
y = churn_df.pop('Churn')
# One-hot encoding the categorical columns; drop_first=True drops the first
# level of each to avoid redundant dummy columns
X = pd.get_dummies(churn_df, drop_first=True)

In [None]:
# Splitting the data: 70% train / 30% test, stratified on the target so both
# parts keep the same class proportions; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
#training_set = pd.concat([X_train, y_train], axis=1)
#test_set = pd.concat([X_test, y_test], axis=1)

#training_set.to_csv('training_set2.csv', index=False)
#test_set.to_csv('test_set2.csv', index=False)