# Lab | Classification, Handling Imbalanced Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.datasets import load_iris

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

import warnings
warnings.filterwarnings('ignore')

## Round 1
- Import the required libraries and modules that you would need.
- Read that data into Python and call the dataframe `churnData`.
- Check the datatypes of all the columns in the data. You would see that the column `TotalCharges` is object type. Convert this column into numeric type using `pd.to_numeric` function.
- Check for null values in the dataframe. Replace the null values.

In [2]:
df = pd.read_csv('Data/DATA_Customer-Churn.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [4]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

In [5]:
df = df.fillna(0)

In [6]:
num_df = df.select_dtypes(include=['int64', 'float64'])
cat_df = df.select_dtypes(include=['object'])

In [7]:
cat_df = cat_df.drop(columns='Churn')

In [8]:
cat_df_dummified = pd.get_dummies(cat_df, drop_first=True)

In [9]:
X = pd.concat([num_df, cat_df_dummified], axis=1)

In [10]:
y = df.Churn

## Split the data into a training set and a test set.

In [11]:
TT_SPLIT = 0.2     # ratio train/test size
RAND_STATE = 123   # specifies a sampling for repeatable results

In [12]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=TT_SPLIT,random_state=RAND_STATE) # splitting the data into train and test sets

## Scale the features either by using normalizer or a standard scaler.


In [13]:
scaler = StandardScaler() 

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

## Comparing 3 classification models BEFORE smote


In [14]:
model1 = DecisionTreeClassifier(max_depth=15)
model2 = KNeighborsClassifier(n_neighbors=5,weights='uniform')
model3 = LogisticRegression()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree', 'K Neighbors', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train_scaled, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Decision Tree': 0.7358902507639862, 'K Neighbors': 0.764819913601397, 'Logistic Regression': 0.7976570564900607}


## Upsampling using SMOTE

In [15]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [16]:
y_train.value_counts()

No     4159
Yes    1475
Name: Churn, dtype: int64

In [17]:
X_sm, y_sm = smote.fit_resample(X_train_scaled, y_train)
y_sm.value_counts()

Yes    4159
No     4159
Name: Churn, dtype: int64

## Comparing 3 classification models AFTER smote

In [18]:
model1 = DecisionTreeClassifier(max_depth=15)
model2 = KNeighborsClassifier(n_neighbors=5,weights='uniform')
model3 = LogisticRegression()

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree', 'K Neighbors', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_sm, y_sm, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Decision Tree': 0.7672576784310098, 'K Neighbors': 0.7856480410749804, 'Logistic Regression': 0.7650877844719923}


## Random Forest

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100],
    'min_samples_split': [20],
    'min_samples_leaf' : [10],
    'max_features': ['sqrt']
    ##'max_samples' : ['None', 0.5],
    ##'max_depth':[3,5,10],
    ## 'bootstrap':[True,False]
    }
clf = RandomForestClassifier(random_state=RAND_STATE)

In [27]:
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1,)

In [31]:
grid_search.fit(X_sm, y_sm.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
             n_jobs=-1,
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [10],
                         'min_samples_split': [20], 'n_estimators': [50, 100]},
             return_train_score=True)

In [32]:
best_params = grid_search.best_params_ #To check the best set of parameters returned
best_params

{'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 20,
 'n_estimators': 100}

In [33]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=RAND_STATE, **best_params)
cross_val_scores = cross_val_score(clf, X_sm, y_sm, cv=5)
print(np.mean(cross_val_scores))

0.8046444244645914
