# Lab | Handling Imbalanced Data / Cross-Validation

 - apply K-fold cross validation on your models before and check the model score. Note: So far we have not balanced the data.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

### Read the DataFrame

In [2]:
df = pd.read_csv("file:///Users/brunasantos/Documents/GitHub/Labs_Bruna/Week%207/DATA_Customer-Churn.csv")

In [3]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


### Selecting columns from the original DataFrame

In [4]:
selected_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
df = df[selected_columns].copy()

In [5]:
# Displaying the new DataFrame
df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,29.85,29.85,No
1,0,34,56.95,1889.5,No
2,0,2,53.85,108.15,Yes
3,0,45,42.30,1840.75,No
4,0,2,70.70,151.65,Yes
...,...,...,...,...,...
7038,0,24,84.80,1990.5,No
7039,0,72,103.20,7362.9,No
7040,0,11,29.60,346.45,No
7041,1,4,74.40,306.6,Yes


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7043 non-null   int64  
 1   tenure          7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
 3   TotalCharges    7043 non-null   object 
 4   Churn           7043 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 275.2+ KB


### Check the datatypes of all the columns and convert TotalCharges to numeric

In [7]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [8]:
meantc = df['TotalCharges'].mean()
df['TotalCharges'] = df['TotalCharges'].fillna(meantc)

In [9]:
df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,29.85,29.85,No
1,0,34,56.95,1889.50,No
2,0,2,53.85,108.15,Yes
3,0,45,42.30,1840.75,No
4,0,2,70.70,151.65,Yes
...,...,...,...,...,...
7038,0,24,84.80,1990.50,No
7039,0,72,103.20,7362.90,No
7040,0,11,29.60,346.45,No
7041,1,4,74.40,306.60,Yes


In [10]:
df["Churn"] = df["Churn"].apply(lambda x: x.replace('No', '0') if type(x) == str else x)
df["Churn"] = df["Churn"].apply(lambda x: x.replace('Yes', '1') if type(x) == str else x)

In [11]:
df['Churn']=df['Churn'].astype(int) 

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7043 non-null   int64  
 1   tenure          7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
 3   TotalCharges    7043 non-null   float64
 4   Churn           7043 non-null   int64  
dtypes: float64(2), int64(3)
memory usage: 275.2 KB


### Split X-y

In [13]:
X = df[["tenure","SeniorCitizen","MonthlyCharges","TotalCharges"]]
y = df[['Churn']]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=11)

### Balacing the Data

In [15]:
y.value_counts()

Churn
0        5174
1        1869
dtype: int64

In [16]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [17]:
conda install -c conda-forge imbalanced-learn

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.5.1

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.5.1



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [18]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

Churn
0        5174
1        5174
dtype: int64

### Scaling the Data

In [35]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

In [36]:
X_scaler_train= scaler.transform(X_train)

X_scaler_test= scaler.transform(X_test)

### Decision Tree Classification

In [37]:
TT_SPLIT = 0.2     # ratio train/test size
RAND_STATE = 42

In [38]:
model = DecisionTreeClassifier(max_depth =5)
model.fit(X_train, y_train)
print("The (mean) accuracy on the test set is %.2f" %(model.score(X_test, y_test)))
print("The (mean) accuracy on the train data is %.2f" %(model.score(X_train, y_train)))
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

The (mean) accuracy on the test set is 0.77
The (mean) accuracy on the train data is 0.80
X has 7043 rows and 4 columns
y has 7043 rows


In [39]:
scores=cross_val_score(model, X_train, y_train, cv=5)
print("Cross validation scores: ", scores)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Cross validation scores:  [0.76841171 0.79414374 0.7985803  0.77196096 0.78774423]
Score stats: 0.78 accuracy with a standard deviation of 0.01


In [40]:
print("Five-fold cv results: \n %0.2f mean accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Five-fold cv results: 
 0.78 mean accuracy with a standard deviation of 0.01


In [41]:
y_test

Unnamed: 0,Churn
1634,0
6067,0
1679,0
1266,0
4787,0
...,...
1424,0
4276,1
3429,0
5557,0


In [42]:
y_pred = cross_val_predict(model, X_test, y_test, cv=5)
y_pred

array([0, 0, 0, ..., 0, 1, 1])

In [43]:
model1 = DecisionTreeClassifier(max_depth=5)
model2 = LogisticRegression()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Classification Tree': 0.7841681888602225, 'Logistic Regression': 0.792155095106233, 'KNN': 0.767484527211147}


In [None]:
model1 = DecisionTreeClassifier(max_depth=5)
model2 = LogisticRegression()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_smote_train, y_smote_train, scoring="recall", cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

In [44]:
print("Comparing the 3 regression scores we find \n")

pd.DataFrame([scores], index=["score"])

Comparing the 3 regression scores we find 



Unnamed: 0,Classification Tree,Logistic Regression,KNN
score,0.784168,0.792155,0.767485


In [45]:
y_train.value_counts()

Churn
0        4164
1        1470
dtype: int64