# LAB 8.6 CROSS VALIDATION
### Aída Moure

In [175]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns 
import statsmodels.api as sm


from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [176]:
churnData = pd.read_csv('Customer-Churn.txt') 

In [177]:
churnData.shape

(7043, 16)

In [178]:
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [179]:
churnData.info() #no nulls? we maybe have blanks

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [180]:
#Total charges has missing data (blanks) so let's deal with that:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce') #pass the variable to numeric
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))

In [181]:
churnData.TotalCharges.value_counts() #ok

TotalCharges
2283.300441    11
20.200000      11
19.750000       9
20.050000       8
19.900000       8
               ..
6849.400000     1
692.350000      1
130.150000      1
3211.900000     1
6844.500000     1
Name: count, Length: 6531, dtype: int64

In [182]:
y=churnData.Churn

In [183]:
X = churnData.drop(['Churn'], axis=1)

In [184]:
y.replace({'Yes':1}, inplace=True)
y.replace({'No':0}, inplace=True)


In [185]:
y.value_counts() #quite imbalanced

Churn
0    5174
1    1869
Name: count, dtype: int64

In [186]:
#SMOTE only works with numericals, so we need to transform categoricals into dummies

In [187]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(object)

In [188]:
numericalX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7043 non-null   int64  
 1   tenure          7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
 3   TotalCharges    7043 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 220.2 KB


In [189]:
categoricalX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            7043 non-null   object
 1   Partner           7043 non-null   object
 2   Dependents        7043 non-null   object
 3   PhoneService      7043 non-null   object
 4   OnlineSecurity    7043 non-null   object
 5   OnlineBackup      7043 non-null   object
 6   DeviceProtection  7043 non-null   object
 7   TechSupport       7043 non-null   object
 8   StreamingTV       7043 non-null   object
 9   StreamingMovies   7043 non-null   object
 10  Contract          7043 non-null   object
dtypes: object(11)
memory usage: 605.4+ KB


In [190]:
X.shape

(7043, 15)

In [191]:
categoricalX.shape

(7043, 11)

In [192]:
numericalX.shape

(7043, 4)

In [193]:
#transform the categoricals into dummies (important, they must keep the variable name)

In [194]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical,columns=encoder.get_feature_names_out())
X = pd.concat([numericalX, encoded_categorical], axis = 1)

In [195]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year
0,0,1,29.85,29.85,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,34,56.95,1889.5,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,2,53.85,108.15,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,45,42.3,1840.75,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,2,70.7,151.65,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [196]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SeniorCitizen                         7043 non-null   int64  
 1   tenure                                7043 non-null   int64  
 2   MonthlyCharges                        7043 non-null   float64
 3   TotalCharges                          7043 non-null   float64
 4   gender_Male                           7043 non-null   float64
 5   Partner_Yes                           7043 non-null   float64
 6   Dependents_Yes                        7043 non-null   float64
 7   PhoneService_Yes                      7043 non-null   float64
 8   OnlineSecurity_No internet service    7043 non-null   float64
 9   OnlineSecurity_Yes                    7043 non-null   float64
 10  OnlineBackup_No internet service      7043 non-null   float64
 11  OnlineBackup_Yes 

In [197]:
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [198]:
y.info() #we have the same N for all variables, so ok to concat without missing values

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: Churn
Non-Null Count  Dtype
--------------  -----
7043 non-null   int64
dtypes: int64(1)
memory usage: 55.2 KB


In [199]:
#Let's transform the data first (we use StandardScale in this case)
from sklearn.preprocessing import StandardScaler 
transformer = StandardScaler().fit(X) 
X = transformer.transform(X) 

## Applying SMOTE for upsampling

In [200]:
##apply SMOTE for upsampling the data:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [201]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()
#ok, now they are balanced

Churn
0    5174
1    5174
Name: count, dtype: int64

In [202]:
#train, test, split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

### * Logistic Regression

In [203]:
from sklearn.linear_model import LogisticRegression

In [204]:
model=LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [205]:
model.score(X_train, y_train)

0.7611132586006958

In [206]:
model.score(X_test, y_test)

0.7812137611132586

### * Decision Tree Classifier

In [207]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [208]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [209]:
model.score(X_train, y_train)

0.998067259373792

In [210]:
model.score(X_test, y_test) #good score, but not so good as the train

0.7711635098569772

The decision tree model fits the model too well, when we use the test data it fits not so well. This means the model is not very robust. Also, the score is too high in the train data for the tree model so probably we are overfitting the data here? maybe too many variables...

I would choose the Logit model because it gives a more reasonable score. Also the model with the train and the test data show similar score. 

### * Doing the cross-validation

In [211]:
tree_model = DecisionTreeClassifier()

logit_model = LogisticRegression()

In [212]:
from sklearn.model_selection import cross_val_score

In [213]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Classifier Tree': 0.7687164152370338, 'Logistic Regression': 0.7600815653650705}


In [214]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    var_score = np.var(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = var_score
    i = i+1
print(scores)

{'Classifier Tree': 0.0001575743203804276, 'Logistic Regression': 6.676976319126908e-05}


By looking at the means, both in the Tree and the Logit the scores look very similar. The logit is slightly worse.
When looking at the variance, the logit regression is better as it is smaller than in the decision tree. 

I would stick to the logit regression as it is more robust (if I run the tree model again this might change); anyway, the differences are too small between the two models.

## Applying Tomeklinks for downsampling

In [215]:
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [216]:
from imblearn.under_sampling import TomekLinks

In [217]:
tl = TomekLinks()

In [218]:
X_tl, y_tl = tl.fit_resample(X,y)

In [219]:
y_tl.value_counts()

Churn
0    4600
1    1869
Name: count, dtype: int64

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.25, random_state=42)

In [221]:
model=LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [222]:
model.score(X_train, y_train)

0.8192125334982477

In [223]:
model.score(X_test, y_test)

0.7972805933250927

### * Decision Tree Classifier

In [224]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [225]:
model.score(X_train, y_train)

0.9956709956709957

In [226]:
model.score(X_test, y_test)

0.7558714462299134

The decision tree model fits the model too well again, when we use the test data it fits not so well. This means the model is not very robust. Also, the score is too high in the train data for the tree model so probably we are overfitting the data here? maybe too many variables...

I would choose the Logit model because it gives a more reasonable score. Also the model with the train and the test data show similar score. 

In [227]:
tree_model = DecisionTreeClassifier()

logit_model = LogisticRegression()

In [228]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Classifier Tree': 0.7586110050485766, 'Logistic Regression': 0.8177705655254337}


In [229]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    var_score = np.var(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = var_score
    i = i+1
print(scores)

{'Classifier Tree': 0.0004962243195632806, 'Logistic Regression': 8.734526148476624e-05}


By looking at the means in this case the logit is better.
When looking at the variance, the logit regression is also better as it is smaller than in the decision tree. 

Therefore, the logit regression fits better the data in this case.

Also, I would stick to the upsampling method with SMOTE. It works with a bigger and more balanced dataset, hence more robust.

In [230]:
#if we run the models again...

In [231]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Classifier Tree': 0.7612888719188833, 'Logistic Regression': 0.8177705655254337}


In [232]:
model_pipeline = [tree_model, logit_model]
model_names = ['Classifier Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    var_score = np.var(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = var_score
    i = i+1
print(scores)

{'Classifier Tree': 0.00028459737438895364, 'Logistic Regression': 8.734526148476624e-05}


We can see by running the models again that the decision tree scores slightly change, while the logit keep static. This is because each time we run a tree, it varies. Hence, it is a less robust model (this could be partially solved by creating a random forest).