In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score

churnData = pd.read_csv('customer-churn.csv')

In [2]:
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [None]:
For this lab, we will build a model on customer churn binary classification problem. You will be using files_for_lab/Customer-Churn.csv file.

Instructions
Apply SMOTE for upsampling the data

Use logistic regression to fit the model and compute the accuracy of the model.
Use decision tree classifier to fit the model and compute the accuracy of the model.
Compare the accuracies of the two models.


Apply TomekLinks for downsampling

It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
Use logistic regression to fit the model and compute the accuracy of the model.
Use decision tree classifier to fit the model and compute the accuracy of the model.
Compare the accuracies of the two models.
You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [3]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
numericData = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]

# Build the logistic regression model. - SMOTE

## SMOTE

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
smote = SMOTE()
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X_scaled = transformer.transform(X)
y = churnData['Churn']
X_sm, y_sm = smote.fit_sample(X_scaled, y)
pd.DataFrame(y_sm).value_counts()

Churn
Yes      5174
No       5174
dtype: int64

In [7]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_sm, y_sm)

print("The accuracy of the model is: ",round(classification.score(X_sm, y_sm),2))

The accuracy of the model is:  0.74


## Decision tree

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

In [9]:
model = DecisionTreeClassifier()
model.fit(X_sm, y_sm)
print("The accuracy of the model is: {:4.2f}".format(classification.score(X_sm, y_sm)))

The accuracy of the model is: 0.74


# TOMEK

In [10]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X_scaled = transformer.transform(X)
y = churnData['Churn']
X_tl, y_tl = tl.fit_sample(X_scaled, y)
pd.DataFrame(y_tl).value_counts()

Churn
No       4694
Yes      1869
dtype: int64

## Classification

In [11]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_tl, y_tl)

print("The accuracy of the model is: ",round(classification.score(X_tl, y_tl),2))

The accuracy of the model is:  0.79


## Decision tree

In [12]:
model = DecisionTreeClassifier()
model.fit(X_tl, y_tl)
print("The accuracy of the model is: {:4.2f}".format(classification.score(X_tl, y_tl)))

The accuracy of the model is: 0.79


In [13]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from scipy.stats import t, norm

In [14]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce') 
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))

from sklearn.preprocessing import StandardScaler

smote = SMOTE()

X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']] 
transformer = StandardScaler().fit(X)
X = transformer.transform(X) 
y = churnData['Churn'] 
X_sm, y_sm = smote.fit_sample(X, y) 
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [15]:
model1 = DecisionTreeClassifier()
model2 = LogisticRegression()

In [16]:
model_pipeline = [model1, model2]
model_names = ['Regression Tree','Logistic Regression']


def confidence_intervals(model_pipeline, model_names, X_train, y_train, alpha = 0.05, K = 10):
# We set the significance level
#alpha = 0.05
#K = 10
    scores = {}
    i=0
    for model in model_pipeline:
        mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=K))
        if (K < 30):
            # t.ppf(area) gives us the critical value corresponding to the area for the t-student distribution.
            t_critical = abs(t.ppf(1-alpha/2, K-1)) 
            interval = t_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        else:
            # norm.ppf(area) gives us the critical value corresponding to the area for the normal distribution
            z_critical = abs(norm.ppf(1-alpha/2)) 
            interval = z_critical*(np.std(cross_val_score(model, X_train, y_train, cv=K))/np.sqrt(K))
        scores[model_names[i]] = [mean_score, mean_score - interval, mean_score + interval]
        print("The rmse of the {} model is (CV witk K={}) = {:4.2f} +/- {:4.2f}".format(model_names[i], K, mean_score, interval))
        i = i+1

confidence_intervals(model_pipeline, model_names, X_sm, y_sm, 0.05, 5)

The rmse of the Regression Tree model is (CV witk K=5) = 0.77 +/- 0.04
The rmse of the Logistic Regression model is (CV witk K=5) = 0.73 +/- 0.01


# Afternoon lab

In [17]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [None]:
#Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
#Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [21]:
y = target['TARGET_B']
X = numerical

categoricalX = categorical.select_dtypes(np.object)

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numerical, encoded_categorical], axis = 1)

In [23]:
smote = SMOTE()

transformer = StandardScaler().fit(X)
X = transformer.transform(X) 
y = target['TARGET_B']
X_sm, y_sm = smote.fit_sample(X, y) 
y_sm.value_counts()

1    90569
0    90569
Name: TARGET_B, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [27]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = y_train
y_test_regression = y_test

# Now we can remove the column target d from the set of features 
#X_train = X_train.drop(['TARGET_D'], axis = 1)
#X_test = X_test.drop(['TARGET_D'], axis = 1)

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print("The accuracy of the Random forest is: {:4.2f}".format(clf.score(X_test, y_test)))
print()

alpha = 0.05
K = 10
# For cross validation
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=K)

if (K < 30):
    t_critical = abs(t.ppf(1-alpha/2, K-1))
    interval = t_critical*(np.std(cross_val_score(clf, X_train, y_train, cv=10))/np.sqrt(K))
else:
    z_critical = abs(norm.ppf(1-alpha/2))
    interval = z_critical*(np.std(cross_val_score(clf, X_train, y_train, cv=10))/np.sqrt(K)) 
print("The accuracy of the Random Forest model (CV witk K={}) is: {:4.2f} +/- {:4.2f}".format(K,np.mean(cross_val_scores),interval))

The accuracy of the Random forest is: 0.95

The accuracy of the Random Forest model (CV witk K=10) is: 0.95 +/- 0.00
