In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

In [2]:
df = pd.read_csv('Resources/Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df.shape

In [None]:
##### refer to https://sparkbyexamples.com/pandas/pandas-dataframe-query-examples

# Look for cells that hold a single blank (' ') in any column. Such values are
# strings, so isnull() does not report them.
value = ' '

for col in df.columns.to_list():
    # A boolean mask works for every column label (query() needs back-ticks
    # for labels that are not valid identifiers) and is evaluated only once.
    blank_rows = df[df[col] == value]

    if len(blank_rows) > 0:
        print(len(blank_rows))
        print(blank_rows)

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.info()

Fixing the TotalCharges column.  There were no nulls but there were empty values preventing conversion to float.

In [None]:
# Blank strings (' ') mark missing charges. Turn them into NaN *before*
# converting, so they are ignored when the mean is computed; substituting 0
# first would drag the mean down and would also overwrite genuine zeros later.
total_charges = df['TotalCharges'].mask(df['TotalCharges'] == ' ').astype(float)

# Impute the missing entries with the mean of the observed values only
mean_value = total_charges.mean()
df['TotalCharges'] = total_charges.fillna(mean_value)

In [None]:
df.info()

In [None]:
# customerID is dropped before exploration and modelling.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df = df.drop(columns='customerID', errors='ignore')

# Data Exploration

In [None]:
def plot_churn(df, feature, target='Churn'):
    """Draw a count plot of ``feature`` with one bar per ``target`` value.

    Parameters
    ----------
    df : DataFrame holding both ``feature`` and ``target`` columns.
    feature : column shown on the x axis.
    target : column used for the bar colour (hue); defaults to 'Churn'.

    The figure is displayed immediately; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=target, ax=ax)

    ax.set_title(f'Churn by {feature}', fontsize=15)
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)

    # Tilt the category labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

Gender does not seem to play a role in churn.

In [None]:
# call the function
plot_churn(df, 'gender')

Customers without a partner are more likely to churn

In [None]:
plot_churn(df,'Partner')

Customers with no dependents are more likely to churn

In [None]:
plot_churn(df,'Dependents')

Month-to-month contracts are MUCH more likely to churn

In [None]:
plot_churn(df,'Contract')

Customers without device protection are more likely to churn

In [None]:
plot_churn(df,'DeviceProtection')

Customers with paperless billing are more likely to churn, but they are also represented about twice as often in the data.

In [None]:
plot_churn(df,'PaperlessBilling')

In [None]:
df['PaperlessBilling'].value_counts()

In [None]:
plot_churn(df,'InternetService')

In [None]:
df['InternetService'].value_counts()

The data is imbalanced, so we'll attempt to balance it.

In [None]:
df['Churn'].value_counts()

In [None]:
# Target column on its own, every remaining column as a predictor
y = df['Churn']
X = df.drop(columns=['Churn'])

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampling with a fixed seed so the resampled frame is repeatable.
# NOTE(review): the resampled frame contains repeated minority-class rows, so
# copies of the same customer must not be shared between training and
# evaluation data; use it for inspecting class balance only.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [None]:
y_resampled.value_counts()

Split the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the ORIGINAL (un-resampled) data first, so the test set holds only
# customers the model has never seen, at the real-world churn ratio.
# stratify=y keeps that ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training partition only. Oversampling before the
# split would put duplicates of the same row in both train and test, which
# leaks information and inflates every test metric.
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
X_train

## Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Columns standardised in this step
numeric_cols = ['MonthlyCharges', 'TotalCharges']

# Work on explicit copies so the column assignments below never write into a
# view of another frame (this also avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean / standard deviation from the training rows only, then apply the
# same transform to both partitions.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## Encode Categorical Data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
df.columns

In [None]:
# Columns that are one-hot encoded below, in the order they are passed to the encoder
categories = [
    # customer profile
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    # phone / internet services
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    # contract and billing
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

In [None]:
# handle_unknown='ignore': a category that appears only outside the training
# rows is encoded as all zeros instead of making transform() raise.
encoder = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Learn the category vocabulary from the training rows and encode them in one pass
categorical_train_data = X_train.loc[:, categories]
encoded_train_data = encoder.fit_transform(categorical_train_data).toarray()

In [None]:
# Encode the test rows with the encoder that was fitted on the training rows
categorical_test_data = X_test.loc[:, categories]
encoded_test_data = encoder.transform(categorical_test_data).toarray()

In [None]:
# Wrap the encoded arrays in DataFrames that share the row index of their
# source partition, so they line up when concatenated later.
encoded_columns = encoder.get_feature_names_out(categories)
encoded_train_df = pd.DataFrame(data=encoded_train_data, index=X_train.index, columns=encoded_columns)
encoded_test_df = pd.DataFrame(data=encoded_test_data, index=X_test.index, columns=encoded_columns)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts
remaining_train, remaining_test = (part.drop(columns=categories) for part in (X_train, X_test))
X_train_encoded = pd.concat((remaining_train, encoded_train_df), axis='columns')
X_test_encoded = pd.concat((remaining_test, encoded_test_df), axis='columns')

In [None]:
X_train_encoded.columns

## Base Model Testing

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# (display name, unfitted estimator) pairs used as untuned baselines.
# The forest gets a fixed seed so its accuracy is the same on every run.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier(random_state=42))
]

In [None]:
def test_models(models, X_train, y_train, X_test, y_test):
    """Fit each model on the training data and print its test accuracy.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs; every estimator is fitted
        in place.
    X_train, y_train : data the estimators are fitted on.
    X_test, y_test : data the accuracy is measured on.

    Prints one line per model; returns nothing.
    """
    for label, estimator in models:
        estimator.fit(X_train, y_train)
        score = accuracy_score(y_test, estimator.predict(X_test))
        print(f'{label} Accuracy: {score * 100:.2f}%')

In [None]:
test_models(models, X_train_encoded, y_train, X_test_encoded, y_test)

## Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

def best_parameters(model, params, X, y, cv=5):
    """Run a cross-validated grid search and report the winning setting.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid handed to GridSearchCV unchanged.
    X, y : data the search is fitted on.
    cv : cross-validation argument forwarded to GridSearchCV (default 5).

    Returns
    -------
    The ``best_params_`` of the fitted search. The best parameters and the
    best score are also printed.
    """
    search = GridSearchCV(model, params, cv=cv)
    search.fit(X, y)

    model_label = str(model)
    winning_params = search.best_params_
    print("Best parameters for ", model_label, " are ", winning_params)
    print("Best score for ", model_label, " is ", search.best_score_)
    return winning_params

Logistic Regression tuning
- The tuned model performed worse than the base model

In [None]:
# Define the model and parameters
model = LogisticRegression(max_iter=1000)

# Not every solver supports every penalty: 'newton-cg', 'lbfgs' and 'sag'
# accept only L2, while 'liblinear' and 'saga' also accept L1. One flat grid
# would schedule invalid combinations whose fits all fail, so the search space
# is split into two valid sub-grids.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
params = [
    {'C': c_values,
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'C': c_values,
     'penalty': ['l1'],
     'solver': ['liblinear', 'saga']},
]

# Get best parameters
best_LR_params = best_parameters(model, params, X_train_encoded, y_train)

SVC Tuning
- I tried multiple different parameters and let it run for an hour each time but it never finished.
- After some research, I found out this model is known for having a long training time

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform

# model = SVC()
# # Specifying the parameters. The difference here is that we provide a distribution for continuous parameters rather than a list of specific values.
# params = {'C': uniform(loc=0, scale=1000), 
#           'gamma': uniform(loc=0, scale=1),
#           'kernel': ['linear', 'rbf']}

# # Instantiate the RandomizedSearchCV object
# rscv = RandomizedSearchCV(model, params, n_iter=100, cv=5, random_state=42)

# # This will start the search over the specified parameter distributions
# rscv.fit(X_train_encoded, y_train)

# # Get the best parameters and the best score
# best_svc_params = rscv.best_params_
# best_svc_score = rscv.best_score_

# print("Best parameters for SVC are ", best_svc_params)
# print("Best score for SVC is ", best_svc_score)


Random Forest Tuning

In [None]:
# Fixed seed so repeated searches rank the candidates the same way
model = RandomForestClassifier(random_state=42)
params = {
    'n_estimators': [100],  # 100 is a good general starting point
    # 'auto' was dropped from the grid: for classifiers it was only an alias
    # of 'sqrt', and scikit-learn >= 1.3 rejects it.
    'max_features': ['sqrt', 0.5],  # Lower values can help to decrease complexity
    'max_depth': [10, 20, 30],  # Lower values can also decrease complexity
    'min_samples_split': [5, 10, 20],  # Higher values lead to more regularization
    'min_samples_leaf': [5, 10, 20],  # Higher values also lead to more regularization
    'bootstrap': [True]  # True can lead to a more diverse set of trees
}

# Get best parameters
best_forest_params = best_parameters(model, params, X_train_encoded, y_train)

## Testing with updated parameters

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

Random Forest Test

In [None]:
# NOTE(review): these values are not part of the grid searched above;
# presumably they come from an earlier search run. Confirm.
# max_features='sqrt' is what the legacy 'auto' alias meant for classifiers;
# 'auto' itself is rejected by scikit-learn >= 1.3.
# random_state makes the reported metrics repeatable.
model = RandomForestClassifier(bootstrap=False, max_depth=30, max_features='sqrt',
                               min_samples_leaf=1, min_samples_split=2, n_estimators=10,
                               random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions
train_preds = model.predict(X_train_encoded)
test_preds = model.predict(X_test_encoded)

# (label, features, true labels, predicted labels) for each partition
evaluation_sets = [
    ("Training", X_train_encoded, y_train, train_preds),
    ("Test", X_test_encoded, y_test, test_preds),
]

# Print classification reports
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred))

# Print confusion matrices
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Print AUC-ROC scores, using the predicted probability of the second class
# in model.classes_
for label, features, y_true, _ in evaluation_sets:
    print(f"{label} AUC-ROC Score:")
    print(roc_auc_score(y_true, model.predict_proba(features)[:, 1]))

Logistic Regression Test

In [None]:
# 'sag' shuffles the data while optimising, so a fixed seed is needed for
# repeatable coefficients and metrics.
model = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='sag',
                           random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions
train_preds = model.predict(X_train_encoded)
test_preds = model.predict(X_test_encoded)

# (label, features, true labels, predicted labels) for each partition
evaluation_sets = [
    ("Training", X_train_encoded, y_train, train_preds),
    ("Test", X_test_encoded, y_test, test_preds),
]

# Print classification reports
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred))

# Print confusion matrices
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Print AUC-ROC scores, using the predicted probability of the second class
# in model.classes_
for label, features, y_true, _ in evaluation_sets:
    print(f"{label} AUC-ROC Score:")
    print(roc_auc_score(y_true, model.predict_proba(features)[:, 1]))

# Feature Engineering
- Tenure was not scaled for the previous models; these tests scale tenure like the other numerical features.

In [None]:
scaler = StandardScaler()

In [None]:
# Fit on the training data
scaler.fit(X_train_encoded[['tenure']])

# Transform the 'Tenure' column in both the training and test sets
X_train_encoded['tenure'] = scaler.transform(X_train_encoded[['tenure']])
X_test_encoded['tenure'] = scaler.transform(X_test_encoded[['tenure']])

Testing the base models with the scaled 'tenure' feature

In [None]:
test_models(models, X_train_encoded, y_train, X_test_encoded, y_test)

Testing with optimized models

Random Forest
- As nice as a perfect model looks, these parameters might be overfitting.

In [None]:
# Same forest settings as the earlier Random Forest test, now on the data with
# scaled 'tenure'.
# max_features='sqrt' is what the legacy 'auto' alias meant for classifiers;
# 'auto' itself is rejected by scikit-learn >= 1.3.
# random_state makes the reported metrics repeatable.
model = RandomForestClassifier(bootstrap=False, max_depth=30, max_features='sqrt',
                               min_samples_leaf=1, min_samples_split=2, n_estimators=10,
                               random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions
train_preds = model.predict(X_train_encoded)
test_preds = model.predict(X_test_encoded)

# (label, features, true labels, predicted labels) for each partition
evaluation_sets = [
    ("Training", X_train_encoded, y_train, train_preds),
    ("Test", X_test_encoded, y_test, test_preds),
]

# Print classification reports
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred))

# Print confusion matrices
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Print AUC-ROC scores, using the predicted probability of the second class
# in model.classes_
for label, features, y_true, _ in evaluation_sets:
    print(f"{label} AUC-ROC Score:")
    print(roc_auc_score(y_true, model.predict_proba(features)[:, 1]))

Random Forest Model V2

In [None]:
# Best model with new params = 'bootstrap': True, 'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100
# random_state fixes the bootstrap samples and feature draws so the metrics
# below are repeatable.
model = RandomForestClassifier(bootstrap=True, max_depth=20, max_features=0.5,
                               min_samples_leaf=5, min_samples_split=10, n_estimators=100,
                               random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions
train_preds = model.predict(X_train_encoded)
test_preds = model.predict(X_test_encoded)

# (label, features, true labels, predicted labels) for each partition
evaluation_sets = [
    ("Training", X_train_encoded, y_train, train_preds),
    ("Test", X_test_encoded, y_test, test_preds),
]

# Print classification reports
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred))

# Print confusion matrices
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Print AUC-ROC scores, using the predicted probability of the second class
# in model.classes_
for label, features, y_true, _ in evaluation_sets:
    print(f"{label} AUC-ROC Score:")
    print(roc_auc_score(y_true, model.predict_proba(features)[:, 1]))

Logistic Regression

In [None]:
# 'sag' shuffles the data while optimising, so a fixed seed is needed for
# repeatable coefficients and metrics.
model = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='sag',
                           random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions
train_preds = model.predict(X_train_encoded)
test_preds = model.predict(X_test_encoded)

# (label, features, true labels, predicted labels) for each partition
evaluation_sets = [
    ("Training", X_train_encoded, y_train, train_preds),
    ("Test", X_test_encoded, y_test, test_preds),
]

# Print classification reports
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_pred))

# Print confusion matrices
for label, _, y_true, y_pred in evaluation_sets:
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Print AUC-ROC scores, using the predicted probability of the second class
# in model.classes_
for label, features, y_true, _ in evaluation_sets:
    print(f"{label} AUC-ROC Score:")
    print(roc_auc_score(y_true, model.predict_proba(features)[:, 1]))

## Model Test: Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression(solver='lbfgs', random_state=1,max_iter=500)

In [None]:
# classifier.fit(X_train_encoded, y_train)

The score isn't bad, but let's see if we can do better.

In [None]:
# # Score the model
# print(f"Training Data Score: {classifier.score(X_train_encoded, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test_encoded, y_test)}")

## Logistic Regression V2