Load the customer data into a pandas DataFrame.


In [2]:
df = pd.read_excel('/content/drive/MyDrive/Customer_data.xlsx')

Display the first few rows to understand its structure and identify potential issues like missing values and data types.



In [3]:
display(df.head())
display(df.info())

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


None

Handling missing values



In [4]:
# Check for missing values in the 'TotalCharges' column
missing_total_charges = df['TotalCharges'].isnull().sum()
print(f"Number of missing values in 'TotalCharges': {missing_total_charges}")

# Drop rows with missing values in the 'TotalCharges' column
df.dropna(subset=['TotalCharges'], inplace=True)

# Verify that there are no more missing values in 'TotalCharges'
missing_total_charges_after_drop = df['TotalCharges'].isnull().sum()
print(f"Number of missing values in 'TotalCharges' after dropping rows: {missing_total_charges_after_drop}")

Number of missing values in 'TotalCharges': 11
Number of missing values in 'TotalCharges' after dropping rows: 0


## Data Exploration and Preprocessing - Examining Data Types and Unique Values

In [5]:
# Display data types of each column
print(df.dtypes)

# Display unique values for object type columns
for col in df.select_dtypes(include='object').columns:
    print(f'\nUnique values for column {col}:')
    print(df[col].unique())

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

Unique values for column customerID:
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']

Unique values for column gender:
['Female' 'Male']

Unique values for column Partner:
['Yes' 'No']

Unique values for column Dependents:
['No' 'Yes']

Unique values for column PhoneService:
['No' 'Yes']

Unique values for column MultipleLines:
['No phone service' 'No' 'Yes'

## Data Preprocessing - Encoding Categorical Features and Scaling Numerical Features

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Identify categorical and numerical columns (excluding 'customerID' and 'Churn')
categorical_features = [col for col in df.columns if df[col].dtype == 'object' and col not in ['customerID', 'Churn']]
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Create a column transformer to apply different preprocessing steps to different columns
# We will drop 'customerID' and 'Churn' for now and add them back later
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    remainder='drop' # Drop other columns initially
)

# Apply the preprocessing
df_processed_array = preprocessor.fit_transform(df.drop(columns=['customerID', 'Churn']))

# Get the feature names after one-hot encoding
onehot_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features)
all_feature_names = list(onehot_feature_names) + numerical_features

# Convert the processed array back to a DataFrame
df_processed = pd.DataFrame(df_processed_array, columns=all_feature_names, index=df.index)

# Add back the 'customerID' and 'Churn' columns
df_processed['customerID'] = df['customerID']
df_processed['Churn'] = df['Churn']


# Display the first few rows of the processed DataFrame
display(df_processed.head())

#EXPLAINATION
#We addressed the missing values in the TotalCharges column by dropping the 11 rows containing them.
#We examined the data types and unique values for all columns, identifying categorical and numerical features.
#We then preprocessed the data by:
#Applying One-Hot Encoding to the categorical features (excluding 'customerID' and 'Churn') to convert them into a numerical format suitable for machine learning models.
#Scaling the numerical features (tenure, MonthlyCharges, TotalCharges) using StandardScaler to normalize their ranges.
#The processed data was organized into a new DataFrame df_processed, with 'customerID' and 'Churn' added back.

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges,customerID,Churn
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,-1.280248,-1.161694,-0.994194,7590-VHVEG,No
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.064303,-0.260878,-0.17374,5575-GNVDE,No
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,-1.239504,-0.363923,-0.959649,3668-QPYBK,Yes
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.512486,-0.74785,-0.195248,7795-CFOCW,No
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,-1.239504,0.196178,-0.940457,9237-HQITU,Yes


## Splitting the Data into Training and Testing Sets

In [19]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_processed.drop(['customerID', 'Churn'], axis=1)
y = df_processed['Churn']

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# EXPLAINATION
# The preprocessed data was split into training (80%) and testing (20%) sets using train_test_split,
# with stratification on the 'Churn' column to maintain the proportion of churned and non-churned customers in both sets.

Shape of X_train: (5625, 44)
Shape of X_test: (1407, 44)
Shape of y_train: (5625,)
Shape of y_test: (1407,)


## Building the Random Forest Model

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
# Using a random_state for reproducibility
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

## Tuning the Random Forest Model

In [14]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter grid to sample from
param_grid = {
    'n_estimators': np.arange(100, 1001, 100),  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'], # Number of features to consider when looking for the best split
    'max_depth': [10, 20, 30, 40, 50, None],  # Maximum depth of the tree
    'min_samples_split': np.arange(2, 11, 2), # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 11, 2),  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False] # Whether bootstrap samples are used when building trees
}

# Initialize RandomizedSearchCV
# n_iter controls the number of parameter settings that are sampled.
# cv is the number of folds for cross-validation.
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid,
                                   n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1, scoring='f1')

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print(f"Best hyperparameters found: {best_params}")

# Get the best model
best_rf_model = random_search.best_estimator_

# EXPLAINATION
#We chose the Random Forest Classifier as our model for churn prediction due to its balance of predictive power and interpretability (through feature importance).
#A baseline Random Forest model was initialized.
#To optimize performance, we performed hyperparameter tuning using RandomizedSearchCV with a predefined parameter grid for n_estimators,
#max_features, max_depth, min_samples_split, min_samples_leaf, and bootstrap. The tuning process used 5-fold cross-validation and aimed to optimize for the F1-score.
#The best hyperparameters found were:
#{'n_estimators': 300, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': True}.
#The best_rf_model was obtained using these parameters.

Fitting 5 folds for each of 100 candidates, totalling 500 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


Best hyperparameters found: {'n_estimators': np.int64(300), 'min_samples_split': np.int64(8), 'min_samples_leaf': np.int64(1), 'max_features': 'log2', 'max_depth': 40, 'bootstrap': True}


## Evaluating the Tuned Random Forest Model

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Make predictions on the test set using the best model
y_pred_tuned_rf = best_rf_model.predict(X_test)

# Calculate evaluation metrics for the tuned model
accuracy = accuracy_score(y_test, y_pred_tuned_rf)
precision = precision_score(y_test, y_pred_tuned_rf, pos_label='Yes')
recall = recall_score(y_test, y_pred_tuned_rf, pos_label='Yes')
f1 = f1_score(y_test, y_pred_tuned_rf, pos_label='Yes')
conf_matrix = confusion_matrix(y_test, y_pred_tuned_rf)

# Display the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# EXPLAINATION
#The tuned Random Forest model (best_rf_model) was evaluated on the unseen test set.
#The evaluation metrics obtained were given below as a output along with Confusion Matrix.

Accuracy: 0.7839
Precision: 0.6182
Recall: 0.4893
F1-Score: 0.5463

Confusion Matrix:
[[920 113]
 [191 183]]


## Interpreting the Tuned Random Forest Model's Output - Feature Importance

In [18]:
# Get feature importances from the tuned Random Forest model
feature_importances = best_rf_model.feature_importances_

# Get feature names from the training data
feature_names = X_train.columns

# Create a DataFrame to display feature importance
feature_importance_rf = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort features by importance in descending order
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False)

# Display the feature importance
print("Feature Importance (Random Forest):")
display(feature_importance_rf)

Feature Importance (Random Forest):


Unnamed: 0,Feature,Importance
43,TotalCharges,0.143848
41,tenure,0.139189
42,MonthlyCharges,0.109681
32,Contract_Month-to-month,0.079056
14,OnlineSecurity_No,0.041018
39,PaymentMethod_Electronic check,0.038683
23,TechSupport_No,0.037308
12,InternetService_Fiber optic,0.031375
34,Contract_Two year,0.027638
11,InternetService_DSL,0.018241
