In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [7]:
#1. Data Preprocessing
data = pd.read_excel("C:/Users/sudha/Downloads/customer_churn_large_dataset.xlsx")
data.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [8]:
print("Shape of the dataset:", data.shape)

Shape of the dataset: (100000, 9)


In [4]:
data.info()
print(data.describe())
print("Unique values in 'Gender':", data['Gender'].unique())
print("Unique values in 'Location':", data['Location'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB
          CustomerID            Age  Subscription_Length_Months  \
count  100000.000000  100000.000000               100000.000000   
mean    50000.500000      44.027020                   12.

In [9]:
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Subscription_Length_Months'].fillna(data['Subscription_Length_Months'].mean(), inplace=True)


data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
data['Location'].fillna(data['Location'].mode()[0], inplace=True)


print("The missing values after filling:", data.isnull().sum().sum())

The missing values after filling: 0


In [10]:
data = pd.get_dummies(data, columns=['Gender', 'Location'], drop_first=True)


data.head()

Unnamed: 0,CustomerID,Name,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Gender_Male,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,1,Customer_1,63,17,73.36,236,0,True,False,True,False,False
1,2,Customer_2,62,1,48.76,172,0,False,False,False,False,True
2,3,Customer_3,24,5,85.47,460,0,False,False,True,False,False
3,4,Customer_4,36,3,97.94,297,1,False,False,False,True,False
4,5,Customer_5,46,19,58.14,266,0,False,False,False,True,False


In [11]:
X = data.drop(['CustomerID', 'Churn'], axis=1)

y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (80000, 10)
X_test shape: (20000, 10)
y_train shape: (80000,)
y_test shape: (20000,)


In [13]:
#2. Feature Engineering: Finding Customer Tenure
X_train['Customer_Tenure'] = X_train['Age'] - X_train['Subscription_Length_Months']
X_test['Customer_Tenure'] = X_test['Age'] - X_test['Subscription_Length_Months']

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_excel("C:/Users/sudha/Downloads/customer_churn_large_dataset.xlsx")

categorical_columns = ["Gender", "Location"]

data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

X = data_encoded.drop("Churn", axis=1)
y = data_encoded["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numerical_features = ["Age", "Subscription_Length_Months", "Monthly_Bill", "Total_Usage_GB"]
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])



In [32]:
#3. Model Building: Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

data = pd.read_excel("C:/Users/sudha/Downloads/customer_churn_large_dataset.xlsx")

data = data.drop('CustomerID', axis=1)

# Define categorical and numerical features
categorical_features = ["Gender", "Location"]
numerical_features = ["Age", "Subscription_Length_Months", "Monthly_Bill", "Total_Usage_GB"]

# Separate features and target
X = data.drop("Churn", axis=1)
y = data["Churn"]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=42))])


model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print("Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)


Model Evaluation Metrics:
Accuracy: 0.5037

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.63      0.56     10079
           1       0.50      0.38      0.43      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000


Confusion Matrix:
 [[6313 3766]
 [6160 3761]]


In [33]:
#4. Model Optimization
from sklearn.model_selection import GridSearchCV

# Defining grid for hyperparameter
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'classifier__penalty': ['l1', 'l2'],  
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Model Performance Evaluation
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Optimized Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sudha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sudha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sudha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_pa

Best Hyperparameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Optimized Model Evaluation Metrics:
Accuracy: 0.50385
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.63      0.56     10079
           1       0.50      0.38      0.43      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

Confusion Matrix:
 [[6328 3751]
 [6172 3749]]


In [39]:
#5. Model Deployment
import joblib

joblib.dump(model.named_steps['preprocessor'].named_transformers_['num'], 'numerical_transformer.pkl')
joblib.dump(model.named_steps['preprocessor'].named_transformers_['cat'], 'categorical_transformer.pkl')
joblib.dump(model.named_steps['classifier'], 'best_model.pkl')


['best_model.pkl']