In [2]:
# Attach Google Drive to this Colab runtime so files stored there can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Job Assignments/Internshala/Data Science/SunbaseData

/content/drive/MyDrive/Job Assignments/Internshala/Data Science/SunbaseData


# **Data Preprocessing**

In [4]:
# import libraries
import pandas as pd

In [8]:
# Read the churn workbook (path is relative to the current working directory) and preview it.
DATASET_FILE = "customer_churn_large_dataset.xlsx"
churn_data = pd.read_excel(DATASET_FILE)
churn_data.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [9]:
# Print the frame summary: number of rows, each column's dtype and non-null count, memory usage.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
churn_data.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


Handle missing data and outliers

In [11]:
# Count the missing values in each column.
churn_data.isna().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [14]:
# Calculate Q1, Q3, and IQR for the numeric feature columns.
# CustomerID (an identifier) and Churn (the binary target) are excluded: both are dropped
# or used as the label later on, and applying an IQR rule to a binary label can discard an
# entire class when that class is rare (Q1 == Q3 makes the IQR zero).
numeric_features = (
    churn_data
    .select_dtypes(include=['number'])
    .drop(columns=['CustomerID', 'Churn'])
)
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when any numeric feature lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = ((numeric_features < lower_bound) | (numeric_features > upper_bound)).any(axis=1)

# Removing outliers. `.copy()` makes the filtered result an independent frame, so the
# column assignments performed on `churn_data` afterwards do not trigger
# SettingWithCopyWarning.
churn_data = churn_data.loc[~is_outlier].copy()


Prepare data for machine learning

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables.
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so the
# string <-> integer mapping of every column stays available (for inverse_transform or
# for encoding new data). Refitting a single shared encoder would overwrite the Gender
# mapping as soon as Location is fitted.
CATEGORICAL_COLUMNS = ['Gender', 'Location']
label_encoders = {
    column: LabelEncoder().fit(churn_data[column])
    for column in CATEGORICAL_COLUMNS
}

# `.assign` returns a new frame with the encoded columns in their original positions,
# instead of writing into a frame that may be a filtered slice of the raw data.
churn_data = churn_data.assign(
    **{
        column: encoder.transform(churn_data[column])
        for column, encoder in label_encoders.items()
    }
)

# Backward compatibility: `labelencoder` previously ended up fitted on Location.
labelencoder = label_encoders['Location']

In [16]:
# Separate the target from the predictors: y is the Churn column, X is every other
# column except CustomerID and Name.
NON_FEATURE_COLUMNS = ['CustomerID', 'Name', 'Churn']
y = churn_data['Churn']
X = churn_data.drop(columns=NON_FEATURE_COLUMNS)

In [17]:
# Report the dimensions of the feature matrix and of the target vector.
for shape_label, dataset in (('Shape of X = ', X), ('Shape of y = ', y)):
    print(shape_label, dataset.shape)

Shape of X =  (100000, 6)
Shape of y =  (100000,)


In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Report the dimensions of each piece of the train/test split.
for shape_label, dataset in (
    ('Shape of X_train = ', X_train),
    ('Shape of y_train = ', y_train),
    ('Shape of X_test = ', X_test),
    ('Shape of y_test = ', y_test),
):
    print(shape_label, dataset.shape)

Shape of X_train =  (80000, 6)
Shape of y_train =  (80000,)
Shape of X_test =  (20000, 6)
Shape of y_test =  (20000,)


# Feature Engineering

In [20]:
# Feature Engineering: standardise the features.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only and then applied to both splits,
# so no statistics from the test rows influence the transformation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Model Building

**Logistic Regression**

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a logistic regression with default settings on the training split, then
# predict labels for the held-out test split.
logistic_model = LogisticRegression().fit(X_train, y_train)
logistic_pred = logistic_model.predict(X_test)

In [27]:
# Print accuracy, precision, recall and F1 of the logistic-regression predictions
# against the test labels.
print("Logistic Regression:")
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
):
    print(f'{metric_name}: {metric_fn(y_test, logistic_pred)}')

Logistic Regression:
Accuracy: 0.50225
Precision: 0.4977596204533474
Recall: 0.38070758996068943
F1-Score: 0.4314352618653264


**Random Forest Classifier**

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random Forest
# A random forest is built from bootstrap samples and random feature subsets, so without
# a seed every run yields a different model and different metrics. The seed is fixed to
# the same value used for the train/test split to make the results reproducible.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
rf_pred = random_forest_model.predict(X_test)

In [31]:
# Print accuracy, precision, recall and F1 of the random-forest predictions
# against the test labels.
print("Random Forest:")
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
):
    print(f'{metric_name}: {metric_fn(y_test, rf_pred)}')

Random Forest:
Accuracy: 0.4988
Precision: 0.4945646437994723
Recall: 0.4723314182038101
F1-Score: 0.48319241080635184


**Neural Network**

In [32]:
# Neural Network
from sklearn.neural_network import MLPClassifier

In [33]:
# Multi-layer perceptron with default architecture. Weight initialisation and batch
# shuffling are random, so the seed is fixed (same value as the train/test split) to make
# the fitted model and its metrics reproducible across runs.
neural_net_model = MLPClassifier(random_state=42)
neural_net_model.fit(X_train, y_train)
nn_pred = neural_net_model.predict(X_test)

In [34]:
# Print accuracy, precision, recall and F1 of the neural-network predictions
# against the test labels.
print("Neural Network:")
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
):
    print(f'{metric_name}: {metric_fn(y_test, nn_pred)}')

Neural Network:
Accuracy: 0.49805
Precision: 0.4906734113183686
Recall: 0.31287168632194334
F1-Score: 0.3821013110112636


Best Model

In [35]:
# Select best model: the classifier with the highest F1-score on the test split.
logistic_f1 = f1_score(y_test, logistic_pred)
rf_f1 = f1_score(y_test, rf_pred)
nn_f1 = f1_score(y_test, nn_pred)

# Candidates are listed in priority order. max() returns the first entry holding the
# maximum, so a tie resolves to logistic regression, then random forest, then the
# neural network.
candidates = [
    (logistic_f1, logistic_model),
    (rf_f1, random_forest_model),
    (nn_f1, neural_net_model),
]
best_f1, best_model = max(candidates, key=lambda scored: scored[0])


Save the Best Model

In [37]:
import pickle

# Save the model
with open('model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# The model was trained on features transformed by the fitted StandardScaler (`scaler`),
# so it only gives meaningful predictions on inputs scaled the same way. The scaler is
# saved to its own file; the content of model.pkl is unchanged.
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)