In [1]:
import pandas as pd


In [4]:
# Load the churn workbook into a DataFrame, report its (rows, columns) shape,
# and preview the first three records.
data = pd.read_excel(io="customer_churn_large_dataset.xlsx")
raw_shape = data.shape
print(raw_shape)
data.head(n=3)

(100000, 9)


Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0


In [5]:
# Remove the identifier columns; the result is a new frame and `data` is left untouched.
identifier_columns = ['CustomerID', 'Name']
new_data = data.drop(identifier_columns, axis="columns")

# Shape after the drop, followed by a three-row preview.
print(new_data.shape)
new_data.head(n=3)

(100000, 7)


Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los Angeles,17,73.36,236,0
1,62,Female,New York,1,48.76,172,0
2,24,Female,Los Angeles,5,85.47,460,0


In [6]:
# Show one randomly chosen row (no random_state is passed, so the row differs per run).
new_data.sample(n=1)

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
45214,62,Female,Houston,6,48.81,298,0


In [7]:
# The last column is the target; every column before it is a feature.
target_position = new_data.shape[1] - 1
X = new_data.iloc[:, :target_position]
y = new_data.iloc[:, target_position]

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

In [9]:
# Hold out 10% of the rows as a test set; the split is shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# Display the shape of each of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((90000, 6), (10000, 6), (90000,), (10000,))

In [10]:
# Categorical & numerical feature columns
categorical_features = ['Gender', 'Location']
numerical_features = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']

# `OneHotEncoder` for the categorical columns, `StandardScaler` for the numerical ones.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero group instead of raising, so a fitted pipeline can still predict on
# new rows that contain, for example, a previously unseen Location.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
standard_scalar = StandardScaler()

# Route each column group through its transformer. `transformers` is passed as a
# list of (name, transformer, columns) tuples, the form the scikit-learn docs describe.
# NOTE: the 'encode_gender' step encodes both Gender and Location; the step name
# is kept unchanged so existing parameter paths keep working.
preprocesser = ColumnTransformer(transformers=[
    ('encode_gender', one_hot_encoder, categorical_features),
    ('standardization', standard_scalar, numerical_features)
])

In [11]:
# Model pipeline for Logistic Regression: preprocessing followed by the classifier.
# `steps` is given as a list of (name, estimator) tuples, matching the documented
# Pipeline API and the Random Forest pipeline defined later in this notebook.
clf = Pipeline(steps=[
    ('preprocessing', preprocesser),
    ('classifier', LogisticRegression())
])

In [12]:
# Fit the Logistic Regression pipeline on the training split, then report its
# accuracy on the held-out test split.
lr_accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
print("Accuracy score of Logistic Regression is: ", lr_accuracy)

Accuracy score of Logistic Regression is:  0.4994


In [13]:
# Beyond accuracy, report `Precision Score`, `Recall Score` and `F1 Score`
# for the Logistic Regression pipeline on the test split.
y_pred = clf.predict(X_test)

for caption, scorer in (
    ("The precision score of Logistic Regression is: ", precision_score),
    ("The recall score of Logistic Regression is: ", recall_score),
    ("The F1 score of Logistic Regression is: ", f1_score),
):
    print(caption, scorer(y_test, y_pred))

The precision score of Logistic Regression is:  0.4985514880168554
The recall score of Logistic Regression is:  0.378978978978979
The F1 score of Logistic Regression is:  0.4306187443130118


In [14]:
# Create `Model Pipeline` for `RandomForestClassifier`.
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore the scores reported below, are reproducible across runs.
# NOTE(review): this passes the same `preprocesser` object that the Logistic
# Regression pipeline uses; both pipelines are fit on X_train, so sharing it looks
# harmless here, but confirm before fitting either pipeline on different data.
clf2 = Pipeline(steps=[
    ('preprocessing', preprocesser),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [15]:
# Fit the Random Forest pipeline on the training split, then report its
# accuracy on the held-out test split.
rf_accuracy = clf2.fit(X_train, y_train).score(X_test, y_test)
print("The accuracy score of Random Forest Classifier is:", rf_accuracy)

The accuracy score of Random Forest Classifier is: 0.4945


In [16]:
# Check score using other metrics like `Precision Score`, `Recall Score`, `F1 Score`
# for the Random Forest pipeline (`clf2`) on the test split.
# The printed labels name the Random Forest Classifier, because these predictions
# come from `clf2`, not from the Logistic Regression pipeline.
y_pred = clf2.predict(X_test)

print("The precision score of Random Forest Classifier is: ", precision_score(y_test, y_pred))
print("The recall score of Random Forest Classifier is: ", recall_score(y_test, y_pred))
print("The F1 score of Random Forest Classifier is: ", f1_score(y_test, y_pred))

The precision score of Logistic Regression is:  0.49366018596787825
The recall score of Logistic Regression is:  0.46766766766766765
The F1 score of Logistic Regression is:  0.480312532127069


In [17]:
import tensorflow as tf
from tensorflow import keras

In [18]:
# Build the neural-network inputs: labels are the training targets as-is, and
# features are the training rows passed through the column preprocessor
# (which is fit on X_train in the same call).
targets = y_train
features = preprocesser.fit_transform(X=X_train)

In [19]:
# Assemble the `Sequential` network layer by layer: three ReLU hidden layers
# (64 -> 128 -> 64 units), each followed by 20% dropout, and a single sigmoid
# output unit. The input width is taken from the preprocessed feature matrix.
model = keras.Sequential()
model.add(keras.layers.Dense(units=64, activation="relu", input_shape=(features.shape[1], )))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=128, activation="relu"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=64, activation="relu"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=1, activation="sigmoid"))

In [20]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output), and accuracy tracked as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [21]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                768       
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 6

In [22]:
# Train for 50 epochs in batches of 1000 rows, holding back 20% of the
# training data for validation. The returned History object is the cell output.
model.fit(
    x=features,
    y=targets,
    epochs=50,
    batch_size=1000,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x215cf389600>

In [23]:
# Transform the held-out rows with the already-fitted preprocessor (transform
# only, no refit) and pair them with the held-out labels.
test_features, test_targets = preprocesser.transform(X_test), y_test

# Evaluate the network on the test split; the cell output is the result of
# `evaluate` for the compiled loss and metric.
model.evaluate(x=test_features, y=test_targets)



[0.6946685314178467, 0.4925999939441681]

In [24]:
import pickle

# Persist the fitted Logistic Regression pipeline (`clf`) to disk.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises, instead of leaving an open handle behind.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(clf, model_file)