Data Preprocessing

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from sklearn.metrics import f1_score
import numpy as np

# Load the data
df = pd.read_csv('Telco-Customer-Churn.csv')

# Drop Customer ID
df = df.drop(columns=['customerID'])

# Convert binary columns to numerical values
binary_columns = {
    'gender': {'Male': 1, 'Female': 0},
    'Partner': {'Yes': 1, 'No': 0},
    'Dependents': {'Yes': 1, 'No': 0},
    'PhoneService': {'Yes': 1, 'No': 0},
    'PaperlessBilling': {'Yes': 1, 'No': 0},
    'Churn': {'Yes': 1, 'No': 0}
}
df.replace(binary_columns, inplace=True)

# Replace empty strings with NaN and drop missing values
df.replace(' ', pd.NA, inplace=True)
df.dropna(inplace=True)

# Convert numeric columns to appropriate data types
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

# Separate features and target
y = df['Churn']
X = df.drop(columns=['Churn'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numerical and categorical data
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
numeric_transformer = StandardScaler()

categorical_features = ['MultipleLines', 'InternetService', 'OnlineSecurity', 
                        'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[ 
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Build the pipeline without the classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline on training data and transform both train and test data
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

  df.replace(binary_columns, inplace=True)


Looping through Parameter combinations to see best performing combo

In [None]:
# Define parameter grid
layer_sizes = [32, 48, 64]
dropout_rates = [0.2, 0.3]
learning_rates = [0.0001, 0.001, 0.005, 0.01, 0.05]
batch_sizes = [16, 32, 64]
optimizers = {'adam': Adam, 'rmsprop': RMSprop}
epochs = 5  # Start with a small number of epochs for quick testing

# Table to store results
results = []

# Loop over all combinations of parameters
for layer_size in layer_sizes:
    for dropout_rate in dropout_rates:
        for learning_rate in learning_rates:
            for batch_size in batch_sizes:
                for opt_name, opt_class in optimizers.items():
                    # Build the model
                    model = Sequential([
                        Dense(layer_size, activation='relu', input_shape=(X_train_processed.shape[1],)),
                        Dropout(dropout_rate),
                        Dense(int(layer_size / 2), activation='relu'),
                        Dense(1, activation='sigmoid')
                    ])
                    
                    # Compile the model with dynamic optimizer
                    optimizer = opt_class(learning_rate=learning_rate)
                    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
                    
                    # Train the model
                    model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
                    
                    # Evaluate on test data
                    y_pred = (model.predict(X_test_processed) > 0.5).astype("int32")
                    f1 = f1_score(y_test, y_pred)
                    
                    # Store results
                    results.append({
                        'Layer Size': layer_size,
                        'Dropout Rate': dropout_rate,
                        'Learning Rate': learning_rate,
                        'Batch Size': batch_size,
                        'Optimizer': opt_name,
                        'F1 Score': f1
                    })

In [13]:
# Convert results to DataFrame and sort by F1 Score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='F1 Score', ascending=False)
print(results_df)

     Layer Size  Dropout Rate  Learning Rate  Batch Size Optimizer  F1 Score
167          64           0.3         0.0050          64   rmsprop  0.602410
101          48           0.3         0.0010          64   rmsprop  0.599179
72           48           0.2         0.0050          16      adam  0.597333
48           32           0.3         0.0100          16      adam  0.591700
38           32           0.3         0.0010          32      adam  0.591160
..          ...           ...            ...         ...       ...       ...
5            32           0.2         0.0001          64   rmsprop  0.285106
55           32           0.3         0.0500          16   rmsprop  0.249443
35           32           0.3         0.0001          64   rmsprop  0.244989
34           32           0.3         0.0001          64      adam  0.147971
4            32           0.2         0.0001          64      adam  0.010610

[180 rows x 6 columns]


Run the model again with 1 set of parameters.

In [6]:
from sklearn.utils import class_weight

# Calculate class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

# Create a dictionary for class weights
class_weights_dict = {int(label): weight for label, weight in zip(np.unique(y_train), class_weights)}

# Print the class weights dictionary for verification
print("Class weights dictionary:", class_weights_dict)

Class weights dictionary: {0: 0.6809927360774818, 1: 1.8812709030100334}


In [7]:
y_train = y_train.values  # Convert to a plain numpy array

In [8]:
# Set parameters
layer_size = 32
dropout_rate = 0.2
learning_rate = 0.01
batch_size = 32
epochs = 20
optimizer = Adam(learning_rate=learning_rate)

# Build the model
model = Sequential([
    Dense(layer_size, activation='relu', input_shape=(X_train_processed.shape[1],)),
    Dropout(dropout_rate),
    Dense(int(layer_size / 2), activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with class weights
model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size, 
          verbose=1, class_weight=class_weights_dict)

# Evaluate on test data
y_pred = (model.predict(X_test_processed) > 0.5).astype("int32")
f1 = f1_score(y_test, y_pred)

# Output F1 Score
print("F1 Score on test set:", f1)

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7090 - loss: 0.5434
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7518 - loss: 0.5007
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7540 - loss: 0.4829
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step - accuracy: 0.7616 - loss: 0.4860
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step - accuracy: 0.7638 - loss: 0.4855
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7505 - loss: 0.4803
Epoch 7/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7565 - loss: 0.4957
Epoch 8/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7675 - loss: 0.4750
Epoch 9/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━

In [9]:
# Get 3 random indices from the test set
random_indices = np.random.choice(X_test.index, size=3, replace=False)

# Select the corresponding input features and target labels
X_samples = X_test.loc[random_indices]
y_samples = y_test.loc[random_indices]

# Apply the preprocessing pipeline to the selected test samples
X_samples_processed = pipeline.transform(X_samples)

# Get the model's predictions for these samples
y_pred_samples = (model.predict(X_samples_processed) > 0.5).astype("int32")

# Display the original input features (before transformation) and the corresponding predictions
for i in range(3):
    print(f"Sample {i+1}:")
    print("Original Features:")
    print(X_samples.iloc[i])
    print("Original Target (True):", y_samples.iloc[i])
    print("Predicted Output:", y_pred_samples[i][0])
    print("-" * 40)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Sample 1:
Original Features:
gender                             0
SeniorCitizen                      1
Partner                            0
Dependents                         0
tenure                            15
PhoneService                       1
MultipleLines                     No
InternetService          Fiber optic
OnlineSecurity                    No
OnlineBackup                      No
DeviceProtection                  No
TechSupport                       No
StreamingTV                      Yes
StreamingMovies                  Yes
Contract              Month-to-month
PaperlessBilling                   1
PaymentMethod       Electronic check
MonthlyCharges                  91.5
TotalCharges                  1400.3
Name: 3469, dtype: object
Original Target (True): 0
Predicted Output: 1
----------------------------------------
Sample 2:
Original Features:
gender                           1
SeniorCitizen      