# Import Libraries

In [14]:
# basic imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# ML libraries
from sklearn.model_selection import train_test_split

# remove warnings
import warnings
warnings.filterwarnings("ignore")

# Sample Data

In [15]:
# Generate mock customer data
np.random.seed(42)
num_customers = 2000

data = {
    'CustomerID': [f'CUST{1000+i}' for i in range(num_customers)],
    'Gender': np.random.choice(['Male', 'Female'], num_customers, p=[0.5, 0.5]),
    'SeniorCitizen': np.random.choice([0, 1], num_customers, p=[0.84, 0.16]),
    'Partner': np.random.choice(['Yes', 'No'], num_customers, p=[0.48, 0.52]),
    'Dependents': np.random.choice(['Yes', 'No'], num_customers, p=[0.3, 0.7]),
    'Tenure': np.random.randint(1, 73, num_customers), # Months
    'PhoneService': np.random.choice(['Yes', 'No'], num_customers, p=[0.9, 0.1]),
    'MultipleLines': np.random.choice(['Yes', 'No', 'No phone service'], num_customers, p=[0.42, 0.48, 0.1]),
    'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], num_customers, p=[0.34, 0.44, 0.22]),
    'OnlineSecurity': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.28, 0.50, 0.22]),
    'OnlineBackup': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.34, 0.44, 0.22]),
    'DeviceProtection': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.34, 0.44, 0.22]),
    'TechSupport': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.29, 0.49, 0.22]),
    'StreamingTV': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.38, 0.40, 0.22]),
    'StreamingMovies': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.39, 0.39, 0.22]),
    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], num_customers, p=[0.55, 0.24, 0.21]),
    'PaperlessBilling': np.random.choice(['Yes', 'No'], num_customers, p=[0.59, 0.41]),
    'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'], num_customers, p=[0.34, 0.23, 0.22, 0.21]),
    'MonthlyCharges': np.random.normal(loc=65, scale=30, size=num_customers).clip(18, 120).round(2),
}

# create DataFrame
df_customers = pd.DataFrame(data)

# Generate TotalCharges based on Tenure and MonthlyCharges with some noise
df_customers['TotalCharges'] = (df_customers['Tenure'] * df_customers['MonthlyCharges'] * np.random.uniform(0.95, 1.05, num_customers)).round(2)

# Make some TotalCharges empty for realism (e.g., new customers with 0 tenure)
df_customers.loc[df_customers['Tenure'] == 1, 'TotalCharges'] = df_customers['MonthlyCharges']

# Get all customers with Tenure < 3
low_tenure_mask = df_customers['Tenure'] < 3
# Get random sample of 1% of those low-tenure customers
random_indices = df_customers[low_tenure_mask].sample(frac=0.01, random_state=42).index
# Set TotalCharges to NaN for these
df_customers.loc[random_indices, 'TotalCharges'] = np.nan

# Simulate Churn (more likely for month-to-month, higher charges, lower tenure)
churn_probability = 0.1 \
+ 0.15 * (df_customers['Contract'] == 'Month-to-month') \
+ 0.1 * (df_customers['InternetService'] == 'Fiber optic') \
+ 0.001 * (df_customers['MonthlyCharges'] - 65) \
- 0.002 * (df_customers['Tenure'] - 36) \
+ 0.1 * (df_customers['OnlineSecurity'] == 'No') \
+ 0.1 * (df_customers['TechSupport'] == 'No')

churn_probability = np.clip(churn_probability, 0.01, 0.99)
df_customers['Churn'] = np.random.binomial(1, churn_probability, num_customers).astype(str)
df_customers['Churn'] = df_customers['Churn'].replace({'1': 'Yes', '0': 'No'})

# Replace 'No phone service' and 'No internet service' for consistency
for col in ['MultipleLines']:
    df_customers[col] = df_customers.apply(lambda row: 'No' if row['PhoneService'] == 'No' else row[col], axis=1)

for col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:
    df_customers[col] = df_customers.apply(lambda row: 'No' if row['InternetService'] == 'No' else row[col], axis=1)

# make folder if it doesn't exist
if not os.path.exists('Data'):
    os.makedirs('Data')

# Save the DataFrame to an Excel file
df_customers.to_excel('./Data/telecom_churn_mock_data.xlsx', index=False)
print("Mock telecom churn data generated: telecom_churn_mock_data.xlsx")


Mock telecom churn data generated: telecom_churn_mock_data.xlsx


In [16]:
# load the data
df = pd.read_excel('./Data/telecom_churn_mock_data.xlsx')

print("Data loaded successfully")

Data loaded successfully


In [17]:
# show all columns
pd.set_option('display.max_columns', None)

# data preview
df.head()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST1000,Male,0,No,No,30,Yes,Yes,No,No,No,No,No,No,No,Month-to-month,No,Bank transfer (automatic),69.55,2047.01,Yes
1,CUST1001,Female,0,No,Yes,11,Yes,Yes,Fiber optic,No internet service,No internet service,No,No internet service,No,No internet service,Month-to-month,Yes,Electronic check,48.08,522.42,Yes
2,CUST1002,Female,1,No,No,17,No,No,Fiber optic,No,Yes,No,No,Yes,No,Two year,No,Electronic check,36.56,610.07,No
3,CUST1003,Female,0,Yes,No,26,Yes,No,No,No,No,No,No,No,No,One year,Yes,Bank transfer (automatic),79.72,2159.26,No
4,CUST1004,Male,0,Yes,Yes,23,Yes,No,Fiber optic,No internet service,No,No,Yes,No,Yes,Month-to-month,Yes,Mailed check,70.42,1672.56,Yes
