In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
# Step 2: Load the Dataset
df = pd.read_csv('C:/Users/User/Downloads/archive/syria_tel.csv')

In [5]:
# Step 3: Explore the Data
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())


  state  account length  area code phone number international plan  \
0    KS             128        415     382-4657                 no   
1    OH             107        415     371-7191                 no   
2    NJ             137        415     358-1921                 no   
3    OH              84        408     375-9999                yes   
4    OK              75        415     330-6626                yes   

  voice mail plan  number vmail messages  total day minutes  total day calls  \
0             yes                     25              265.1              110   
1             yes                     26              161.6              123   
2              no                      0              243.4              114   
3              no                      0              299.4               71   
4              no                      0              166.7              113   

   total day charge  ...  total eve calls  total eve charge  \
0             45.07  ...           

In [6]:
print(df.isnull().sum())


state                     0
account length            0
area code                 0
phone number              0
international plan        0
voice mail plan           0
number vmail messages     0
total day minutes         0
total day calls           0
total day charge          0
total eve minutes         0
total eve calls           0
total eve charge          0
total night minutes       0
total night calls         0
total night charge        0
total intl minutes        0
total intl calls          0
total intl charge         0
customer service calls    0
churn                     0
dtype: int64


In [9]:
# Splitting the dataset
X = df.drop('churn', axis=1)  # Features
y = df['churn']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
print(X_train.dtypes)


state                      object
account length              int64
area code                   int64
phone number               object
international plan         object
voice mail plan            object
number vmail messages       int64
total day minutes         float64
total day calls             int64
total day charge          float64
total eve minutes         float64
total eve calls             int64
total eve charge          float64
total night minutes       float64
total night calls           int64
total night charge        float64
total intl minutes        float64
total intl calls            int64
total intl charge         float64
customer service calls      int64
dtype: object


In [13]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Drop 'phone number' as it's not useful for prediction
X_train = X_train.drop(columns=['phone number'])
X_test = X_test.drop(columns=['phone number'])

# Encode 'international plan' and 'voice mail plan' using LabelEncoder
label_encoder = LabelEncoder()

X_train['international plan'] = label_encoder.fit_transform(X_train['international plan'])
X_test['international plan'] = label_encoder.transform(X_test['international plan'])

X_train['voice mail plan'] = label_encoder.fit_transform(X_train['voice mail plan'])
X_test['voice mail plan'] = label_encoder.transform(X_test['voice mail plan'])

# One-hot encode 'state' column
onehot_encoder = OneHotEncoder(drop='first', sparse=False)

state_train_encoded = onehot_encoder.fit_transform(X_train[['state']])
state_test_encoded = onehot_encoder.transform(X_test[['state']])

# Convert to DataFrame for merging back
state_train_encoded_df = pd.DataFrame(state_train_encoded, index=X_train.index, columns=onehot_encoder.get_feature_names_out(['state']))
state_test_encoded_df = pd.DataFrame(state_test_encoded, index=X_test.index, columns=onehot_encoder.get_feature_names_out(['state']))

# Drop the original 'state' column and concatenate with the new encoded columns
X_train = pd.concat([X_train.drop(columns=['state']), state_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=['state']), state_test_encoded_df], axis=1)




In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Baseline Model - Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_train = log_reg.predict(X_train)
y_pred_test = log_reg.predict(X_test)

In [16]:
# Evaluate the baseline model
print("Baseline Model - Logistic Regression")
print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(f"Precision: {precision_score(y_test, y_pred_test)}")
print(f"Recall: {recall_score(y_test, y_pred_test)}")
print(f"F1 Score: {f1_score(y_test, y_pred_test)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_test)}")

Baseline Model - Logistic Regression
Training Accuracy: 0.8713428357089272
Testing Accuracy: 0.8575712143928036
Precision: 0.5833333333333334
Recall: 0.2079207920792079
F1 Score: 0.30656934306569344
ROC AUC Score: 0.5907095126473778


In [17]:
# Step 6: Model Iteration and Improvement
# Hyperparameter Tuning for Logistic Regression
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_log_reg = grid_search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Evaluate the tuned model
y_pred_test_tuned = best_log_reg.predict(X_test)
print("Tuned Model - Logistic Regression")
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred_test_tuned)}")


Tuned Model - Logistic Regression
Testing Accuracy: 0.856071964017991


In [19]:
# Advanced Model - Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Advanced Model - Random Forest")
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_rf)}")

Advanced Model - Random Forest
Testing Accuracy: 0.9430284857571214
ROC AUC Score: 0.820015393765525
