In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import seaborn as sns
from IPython.display import display
pd.set_option('display.max_columns', None)
from sklearn.metrics import confusion_matrix,recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score



In [None]:
# Read the data dictionary (decoded as latin1) and the customer table.
dic = pd.read_csv("telecom_data_dictionary.csv", encoding='latin1')
df = pd.read_csv("telecom_customer_churn.csv")

# Show the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head()


In [None]:
# Walk the data dictionary row by row, printing each field name
# followed by its description.
for row_label in dic.index:
    print(dic.loc[row_label, "Field"])
    print(dic.loc[row_label, "Description"])

In [None]:
# Number of customers in each "Customer Status" class.
# size() counts rows per group, so the result does not depend on which column
# comes first in df or on whether that column contains missing values.
stat = df.groupby("Customer Status").size().to_frame("distribution")
display(stat)

# Bar chart of the class counts, with horizontal tick labels.
stat.plot(kind='bar', legend=False)
plt.title('Customer Status Count')
plt.xlabel('Customer Status')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

So we are dealing with an imbalanced dataset: the customer-status classes are not equally represented.

In [None]:
# Print a summary of the raw frame: column names, non-null counts and dtypes.
df.info()


In [None]:
# Keep only rows whose status is not 'Joined', then encode the target:
# 'Churned' -> 1, 'Stayed' -> 0.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
df = df[df['Customer Status'] != 'Joined'].copy()
df['Customer Status'] = df['Customer Status'].map({'Churned': 1, 'Stayed': 0})


We dropped the "Joined" customers because we are interested only in customers who have either "churned" or "stayed".

In [None]:
# Remove the ID, location and churn category/reason columns from the frame.
columns_to_drop = [
    'Zip Code',
    'City',
    'Churn Category',
    'Churn Reason',
    'Latitude',
    'Longitude',
    'Customer ID',
]
df = df.drop(columns=columns_to_drop)


In [None]:
# Replacement value for missing entries, keyed by column name.
fill_values = {
    'Offer': 'no offer',
    'Avg Monthly Long Distance Charges': 0,
    'Internet Type': 'no internet',
    'Avg Monthly GB Download': 0,
}

# The service columns below all use 'No' for a missing entry.
service_columns = [
    "Multiple Lines",
    "Online Security",
    "Online Backup",
    "Device Protection Plan",
    "Premium Tech Support",
    "Streaming TV",
    "Streaming Movies",
    "Streaming Music",
    "Unlimited Data",
]
fill_values.update(dict.fromkeys(service_columns, 'No'))

# Apply the fills one column at a time.
for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

In [None]:
# Print the summary again; presumably to confirm the fills above left no
# missing values in the filled columns (check the non-null counts).
df.info()

In [None]:
# For every feature column, draw one figure combining:
#   - bars (left axis): number of customers per value of the feature
#   - dashed red line (right axis): percentage of churned customers per value
for feature in df.columns:
    # Skip the target: grouped by itself it can only show 0% and 100%.
    if feature == 'Customer Status':
        continue

    # Group once and derive both series from the same grouping.
    status_by_value = df.groupby(feature)['Customer Status']
    churn_rate = status_by_value.apply(lambda x: (x == 1).mean() * 100)
    total_customers = status_by_value.size()

    fig, ax1 = plt.subplots(figsize=(10, 6))

    ax1.bar(total_customers.index, total_customers.values, color='lightblue', label='Total Customers')
    ax1.set_xlabel(feature)
    ax1.set_ylabel('Total Customers', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Second y-axis sharing the same x-axis, used for the churn percentage.
    ax2 = ax1.twinx()
    ax2.plot(churn_rate.index, churn_rate.values, color='red', label='Churn Rate', marker='o', linestyle='--')
    ax2.set_ylabel('Churn Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title(f'Churn Rate and Total Customers by {feature}')

    ax1.legend(loc="upper left")
    ax2.legend(loc="upper right")

    plt.tight_layout()
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)


In [None]:
# Remove the 'Gender' column from the frame.
df = df.drop('Gender', axis=1)


In [None]:
# Target vector: the 'Customer Status' column.
y = df['Customer Status']

# Every remaining column is a feature.
features = df.columns.drop('Customer Status')

# One-hot encode categorical variables as integer columns, dropping the first
# level of each encoded variable.
X = pd.get_dummies(df.loc[:, features], drop_first=True, dtype=int)

In [None]:
# Pairwise correlations between all encoded features and the target.
num_df = X.copy()
df_x_y = pd.concat([num_df, y], axis="columns")
correlation_matrix = df_x_y.corr()

# Annotated heatmap of the whole correlation matrix.
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Recompute the correlation matrix of the encoded features plus the target.
num_df = X.copy()
df_x_y = pd.concat([num_df, y], axis="columns")
correlation_matrix = df_x_y.corr()

# Keep only the target's row, without its self-correlation entry.
customer_status_corr = correlation_matrix.loc['Customer Status'].drop('Customer Status')

# Draw that row as a single-row annotated heatmap.
fig, ax = plt.subplots(figsize=(25, 2))
sns.heatmap(
    customer_status_corr.to_frame().T,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar_kws={'label': 'Correlation'},
    ax=ax,
)
ax.set_title('Correlation of Customer Status with Other Features')

plt.show()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of df.
df.describe()

## ML without scaling

In [None]:
# Hold out a test set. stratify=y keeps the class proportions of the target the
# same in the train and test splits, which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Baseline gradient-boosting model.
clf = GradientBoostingClassifier(random_state=0, max_depth=3)
clf.fit(X_train, y_train)

print('Accuracy on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}\n'.format(clf.score(X_test, y_test)))

# Predict once and reuse the predictions for both metrics.
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Recall of the positive class (label 1).
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

In [None]:
# Weight each training sample by its class: label 1 counts twice as much as label 0.
class_weights = {0: 1, 1: 2}
sample_weights = y_train.map(class_weights)

# Gradient boosting fitted with the per-sample weights.
clf_ = GradientBoostingClassifier(random_state=0)
clf_.fit(X_train, y_train, sample_weight=sample_weights)

# Report the accuracy of the weighted model fitted in this cell (clf_).
print('Accuracy on training set: {:.2f}'.format(clf_.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}\n'.format(clf_.score(X_test, y_test)))

# Predict once and reuse the predictions for both metrics.
y_pred = clf_.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Recall of the positive class (label 1).
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

In [None]:
# Random forest with class_weight='balanced', which lets scikit-learn derive
# the class weights from the class frequencies in y_train
# (no manual weight dictionary is used by this model).
clf1 = RandomForestClassifier(random_state=0, class_weight='balanced')
clf1.fit(X_train, y_train)

print("Accuracy on training set: {:.2f}".format(clf1.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(clf1.score(X_test, y_test)))

# Predict once and reuse the predictions for both metrics.
y_pred = clf1.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Recall of the positive class (label 1).
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=100 on the unscaled features.
clf2 = LogisticRegression(C=100)
clf2.fit(X_train, y_train)

train_accuracy = clf2.score(X_train, y_train)
test_accuracy = clf2.score(X_test, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}\n'.format(test_accuracy))

# One set of test predictions feeds both the confusion matrix and the recall.
test_predictions = clf2.predict(X_test)

cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

recall = recall_score(y_test, test_predictions)
print(f"Recall: {recall:.2f}")

## ML using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multi-layer perceptron with three hidden layers of 10 units each.
clf3 = MLPClassifier(
    hidden_layer_sizes=[10, 10, 10],
    alpha=5,
    random_state=0,
    solver='lbfgs',
)
clf3.fit(X_train_scaled, y_train)

train_accuracy = clf3.score(X_train_scaled, y_train)
test_accuracy = clf3.score(X_test_scaled, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}\n'.format(test_accuracy))

# One set of test predictions feeds both the confusion matrix and the recall.
test_predictions = clf3.predict(X_test_scaled)

cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

recall = recall_score(y_test, test_predictions)
print(f"Recall: {recall:.2f}")

In [None]:
# Logistic regression with C=100, this time on the min-max scaled features.
clf2_ = LogisticRegression(C=100)
clf2_.fit(X_train_scaled, y_train)

train_accuracy = clf2_.score(X_train_scaled, y_train)
test_accuracy = clf2_.score(X_test_scaled, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}\n'.format(test_accuracy))

# One set of test predictions feeds both the confusion matrix and the recall.
test_predictions = clf2_.predict(X_test_scaled)

cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

recall = recall_score(y_test, test_predictions)
print(f"Recall: {recall:.2f}")

In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default settings on the scaled features.
clf4 = LinearSVC()
clf4.fit(X_train_scaled, y_train)

train_accuracy = clf4.score(X_train_scaled, y_train)
test_accuracy = clf4.score(X_test_scaled, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}\n'.format(test_accuracy))

# One set of test predictions feeds both the confusion matrix and the recall.
test_predictions = clf4.predict(X_test_scaled)

cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

recall = recall_score(y_test, test_predictions)
print(f"Recall: {recall:.2f}")

In [None]:
# Support-vector classifier with a polynomial kernel on the scaled features.
# NOTE(review): this rebinds clf4, so any model previously stored under that
# name is no longer reachable after this cell runs.
clf4 = SVC(kernel='poly')
clf4.fit(X_train_scaled, y_train)

train_accuracy = clf4.score(X_train_scaled, y_train)
test_accuracy = clf4.score(X_test_scaled, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}\n'.format(test_accuracy))

# One set of test predictions feeds both the confusion matrix and the recall.
test_predictions = clf4.predict(X_test_scaled)

cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

recall = recall_score(y_test, test_predictions)
print(f"Recall: {recall:.2f}")