In [1]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report

In [None]:
# Load the raw dataset.
# NOTE(review): the file has an .xls extension but is parsed with read_csv, so it is
# presumably delimited text rather than a real Excel workbook — confirm.
df = pd.read_csv("telecom-input-data.xls")
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
df.shape

In [None]:
# For every column print its dtype, the number of distinct values and the values themselves.
for col in df.columns:
    col_values = df[col].unique()
    print("***** ", col, " *****")
    print("- Type: ", df[col].dtype)
    print("- Total Unique values: ", len(col_values))
    print("- Unique values: ", col_values)

In [None]:
# Number of distinct TotalCharges values.
df['TotalCharges'].nunique()

In [None]:
# Rows whose TotalCharges cannot be parsed as a number (to_numeric coerces them to NaN).
invalid_rows = df[pd.to_numeric(df['TotalCharges'], errors='coerce').isna()]

print('Invalid rows (TotalCharges value is not convertible to a number): ', len(invalid_rows))

In [None]:
# Convert TotalCharges to a numeric column; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Null count per column after the coercion.
df.isnull().sum()

In [None]:
# Rows where TotalCharges is NaN.
df[np.isnan(df['TotalCharges'])]

In [None]:
# Rows with zero tenure.
df[df['tenure']==0]

In [None]:
# Keep only rows with non-zero tenure.
# NOTE(review): presumably these are the same rows that carry the NaN TotalCharges;
# the null count in the next cell is the check — confirm.
df = df[df['tenure'] != 0]

In [None]:
df.isnull().sum()

In [None]:
# Remove the customerID column from the working frame.
df = df.drop(columns=['customerID'])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Range of the MonthlyCharges column.
print('MonthlyCharges MIN: ', df['MonthlyCharges'].min())
print('MonthlyCharges MAX: ', df['MonthlyCharges'].max())

In [None]:
# Histogram (30 bins) of MonthlyCharges with a KDE curve overlaid.
plt.figure(figsize=(8, 4))
sns.histplot(df['MonthlyCharges'], bins=30, kde=True)
plt.title('Distribution of Monthly Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Row counts per Contract value, split by the Churn column.
plt.figure(figsize=(6, 4))
sns.countplot(x='Contract', hue='Churn', data=df)
plt.title('Count of Contract Types')
plt.xlabel('Contract Type')
plt.ylabel('Count')
plt.show()


In [None]:
# Row counts per gender value, split by the Churn column.
plt.figure(figsize=(6, 4))
sns.countplot(x='gender', hue='Churn', data=df)
plt.title('Count of Gender Types')
plt.xlabel('Gender Type')
plt.ylabel('Count')
plt.show()

In [None]:
# Row counts per Partner value, split by the Churn column.
# Title and x-label now name the plotted column (they previously said "Gender").
plt.figure(figsize=(6, 4))
sns.countplot(x='Partner', hue='Churn', data=df)
plt.title('Count of Partner Status')
plt.xlabel('Partner')
plt.ylabel('Count')
plt.show()

In [None]:
# Row counts per PaymentMethod value, split by the Churn column.
# Title and x-label now name the plotted column (they previously said "Gender").
plt.figure(figsize=(14, 4))
sns.countplot(x='PaymentMethod', hue='Churn', data=df)
plt.title('Count of Payment Methods')
plt.xlabel('Payment Method')
plt.ylabel('Count')
plt.show()

In [None]:
# Overlay the MonthlyCharges density of rows with Churn == 'No' (red) and
# Churn == 'Yes' (blue) on a single axes.
sns.set_context("paper", font_scale=1.1)
monthly_by_churn = {
    flag: df.loc[df["Churn"] == flag, "MonthlyCharges"] for flag in ('No', 'Yes')
}
ax = sns.kdeplot(monthly_by_churn['No'], color="Red")
ax = sns.kdeplot(monthly_by_churn['Yes'], ax=ax, color="Blue")
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title('Distribution of monthly charges by churn');

In [None]:
# Two independent working copies of the cleaned frame.
# NOTE: df_2 is assigned from df again in a later cell before it is encoded.
df_1 = df.copy()
df_2 = df.copy()

In [None]:
df_1.info()

In [None]:
#########

# Label-encode all object columns - to see how the encoding affects performance

########

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# One shared scaler instance; later cells call fit_transform on it, which re-fits it each time.
scaler = StandardScaler()

In [None]:
## Function to do label encoding
def label_encode_columns(df, columns):
    """Label-encode the given columns and return the encoded frame.

    A fresh ``LabelEncoder`` is fitted for every column, so each column gets
    its own, independent integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is left untouched.
    columns : iterable of str
        Names of the columns whose values are replaced by integer codes.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every column listed in ``columns`` holds
        the output of ``LabelEncoder.fit_transform`` for that column.
    """
    # Work on a copy: the caller's frame is never modified as a side effect, so
    # re-running a calling cell always starts from the same input.
    encoded = df.copy()
    for col in columns:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [None]:
# The object-dtype column names are taken from df and the encoding is applied to df_1.
to_label_encode = list(df.select_dtypes(include='object').columns)
df_1 = label_encode_columns(df_1, to_label_encode)

In [None]:
df_1.info()

In [None]:
df_1.isnull().sum()

In [None]:
df_1.head()

In [None]:
## Correlation of the features
correlation_matrix = df_1.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Correlation of every column with the encoded Churn column.
df_1.corr()['Churn']

In [None]:
import time

# Get current time in milliseconds
milliseconds = int(time.time() * 1000)

# Convert to string
timestamp_str = str(milliseconds)

# NOTE: timestamp_str is reassigned from datetime.now() in a later cell before it is
# used to build the result file names, so this value is only printed here.
print(timestamp_str)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

def evaluate_models_unscaled(X, y, f_name):
    """Fit three tree-based classifiers on a hold-out split and report their scores.

    The data is split 80/20; a decision tree, a random forest and a
    gradient-boosting classifier are fitted on the training part and scored
    on the test part (accuracy, precision, recall, F1). The scores are shown
    as a bar chart and written to ``f_name`` as CSV.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the estimators.
    y : target labels aligned with ``X``.
        NOTE(review): presumably a binary 0/1 target, since precision, recall
        and F1 are called without an ``average`` argument — confirm.
    f_name : str
        Path of the CSV file the score table is written to.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1 Score``.
    """
    # One seed for the split and for every estimator, so a kernel restart
    # reproduces exactly the same scores (the estimators were unseeded before).
    seed = 42

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Define models
    models = {
        'Decision Tree': DecisionTreeClassifier(random_state=seed),
        'Random Forest': RandomForestClassifier(random_state=seed),
        'Gradient Boosting': GradientBoostingClassifier(random_state=seed)
    }

    # Train and evaluate models, collecting one record per model
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rows.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            # zero_division=0: a model that predicts no positives scores 0 instead of warning
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0),
        })

    # Create DataFrame
    results_df = pd.DataFrame(rows, columns=['Model'] + metric_columns)

    # Plotting
    results_df.set_index('Model')[metric_columns].plot(kind='bar', figsize=(10, 6))
    plt.title('Performance of Unscaled Models')
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Export to CSV
    results_df.to_csv(f_name, index=False)

    return results_df


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_models_scaled(X, y, f_name):
    """Fit three scale-sensitive classifiers on a hold-out split and report their scores.

    The data is split 80/20; logistic regression, a support vector machine
    and an MLP are fitted on the training part and scored on the test part
    (accuracy, precision, recall, F1). The scores are shown as a bar chart
    and written to ``f_name`` as CSV.

    This function performs no scaling itself: ``X`` is used exactly as
    passed, so the caller is responsible for supplying scaled features.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the estimators.
    y : target labels aligned with ``X``.
        NOTE(review): presumably a binary 0/1 target, since precision, recall
        and F1 are called without an ``average`` argument — confirm.
    f_name : str
        Path of the CSV file the score table is written to.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1 Score``.
    """
    # One seed for the split and for the randomly initialised MLP, so a kernel
    # restart reproduces exactly the same scores (the MLP was unseeded before).
    seed = 42

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Define models
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Support Vector Machine': SVC(),
        'Neural Network': MLPClassifier(max_iter=1000, random_state=seed)
    }

    # Train and evaluate models, collecting one record per model
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rows.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            # zero_division=0: a model that predicts no positives scores 0 instead of warning
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1 Score': f1_score(y_test, y_pred, zero_division=0),
        })

    # Create DataFrame
    results_df = pd.DataFrame(rows, columns=['Model'] + metric_columns)

    # Plotting
    results_df.set_index('Model')[metric_columns].plot(kind='bar', figsize=(10, 6))
    plt.title('Performance of Scaled Models')
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Export to CSV
    results_df.to_csv(f_name, index=False)

    return results_df


In [None]:
# Numeric columns (standardised later) and the columns that get one-hot encoded.
df_num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df_2_ohe_col = ['PaymentMethod', 'Contract', 'InternetService']
# Every remaining column is label-encoded. The list is built in frame order instead of
# via set arithmetic, so it comes out identical on every run (the iteration order of a
# set of strings can differ between interpreter sessions).
df_2_label_enc_col = [
    c for c in df_2.columns
    if c not in df_num_cols and c not in df_2_ohe_col
]

In [None]:
# Standardise the numeric columns on a copy of the label-encoded frame.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split that
# happens inside evaluate_models_scaled, so test rows contribute to the scaling
# statistics — consider fitting on the training split only.
df_1_scaled = df_1.copy()
df_1_scaled[df_num_cols] = scaler.fit_transform(df_1_scaled[df_num_cols])
df_1_scaled.head()

In [None]:
df_2.head()

In [None]:
# Features / target taken from the unscaled label-encoded frame.
X = df_1.drop(columns=['Churn'])
y = df_1['Churn']

In [None]:
print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Features / target taken from the scaled frame.
X_scaled = df_1_scaled.drop(columns=['Churn'])
y_scaled = df_1_scaled['Churn']

In [None]:
# Report the shapes of the scaled features / target (this cell previously printed the
# unscaled X and y under the "Scaled" labels).
print("X Scaled shape:", X_scaled.shape)
print("y Scaled shape:", y_scaled.shape)

In [None]:
# Timestamp suffix for the result file names: day of month, then hour, minute and
# second (the minutes were missing from the format before, which made it ambiguous).
from datetime import datetime

timestamp_str = datetime.now().strftime('%d_%H%M%S')
timestamp_str

In [None]:
# Evaluate unscaled models on the label-encoded features; scores are written to fname.
fname = 'df1_un_scaled_' + timestamp_str + '.csv'
unscaled_results = evaluate_models_unscaled(X, y, fname)

In [None]:
# Evaluate scaled models on the label-encoded, standardised features.
fname = 'df1_scaled_' + timestamp_str + '.csv'
scaled_results = evaluate_models_scaled(X_scaled, y_scaled, fname)

In [None]:
#########

# Using a different encoding for object columns - to see how the encoding affects performance

########

In [None]:
# Start the second experiment from an independent copy of the cleaned frame, as was
# done for df_1 / df_2 earlier; a plain `df_2 = df` would alias df, so any in-place
# edit of df_2 would also change df.
df_2 = df.copy()

In [None]:
df_2.info()

In [None]:
df_2['tenure'].unique()

In [None]:
df_2.head()

In [None]:
# One-hot encode the columns in df_2_ohe_col; drop_first drops the first dummy of each
# column and dtype=int yields 0/1 integer columns.
df_2 = pd.get_dummies(df_2, columns=df_2_ohe_col, drop_first=True, dtype=int)


In [None]:
df_2.head()

In [None]:
# Label-encode the columns listed in df_2_label_enc_col.
df_2 = label_encode_columns(df_2, df_2_label_enc_col)

In [None]:
df_2_label_enc_col

In [None]:
df_2.head()

In [None]:
df_num_cols

In [None]:
# Standardise the numeric columns on a copy of the encoded frame.
# NOTE(review): as for df_1, the scaler is fitted on all rows before any train/test
# split, so test rows contribute to the scaling statistics.
df_2_scaled = df_2.copy()
df_2_scaled[df_num_cols] = scaler.fit_transform(df_2_scaled[df_num_cols])
df_2_scaled.head()

In [None]:
# Features / target taken from the unscaled frame.
X2 = df_2.drop(columns='Churn')
y2 = df_2['Churn']

In [None]:
print("X shape:", X2.shape)
print("y shape:", y2.shape)

In [None]:
# Features / target taken from the scaled frame.
X2_scaled = df_2_scaled.drop(columns='Churn')
y2_scaled = df_2_scaled['Churn']

In [None]:
print("X_Scaled shape:", X2_scaled.shape)
print("y_Scaled shape:", y2_scaled.shape)

In [None]:
# Evaluate unscaled models
# NOTE: reuses the name unscaled_results, replacing the df_1 results held in it.
fname = 'df2_un_scaled_' + timestamp_str + '.csv'
unscaled_results = evaluate_models_unscaled(X2, y2, fname)

In [None]:
# Evaluate scaled models
# NOTE: reuses the name scaled_results, replacing the df_1 results held in it.
fname = 'df2_scaled_' + timestamp_str + '.csv'
scaled_results = evaluate_models_scaled(X2_scaled, y2_scaled, fname)

In [None]:
# Random forest on the unscaled df_2 features, using the same 80/20 split parameters
# as the evaluate_* helpers.
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# oob_score=True is requested, but the out-of-bag score is not read in this cell.
model_rf = RandomForestClassifier(
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
    max_features='sqrt',  # Updated from 'auto' to 'sqrt'
    max_leaf_nodes=30
)

model_rf.fit(X_train, y_train)

# Make predictions
prediction_test = model_rf.predict(X_test)
print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
# Same model configuration on the scaled df_2 features. This reassigns X_train, X_test,
# y_train, y_test, model_rf and prediction_test, so the cells that follow work on the
# scaled split.
X_train, X_test, y_train, y_test = train_test_split(X2_scaled, y2_scaled, test_size=0.2, random_state=42)

model_rf = RandomForestClassifier(
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
    max_features='sqrt',  # Updated from 'auto' to 'sqrt'
    max_leaf_nodes=30
)

model_rf.fit(X_train, y_train)

# Make predictions
prediction_test = model_rf.predict(X_test)
print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
# Per-class precision / recall / F1 for the most recent random-forest predictions.
print(classification_report(y_test, prediction_test))


In [None]:
# AdaBoost on the train/test split currently held in X_train / X_test.
# Seeded so the reported accuracy is reproducible across kernel restarts.
a_model = AdaBoostClassifier(random_state=42)
a_model.fit(X_train,y_train)
a_preds = a_model.predict(X_test)
print("AdaBoost Classifier accuracy")
metrics.accuracy_score(y_test, a_preds)

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of gradient boosting, logistic regression and AdaBoost, fitted on
# the train/test split currently held in X_train / X_test.
# The stochastic members are seeded so the score is reproducible, and the logistic
# regression uses the same max_iter=1000 as in evaluate_models_scaled.
clf1 = GradientBoostingClassifier(random_state=42)
clf2 = LogisticRegression(max_iter=1000)
clf3 = AdaBoostClassifier(random_state=42)
eclf1 = VotingClassifier(estimators=[('gbc', clf1), ('lr', clf2), ('abc', clf3)], voting='soft')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print("Final Accuracy Score ")
print(accuracy_score(y_test, predictions))