In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
# Location of the Telco churn CSV. The default is the path this notebook was
# originally written against; set the TELCO_CHURN_CSV environment variable to
# load the file from another machine without editing the notebook.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "D:\\23881A66E2\\Projects\\Customer_Churn_Predictor\\data\\WA_Fn-UseC_-Telco-Customer-Churn.csv",
)
data = pd.read_csv(DATA_PATH)
data

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Number of missing values in each column of the freshly loaded frame.
data.isna().sum()

In [None]:
# Print the per-column dtype / non-null summary of the loaded frame.
data.info()

In [None]:
# Descriptive statistics of the frame (pandas' default column selection).
data.describe()

In [None]:
# Histogram grid of the frame's columns. The trailing semicolon keeps the
# notebook from echoing the array-of-Axes repr underneath the figure.
data.hist(figsize=(20, 20), bins=50);

In [None]:
# Print the frequency table of every column, one column at a time.
for column_name, column_values in data.items():
    print(f"\nColumn: {column_name}")
    print(column_values.value_counts())

In [None]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence between each string-typed column and the
# Churn target. Churn itself is skipped (testing the target against itself is
# uninformative), matching the Cramér's V loop later in the notebook.
for col in data.select_dtypes(include=['object']).columns:
    if col == 'Churn':
        continue
    contingency_table = pd.crosstab(data[col], data['Churn'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"\nChi-squared test for {col}:")
    print(f"Chi2: {chi2}, p-value: {p}")

In [None]:
from scipy.stats import chi2_contingency

def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categorical series.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    float
        The corrected Cramér's V. ``0.0`` is returned when the statistic is
        undefined: an empty table, fewer than two observations, or fewer than
        two distinct categories on either side. Those cases used to yield NaN
        (0/0) or divide by ``n - 1 == 0``.
    """
    # Named `contingency` so it does not shadow sklearn's `confusion_matrix`.
    contingency = pd.crosstab(x, y)
    r, k = contingency.shape
    n = contingency.to_numpy().sum()

    # Degenerate tables carry no association information.
    if r < 2 or k < 2 or n < 2:
        return 0.0

    chi2 = chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    # Subtract the expected bias of phi^2 and clip at zero.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Report Cramér's V between every string-typed feature and the Churn target.
# The label below previously contained a mis-decoded "é".
for col in data.select_dtypes(include=['object']).columns:
    if col != 'Churn':
        cramer_v_value = cramers_v(data[col], data['Churn'])
        print(f"Cramér's V for {col} and Churn: {cramer_v_value}")

In [None]:
# Parse both charge columns as numbers; values that cannot be parsed become NaN.
for charge_column in ("TotalCharges", "MonthlyCharges"):
    data[charge_column] = pd.to_numeric(data[charge_column], errors='coerce')

In [None]:
# Remove the columns that are not carried forward into the feature matrix.
unused_columns = ["customerID", "gender", "PhoneService"]
data = data.drop(columns=unused_columns)

In [None]:
# Missing values per column after the numeric conversion and column drop.
data.isna().sum()

In [None]:
# Impute missing TotalCharges with the column median. The result is assigned
# back to the frame: calling fillna(inplace=True) on a selected column is a
# chained assignment, which warns in pandas 2.x and does not modify `data`
# once Copy-on-Write is enabled.
total_charges_median = data["TotalCharges"].median()
data["TotalCharges"] = data["TotalCharges"].fillna(total_charges_median)

In [None]:
# Confirm the remaining missing-value counts after imputation.
data.isna().sum()

In [None]:
# Separate the target column from the feature matrix.
target_column = "Churn"
y = data[target_column]
x = data.drop(columns=[target_column])

In [None]:
# 80/20 hold-out split. Stratifying on the target keeps the class proportions
# identical in both parts, which matters because the classes are weighted
# unequally further down (the churn class is treated as the minority).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Re-print the dtype / non-null summary now that the frame has been cleaned.
data.info()

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# String-typed columns are passed to CatBoost as native categorical features.
cat_features = x_train.select_dtypes(include=['object']).columns.tolist()

# Early stopping and best-iteration selection need their own validation data.
# Using the test set for that (as before) tunes the model on the very rows it
# is then scored on, so a validation part is carved out of the training data.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

model = CatBoostClassifier(
    iterations=1000,          # number of boosting rounds
    learning_rate=0.05,       # step size
    depth=6,                  # tree depth
    eval_metric='Accuracy',   # evaluation metric
    random_seed=42,
    verbose=200,              # print progress every 200 iterations
    early_stopping_rounds=50  # stop if no improvement
)

model.fit(
    x_fit, y_fit,
    cat_features=cat_features,
    eval_set=(x_val, y_val),
    use_best_model=True
)

# The test set is touched only here, for the final evaluation.
y_pred = model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# String-typed columns are passed to CatBoost as native categorical features.
cat_features = x_train.select_dtypes(include=['object']).columns.tolist()

# Validation data for early stopping / best-iteration selection comes from the
# training set, so the test set stays untouched until the final evaluation.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

model = CatBoostClassifier(
    iterations=250,
    learning_rate=0.05,
    depth=6,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=50,            # log every 50 iterations instead of every one
    early_stopping_rounds=50,
    # NOTE(review): the list form relies on CatBoost's class ordering putting
    # the churn label second - confirm against model.classes_.
    class_weights=[1, 2]   # Give more weight to churn class
)

model.fit(
    x_fit, y_fit,
    cat_features=cat_features,
    eval_set=(x_val, y_val),
    use_best_model=True
)

# The test set is touched only here, for the final evaluation.
y_pred = model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
# SVC only accepts numeric input, but x_train still contains string-valued
# categorical columns. They are one-hot encoded and the numeric columns are
# standardised inside a pipeline, so both steps are learned from the training
# rows only.
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x_train.select_dtypes(exclude=['object']).columns.tolist()

# `svc` stays a plain SVC so it can still be handed to a later search.
svc = SVC(kernel='linear', random_state=42)
svc_pipeline = Pipeline([
    ("encode", ColumnTransformer([
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols),
    ])),
    ("svc", svc),
])
svc_pipeline.fit(x_train, y_train)
y_pred = svc_pipeline.predict(x_test)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# RandomForestClassifier cannot be fitted on string-valued columns, so the
# categorical features are one-hot encoded first; numeric columns pass through
# unchanged (trees do not need scaling).
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()

# `rfc` stays a plain RandomForestClassifier so a later search can reuse it.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_pipeline = Pipeline([
    ("encode", ColumnTransformer(
        [("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
        remainder="passthrough",
    )),
    ("rfc", rfc),
])
rfc_pipeline.fit(x_train, y_train)
y_pred_rfc = rfc_pipeline.predict(x_test)

print(classification_report(y_test, y_pred_rfc))
print(confusion_matrix(y_test, y_pred_rfc))


In [None]:
from xgboost import XGBClassifier
# XGBoost needs numeric features and integer class labels, neither of which
# the raw split provides: categorical columns are one-hot encoded in a
# pipeline and the target is integer-encoded for fitting.
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()
label_encoder = LabelEncoder().fit(y_train)

# `xgb` stays a plain XGBClassifier so a later search can reuse it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_pipeline = Pipeline([
    ("encode", ColumnTransformer(
        [("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
        remainder="passthrough",
    )),
    ("xgb", xgb),
])
xgb_pipeline.fit(x_train, label_encoder.transform(y_train))
# Map the integer predictions back to the original labels so they are
# directly comparable with y_test.
y_pred_xgb = label_encoder.inverse_transform(xgb_pipeline.predict(x_test))
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))


In [None]:
# Hyper-parameter values to try for the XGBoost classifier.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1, 5],
}

# The search is run on a pipeline so the one-hot encoding is re-fitted inside
# every CV fold; the raw frame with string columns cannot be fitted directly.
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()
xgb_search_pipeline = Pipeline([
    ("encode", ColumnTransformer(
        [("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
        remainder="passthrough",
    )),
    ("xgb", xgb),
])

# scoring='f1' scores label 1, so the target is integer-encoded. With
# LabelEncoder, 1 is the label that sorts last - presumably the churn class;
# verify with label_encoder.classes_.
label_encoder = LabelEncoder().fit(y_train)

# Parameter names are prefixed with the pipeline step name, so best_params_
# is reported with "xgb__" keys.
grid_search_xgb = GridSearchCV(estimator=xgb_search_pipeline,
                           param_grid={f"xgb__{name}": values
                                       for name, values in param_grid.items()},
                           scoring='f1',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

grid_search_xgb.fit(x_train, label_encoder.transform(y_train))

print("Best parameters:", grid_search_xgb.best_params_)
print("Best score:", grid_search_xgb.best_score_)

In [None]:
# Hyper-parameter values to try for the random forest.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# The search is run on a pipeline so the one-hot encoding is re-fitted inside
# every CV fold; the raw frame with string columns cannot be fitted directly.
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()
rfc_search_pipeline = Pipeline([
    ("encode", ColumnTransformer(
        [("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
        remainder="passthrough",
    )),
    ("rfc", rfc),
])

# scoring='f1' scores label 1, so the target is integer-encoded. With
# LabelEncoder, 1 is the label that sorts last - presumably the churn class;
# verify with label_encoder.classes_.
label_encoder = LabelEncoder().fit(y_train)

# Parameter names are prefixed with the pipeline step name, so best_params_
# is reported with "rfc__" keys.
grid_search_rfc = GridSearchCV(estimator=rfc_search_pipeline,
                           param_grid={f"rfc__{name}": values
                                       for name, values in param_grid.items()},
                           scoring='f1',
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

grid_search_rfc.fit(x_train, label_encoder.transform(y_train))
print("Best parameters:", grid_search_rfc.best_params_)
print("Best score:", grid_search_rfc.best_score_)

In [None]:
# Hyper-parameter values to try for the support vector classifier.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto']
}

# The search is run on a pipeline so encoding and scaling are re-fitted inside
# every CV fold; the raw frame with string columns cannot be fitted directly.
categorical_cols = x_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x_train.select_dtypes(exclude=['object']).columns.tolist()
svc_search_pipeline = Pipeline([
    ("encode", ColumnTransformer([
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols),
    ])),
    ("svc", svc),
])

# scoring='f1' scores label 1, so the target is integer-encoded. With
# LabelEncoder, 1 is the label that sorts last - presumably the churn class;
# verify with label_encoder.classes_.
label_encoder = LabelEncoder().fit(y_train)

# Parameter names are prefixed with the pipeline step name, so best_params_
# is reported with "svc__" keys.
grid_search_svc = GridSearchCV(estimator=svc_search_pipeline,
                           param_grid={f"svc__{name}": values
                                       for name, values in param_grid.items()},
                           scoring='f1',
                           cv=5,
                           n_jobs=-1,
                           verbose=2)

grid_search_svc.fit(x_train, label_encoder.transform(y_train))
print("Best parameters:", grid_search_svc.best_params_)
print("Best score:", grid_search_svc.best_score_)