In [None]:
%pip install pandas
# importing needed libraries
import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
#from seaborn_qqplot import pplot

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import norm, chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.impute import  SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, classification_report, confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [None]:
df_train = pd.read_csv("df_train.csv")

df_train.shape

In [None]:
# boolean columns changed
df_train.head()

In [None]:
#SeniorCitizen values changed
df_train.tail()

In [None]:
smart_features = ["OnlineSecurity", "OnlineBackup", "DeviceProtection",
                    "TechSupport", "StreamingTV", "StreamingMovies"]

In [None]:
#smart features imputed
df_train[smart_features].isna().any()


In [None]:
# row with missing Churn value dropped
df_train[df_train.Churn.isna()]


In [None]:
# data cleaned
df_train.isna().any().sum()

In [None]:
df_train.dtypes


In [None]:
# unexpected value in TotalCharges column
df_train[df_train["TotalCharges"] == " "]

In [None]:
# Blank strings (" ") cannot be parsed as numbers, so they are coerced to NaN
# and then replaced by the median of the successfully parsed values.
total_charges_numeric = pd.to_numeric(df_train["TotalCharges"], errors='coerce')
df_train["TotalCharges"] = total_charges_numeric.fillna(total_charges_numeric.median())

In [None]:
cat_columns = df_train.select_dtypes(exclude=['float', 'int'])
print(len(cat_columns.columns))
cat_columns.head()

In [None]:
cat_columns.tail()

In [None]:
cols = list(cat_columns.columns)
cols.remove('Churn')

In [None]:

# Set up a grid with enough panels for every categorical variable.
# The row count is derived from len(cols) so the loop can never index past the grid.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(cols) / n_grid_cols)))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, sharex=False, sharey=False,
                         figsize=(16, 4 * n_grid_rows))
fig.suptitle('Barplots of Categorical Variables', fontsize=16)

# Flatten the axes array so it can be walked in step with the variable names.
axes = np.atleast_1d(axes).flatten()

# One count plot per categorical variable, split by the Churn label.
for ax, cat_var in zip(axes, cols):
    sns.countplot(data=cat_columns, x=cat_var, ax=ax, hue='Churn')
    ax.legend().set_visible(False)  # per-panel legends are replaced by one shared legend below
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title(cat_var)
    # Rotate the tick labels of THIS panel (plt.xticks would only touch the last axes).
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

# Hide any panels that did not receive a plot.
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)

# Add a single legend for all the countplots outside the grid
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(1, 1))

plt.tight_layout();

In [None]:
# Variables sorted by the outcome of their chi-square test against the target.
significant_variables = []
insignificant_variables = []

def chi_square_test(var, data=None, target='Churn', alpha=0.05):
    """Run a chi-square test of independence between ``var`` and ``target``.

    The result is printed and ``var`` is recorded in the module-level list
    ``significant_variables`` or ``insignificant_variables``. Each variable is
    recorded at most once and in exactly one of the two lists, so re-running
    the test (e.g. re-executing the cell that loops over the columns) does not
    accumulate duplicates.

    Parameters
    ----------
    var : str
        Name of the categorical column to test.
    data : pandas.DataFrame, optional
        Frame holding ``var`` and ``target``. Defaults to the notebook-level
        ``df_train``.
    target : str, default 'Churn'
        Name of the categorical target column.
    alpha : float, default 0.05
        Significance level; p-values below it count as significant.

    Returns
    -------
    None
    """
    if data is None:
        data = df_train

    # Create a contingency table (cross-tabulation) of the two categorical variables
    contingency_table = pd.crosstab(data[var], data[target])

    # Perform the Chi-Square test of independence
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

    # Print the results
    print(f'{var}')
    print(f"Chi-Square Test of Independence Results:")
    print("Chi-Square Statistic:", chi2_stat)
    print("p-value:", p_value)
    print("Degrees of Freedom:", dof)
    print("Expected Frequencies Table:")
    print(pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns))

    # Interpret the results
    if p_value < alpha:
        print(f"There is a SIGNIFICANT RELATIONSHIP between {var} and {target}.")
        chosen, other = significant_variables, insignificant_variables
    else:
        print(f"There is NO SIGNIFICANT RELATIONSHIP between {var} and {target}.")
        chosen, other = insignificant_variables, significant_variables

    # Keep the bookkeeping idempotent: one entry per variable, in one list only.
    if var in other:
        other.remove(var)
    if var not in chosen:
        chosen.append(var)

In [None]:
for col in cols:
    chi_square_test(var=col)
    print("****" * 20)

In [None]:
print('Insignificant variables: ', insignificant_variables)
print()
print('Significant Variables: ')
significant_variables

In [None]:
churn_data = df_train.copy()
churn_data.to_csv('churn_data.csv', index=False)

In [None]:
churn_data.columns

In [None]:
#drop duplicates
df_train = df_train.drop_duplicates()
df_train.duplicated().sum()

In [None]:
#drop gender and phone service.
# NOTE(review): presumably these are the columns the chi-square cell reported as
# insignificant -- confirm against `insignificant_variables`.
# errors='ignore' keeps this cell re-runnable: df_train is reassigned here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns=['gender', 'PhoneService'], errors='ignore')
df_train.columns

In [None]:
# Separate the predictors (X) from the target column (y).
# No train/eval split happens in this cell; that is done later with train_test_split.
X = df_train.drop('Churn', axis=1)
y = df_train['Churn']

In [None]:
print(X.shape, y.shape)
#print(X_train.shape, X_eval.shape)

In [None]:
#extract numerical and categorical features
cat_cols = list(X.select_dtypes(include='object').columns)
num_cols = list(X.select_dtypes(exclude='object').columns)

In [None]:
#reorder columns
X = X[num_cols + cat_cols]
print(X.columns)

In [None]:
#define preprocessing for categorical features
cat_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore') )
])


In [None]:
X.head()

In [None]:
# we don't do any numerical transformations to prevent data leakage before splitting.
# Numeric columns are passed through untouched; categorical columns are imputed and one-hot encoded.
# sparse_threshold=0 forces a dense array: the result is wrapped in pd.DataFrame(..., columns=...)
# further down, which does not work if ColumnTransformer decides to return a sparse matrix.
cat_preprocessor = ColumnTransformer(
    transformers=[
        ('Num_transformer', 'passthrough', num_cols),
        ('cat_transformer', cat_transformer, cat_cols),
    ],
    sparse_threshold=0,
)


In [None]:
X_prepared = cat_preprocessor.fit_transform(X)


In [None]:
#obtain transformed cat_cols columns
transformed_columns = (num_cols +
   list(cat_preprocessor.named_transformers_['cat_transformer'].named_steps['encoder'].get_feature_names_out(cat_cols) ))
len(transformed_columns)

In [None]:
#convert to dataframe
X_prepared = pd.DataFrame(X_prepared, columns=transformed_columns)
X_prepared.head()

In [None]:
# handling class imbalance
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=99)
X_balanced, y_balanced= smote.fit_resample(X_prepared, y)
print(X_balanced.shape, y_balanced.shape)

In [None]:
#split dataset
#set stratify=y to ensure minority class is well represented in both sets.
X_train, X_eval, y_train, y_eval = train_test_split(X_balanced, y_balanced, test_size=0.2,
                                                    stratify=y_balanced, random_state=99)

In [None]:
print('shape of trainset: ', X_train.shape, y_train.shape)
print('shape of evaluation set: ', X_eval.shape, y_eval.shape)

In [None]:
#build numerical transformer pipeline after splitting data

num_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
  ])

# Work on copies so the column assignments below never write into a slice of
# the frame the sets were split from.
X_train = X_train.copy()
X_eval = X_eval.copy()

# Fit the imputer/scaler on the training set only, then apply the SAME fitted
# transformation to the evaluation set. Without the second line the models
# would be trained on scaled features but evaluated on raw ones.
X_train[num_cols] = num_transformer.fit_transform(X_train[num_cols])
X_eval[num_cols] = num_transformer.transform(X_eval[num_cols])
X_train.head()

In [None]:
# X_train is already column-addressable at this point (the previous cell assigns X_train[num_cols]),
# so this re-wrap only re-selects the columns in `transformed_columns` order.
X_train = pd.DataFrame(X_train, columns=transformed_columns)
X_train.head()

In [None]:
X_train.columns

In [None]:
#model based feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# A small random forest is used only to rank the features by importance.
clf = RandomForestClassifier(n_estimators=10, random_state=99)
clf.fit(X_train, y_train)

print(clf.feature_importances_)

# prefit=True: reuse the forest fitted above instead of fitting a new one.
selector = SelectFromModel(clf, prefit=True)

# Select by boolean mask so the reduced train set stays a DataFrame with column
# names, matching the reduced evaluation/test sets (which are selected by name).
# selector.transform() would return a bare ndarray and the downstream models
# would be fitted without feature names but asked to predict with them.
X_train_reducedFeatures = X_train.loc[:, selector.get_support()]
print('feature reduced train set shape:', X_train_reducedFeatures.shape)


In [None]:
#extract the important features

# Column labels of the training frame, in the order the selector saw them.
feature_names = X_train.columns

# Boolean mask: True where the selector kept the feature.
selected_features_mask = selector.get_support()

# Pair every label with its mask entry and keep the ones flagged True.
selected_feature_names = [
    name for name, keep in zip(feature_names, selected_features_mask) if keep
]
print("Selected Features:", selected_feature_names)

In [None]:
#apply feature reduction model transformation
X_eval_reducedFeatures = X_eval[selected_feature_names]
X_eval_reducedFeatures.head()

In [None]:
print('shape of evaluation set:', X_eval.shape)
print('shape of reduced features evaluation set:', X_eval_reducedFeatures.shape)

In [None]:
# Candidate classifiers to compare. Every estimator gets the same seed used
# elsewhere in this notebook (99) so the comparison is reproducible across runs.
models = {
    "Linear Support vector": LinearSVC(random_state=99),
    "Logistic Regression": LogisticRegression(random_state=99),
    "Gradient Boosting": GradientBoostingClassifier(random_state=99),
    "AdaBoost": AdaBoostClassifier(random_state=99),
    "Decision Tree": DecisionTreeClassifier(random_state=99),
    "Random Forest": RandomForestClassifier(random_state=99)
}
list(models.keys())

In [None]:

def _print_classification_scores(y_true, y_pred, pos_label):
    """Print accuracy, precision, recall and F1 for one set of predictions."""
    print(f'Accuracy Score: {accuracy_score(y_true=y_true, y_pred=y_pred): .4f}')
    print(f'Precision Score: {precision_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label): .4f}')
    print(f'Recall Score: {recall_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label): .4f}')
    print(f'f1 Score: {f1_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label): .4f}')


def models_trainer(models_dict, X_train, X_eval, y_train, y_eval, pos_label='Yes'):
    """Fit every model in ``models_dict`` and print its train/evaluation metrics.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train : array-like
        Features used to fit the models.
    X_eval : array-like
        Features used to evaluate the fitted models.
    y_train : array-like
        Target labels for ``X_train``.
    y_eval : array-like
        Target labels for ``X_eval``.
    pos_label : default 'Yes'
        Label treated as the positive class for precision, recall and F1.

    Returns
    -------
    None
        Accuracy, precision, recall and F1 are printed for the train set and
        the evaluation set of each model.
    """
    for model_name, model in models_dict.items():
        # train model
        model.fit(X_train, y_train)

        # make predictions
        y_train_pred = model.predict(X_train)
        y_eval_pred = model.predict(X_eval)

        print('Model: ', model_name)
        print("Performance on train set:")
        _print_classification_scores(y_train, y_train_pred, pos_label)
        print('---'*30)

        print("Performance on evaluation set:")
        _print_classification_scores(y_eval, y_eval_pred, pos_label)
        print('***' * 30)
        print('\n')

In [None]:
# evaluation models performance without feature reduction
models_trainer(models_dict=models, X_train=X_train, X_eval=X_eval, y_train=y_train, y_eval=y_eval)

In [None]:
#train model with reduced features
models_trainer(models_dict=models, X_train=X_train_reducedFeatures, X_eval=X_eval_reducedFeatures, y_train=y_train, y_eval=y_eval)

In [None]:

# Search spaces for RandomizedSearchCV.
# 'auto' is intentionally absent from max_features: it was an alias of 'sqrt' for
# both classifiers and is rejected by current scikit-learn releases, so every
# sampled candidate using it would fail to fit (silently, since warnings are off).

gb_param_grid = {
    'n_estimators': np.arange(50, 300, 10), # Number of boosting stages to be used
    'learning_rate': [0.01, 0.1, 0.2, 0.3], # Learning rate shrinks the contribution of each estimator
    'max_depth': np.arange(3, 12, 1), # Maximum depth of the individual estimators
    'min_samples_split': np.arange(2, 11, 1), # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 11, 1), # Minimum number of samples required to be at a leaf node
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0], # Fraction of samples used for fitting the trees
    'max_features': ['sqrt', 'log2', None], # Number of features to consider for the best split
    'random_state': [99] # Random state for reproducibility
}



rf_param_grid = {
    'n_estimators': np.arange(200, 300, 50),  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': np.arange(3, 12, 1),  # Maximum depth of the tree
    'min_samples_split': np.arange(2, 11, 1),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 11, 1),  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider for the best split
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
    'random_state': [99]  # Random state for reproducibility
}

# (display name, unfitted estimator, search space) triples consumed by the search loop.
rand_search_models = [('Gradient Boosting', GradientBoostingClassifier(), gb_param_grid),
                    ('Random Forest', RandomForestClassifier(), rf_param_grid)]

In [None]:
# Results of the randomized search, keyed by model display name.
model_best_params = {}
best_models = {}
best_score = {}

for name, model, params in rand_search_models:
  # random_state fixes which 100 candidates are sampled, so the reported
  # "best" hyperparameters are the same on every Restart & Run All.
  random_search = RandomizedSearchCV(estimator=model,
                                      param_distributions=params,
                                      n_iter=100,
                                      cv=3,
                                      n_jobs=-1,
                                      random_state=99)

  random_search.fit(X_train_reducedFeatures, y_train)
  model_best_params[name] = random_search.best_params_
  best_models[name] = random_search.best_estimator_
  best_score[name] = random_search.best_score_

# Report the winning configuration and its cross-validated score per model.
for name in model_best_params:
  print(f'Best hyperparameters for {name}:')
  print(model_best_params[name])
  print(f'Best Score for {name}:')
  print(best_score[name])
  print('***'*30)
  print('\n')



In [None]:
best_models

In [None]:
optimized_gb_classifier = best_models['Gradient Boosting']
optimized_rf_classifier = best_models['Random Forest']

In [None]:
#train best model on full trainset
optimized_gb_classifier.fit(X_train_reducedFeatures, y_train)

#predict
y_pred_gb = optimized_gb_classifier.predict(X_eval_reducedFeatures)

#evaluate
report = classification_report(y_true=y_eval, y_pred=y_pred_gb)
print(report)



In [None]:
#train best model on full trainset
optimized_rf_classifier.fit(X_train_reducedFeatures, y_train)

#predict
y_pred_rf = optimized_rf_classifier.predict(X_eval_reducedFeatures)

#evaluate
report = classification_report(y_true=y_eval, y_pred=y_pred_rf)
print(report)



In [None]:
# save best classifier
classifier = optimized_gb_classifier
classifier

In [None]:
reference_features = list(X.columns)
target = 'Churn'

In [None]:
# Everything needed to reproduce a prediction from raw input.
# The fitted preprocessors are bundled alongside the model: the classifier alone
# cannot be used without the exact encoder and scaler it was trained with.
components = {'reference_features': reference_features,
              'target': target,
              'transformed_columns': transformed_columns,
              'numerical_columns': num_cols,
              'selected_features': selected_feature_names,
              'cat_preprocessor': cat_preprocessor,
              'num_transformer': num_transformer,
              'classification_model': classifier}

In [None]:
X_train.columns

In [None]:
import pickle

# Model that gets exported.
model = optimized_gb_classifier

# ColumnTransformer fits a clone of the pipeline it is given, so the fitted
# categorical pipeline has to be taken from cat_preprocessor itself.
fitted_cat_transformer = cat_preprocessor.named_transformers_['cat_transformer']

# Make sure the target directory exists before opening files inside it.
os.makedirs('src', exist_ok=True)

# Destination path -> object to pickle. Previously 'src/cat_transformer.pkl'
# was written with num_transformer by mistake.
artifacts = {
    'src/optimized_gb_classifier.pkl': model,
    'src/cat_preprocessor.pkl': cat_preprocessor,
    'src/num_transformer.pkl': num_transformer,
    'src/cat_transformer.pkl': fitted_cat_transformer,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)


In [None]:
!dir export/

In [None]:
# Export directory; created up front so the later open(..., 'wb') calls and
# shell redirections into it do not fail on a fresh checkout.
destination = os.path.join('.', 'export')
os.makedirs(destination, exist_ok=True)
destination

In [None]:

# Objects needed by the Gradio app.
# 'cat_transformer' is taken from the fitted ColumnTransformer: the notebook-level
# `cat_transformer` is only the unfitted template (ColumnTransformer fits a clone).
gradio_toolkit = {
    'model': optimized_gb_classifier,
    'cat_preprocessor': cat_preprocessor,
    'num_transformer': num_transformer,
    'cat_transformer': cat_preprocessor.named_transformers_['cat_transformer']
}

In [None]:
with open('gradio_toolkit.pkl', 'wb') as toolkit_file:
    pickle.dump(gradio_toolkit, toolkit_file)


In [None]:
#save components
with open(os.path.join(destination, 'ml.pkl'), 'wb') as file:
  pickle.dump(components, file)

In [None]:
# save packages in working environment
!pip freeze > export/requirements.txt

In [None]:
# convert export and it's content to a zip archive
!zip -r export.zip export/

In [None]:
#loaded model and it's components
with open(os.path.join(destination, 'ml.pkl'), 'rb') as file:
  loaded_components = pickle.load(file)



In [None]:
loaded_components.keys()

In [None]:
# unpack trained model and it's components
reference_features = loaded_components['reference_features']
target = loaded_components['target']
transformed_columns = loaded_components['transformed_columns']
numerical_columns = loaded_components['numerical_columns']
selected_features = loaded_components['selected_features']
classifier = loaded_components['classification_model']

In [None]:
#load test set
testset = pd.read_excel('Telco-churn-second-2000.xlsx')
testset.head()

In [None]:
#check if data meets expection
testset.info()

In [None]:
# clean and reformat testset to meet expectation
# The labels must match the training categories exactly: the encoder ignores
# unknown categories, so a mis-cased 'yes' would silently encode as all zeros.
# NOTE(review): assumes the training data spells SeniorCitizen as 'Yes'/'No' -- confirm against df_train.csv.
testset['SeniorCitizen'] = testset['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
# Unparseable TotalCharges entries become NaN here.
testset['TotalCharges'] = pd.to_numeric(testset['TotalCharges'], errors='coerce')

In [None]:
#now this looks as expected
testset.info()

In [None]:
#extract columns of interest
X_test = testset[reference_features]
print(X_test.shape)
X_test.head()

In [None]:
#preprocess testset

#imputing and encoding cat cols
X_test_prepared = cat_preprocessor.transform(X_test)
print(X_test_prepared.shape)
print(type(X_test_prepared))


In [None]:
#convert to dataframe using transformed columns
X_test_prepared = pd.DataFrame(X_test_prepared, columns=transformed_columns)
X_test_prepared.head()

In [None]:
#apply numeric transformer
# Use the column list unpacked from the saved components rather than the
# training-time global, so inference relies on what was actually exported.
X_test_prepared[numerical_columns] = num_transformer.transform(X_test_prepared[numerical_columns])
X_test_prepared.head()



In [None]:
#select important features
# `selected_features` comes from the loaded components, keeping the inference
# path independent of variables left over from the training cells.
X_test_prepared_reducedFeatures = X_test_prepared[selected_features]
X_test_prepared_reducedFeatures.head()

In [None]:
## testset predictions
y_pred_testset = classifier.predict(X_test_prepared_reducedFeatures)
y_pred_df = pd.DataFrame(y_pred_testset, columns=['Churn'])
y_pred_df.head()

In [None]:
#append results to test dataframe
results = pd.concat([testset, y_pred_df], axis=1)
results.head()