# Customer's Retention Analysis
<p><img src="https://www.cleartouch.in/wp-content/uploads/2022/11/Customer-Churn.png" alt="Markdown">.</p>
<p><strong>Customer churn</strong> is the term used to describe when customers end their relationship or subscription with a company or service provider. Churn affects businesses’ revenue, growth, and customer loyalty. By studying churn and its related features, companies can create ways to keep existing customers, make them happier, and lower churn rates. They can also use predictive models to anticipate and prevent future churn, and take action to keep customers who are likely to leave.

Dataset used: <code>customer_churn_dataset-testing-master.csv</code> and <code>customer_churn_dataset-training-master.csv</code>, downloaded from Kaggle <a href="https://www.kaggle.com/datasets/muhammadshahidazeem/customer-churn-dataset">here</a>.</p>
<p>These datasets contain 12 columns. In detail, these are:
<br></p>
<ul>
    <li><b>CustomerID:</b> A unique identifier for each customer</li>
    <li><b>Age:</b> The age of the customer</li>
    <li><b>Gender:</b> Gender of the customer</li>
    <li><b>Tenure:</b> Duration in months for which a customer has been using the company's products or services</li>
    <li><b>Usage Frequency:</b> Number of times that the customer has used the company’s services in the last month</li>
    <li><b>Support Calls:</b> Number of calls that the customer has made to the customer support in the last month</li>
    <li><b>Payment Delay:</b> Number of days that the customer has delayed their payment in the last month</li>
    <li><b>Subscription Type:</b> Type of subscription the customer has chosen</li>
    <li><b>Contract Length:</b> Duration of the contract that the customer has signed with the company</li>
    <li><b>Total Spend:</b> Total amount of money the customer has spent on the company's products or services</li>
    <li><b>Last Interaction:</b> Number of days since the last interaction that the customer had with the company</li>
    <li><b>Churn:</b> Binary label indicating whether a customer has churned (1) or not (0)</li>
</ul>

# Importing Data

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

import pickle

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

In [None]:
# The provided train/test files are not really split in a balanced way, so both
# are loaded and stacked into a single frame here; the split is redone later.

csv_paths = [
    '/kaggle/input/customer-churn-dataset/customer_churn_dataset-training-master.csv',
    '/kaggle/input/customer-churn-dataset/customer_churn_dataset-testing-master.csv',
]

# ignore_index=True renumbers the stacked rows 0..n-1 in the same step.
df = pd.concat([pd.read_csv(path) for path in csv_paths], axis=0, ignore_index=True)
df

In [None]:
df.describe()

In [None]:
df.describe(include=[object]) 

In [None]:
df.info()

# Initial Dataset Preprocessing

In [None]:
# CustomerID is only a per-customer identifier, so it is removed.
df = df.drop(columns='CustomerID')

# Normalise the column names: lower-case, with spaces turned into underscores.
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df[df.isna().any(axis=1)]

In [None]:
df.dropna(inplace=True) # Removing the single missing value row

In [None]:
df.shape

In [None]:
# Columns that are converted to the int dtype (this includes the churn label).
descrete_col = ['age', 'tenure', 'usage_frequency', 'support_calls', 'payment_delay', 'last_interaction', 'churn']

# Cast all listed columns in one call instead of column by column.
df = df.astype(dict.fromkeys(descrete_col, int))
df

# Univariate Analysis

In [None]:
# Creating custom functions to visualize features

def make_histogram(df, target_feature, bins = 10, custom_ticks=None, unit='', additional=''):
    """Plot a histogram of one column and print its summary statistics.

    Args:
        df: DataFrame that contains ``target_feature``.
        target_feature: Name of the column to plot.
        bins: Number of histogram bins passed to ``plt.hist``.
        custom_ticks: Optional x-axis tick positions; left untouched when None.
        unit: Unit label appended to the printed statistics.
        additional: Extra text appended to the feature name in the title and
            in the printed summary.

    The printed summary reports mean ± standard deviation, median, minimum,
    maximum and skewness of the column.
    """
    values = df[target_feature]
    label = f"{target_feature.lower()}{additional}"

    plt.figure(figsize=(10, 5))
    plt.hist(values, bins=bins)
    if custom_ticks is not None:
        plt.xticks(custom_ticks)
    plt.ylabel('Count')
    plt.xlabel(target_feature)
    plt.title(f"Distribution of {label}:\n")
    plt.grid()
    plt.show()

    # The value after "±" is the spread of the data, i.e. the standard
    # deviation; the median is reported on its own line below.
    print(
        f"Distribution of {label}: {values.mean():.2f} ± {values.std():.2f} {unit}\n"
        f"Median: {values.median():.2f} {unit}\n"
        f"Minimum: {values.min()} {unit}\n"
        f"Maximum: {values.max()} {unit}\n"
        f"{values.skew():.3f} Skewness\n"
    )

def make_piechart(df, target_feature, additional=''):
    """Draw a pie chart of a column's value counts and print the counts.

    Args:
        df: DataFrame that contains ``target_feature``.
        target_feature: Name of the column whose distinct values are counted.
        additional: Extra text appended to the feature name in the title and
            in the printed summary.
    """
    counts = df[target_feature].value_counts()
    keys = list(counts.index)
    data = list(counts.values)

    palette_color = sns.color_palette('bright')
    plt.pie(data, labels=keys, colors=palette_color, autopct='%.0f%%')
    plt.title(f"Distribution of Customer's {target_feature}{additional}:")
    plt.show()

    # One "<count> <value>" line per distinct value, below a header line.
    summary_lines = [f"Distribution of customer's {target_feature.lower()}{additional}:"]
    summary_lines.extend(f"{count} {key}" for key, count in zip(keys, data))
    print("\n".join(summary_lines))

def make_barplot(df, target_feature, custom_ticks=None, unit='', additional=''):
    """Plot the frequency of each distinct value of a column and print statistics.

    Args:
        df: DataFrame that contains ``target_feature``.
        target_feature: Name of the column to plot.
        custom_ticks: Optional x-axis tick positions; left untouched when None.
        unit: Unit label appended to the printed statistics.
        additional: Extra text appended to the feature name in the axis label,
            the title and the printed summary.

    The printed summary reports mean ± standard deviation, median, minimum,
    maximum and skewness of the column.
    """
    values = df[target_feature]
    counts = values.value_counts()
    label = f"{target_feature.lower()}{additional}"

    plt.figure(figsize=(10, 5))
    plt.bar(list(counts.index), list(counts.values))
    if custom_ticks is not None:
        plt.xticks(custom_ticks)
    plt.xlabel(f'{target_feature.capitalize()}{additional}')
    plt.ylabel('Frequency')
    plt.title(f"Distribution of customer's {label}\n")
    plt.grid(axis='y')
    plt.show()

    # The value after "±" is the spread of the data, i.e. the standard
    # deviation; the median is reported on its own line below.
    print(
        f"Distribution of customer's {label}: {values.mean():.2f} ± {values.std():.2f} {unit}\n"
        f"Median: {values.median():.2f} {unit}\n"
        f"Minimum: {values.min()} {unit}\n"
        f"Maximum: {values.max()} {unit}\n\n"
        f"{values.skew():.3f} Skewness\n"
    )
    
def make_boxplot(df, feature):
    """Show a box plot of the ``feature`` column of ``df`` along the x-axis."""
    _, axes = plt.subplots(figsize=(10, 5))
    sns.boxplot(df, x=feature, ax=axes)
    axes.set_title(f"Boxplot of {feature}\n")
    axes.set_xlabel(feature)
    axes.set_ylabel("Values")
    plt.show()

## Categorical features

In [None]:
make_piechart(df, 'gender')

<div style='background-color: pink; color: black; padding: 3%'>There are more male customers in the company.</div>


In [None]:
make_piechart(df, 'subscription_type')

<div style='background-color: pink; color: black; padding: 3%'>There is a close balance of customers among the three subscription types: Standard, Premium, and Basic.</div>


In [None]:
make_piechart(df, 'contract_length')

<div style='background-color: pink; color: black; padding: 3%'>Annual and quarterly contracts have similar, and the highest, customer counts, followed by monthly contracts with the fewest customers.</div>

In [None]:
# Work on a copy that carries a readable churn label, then chart its distribution.
filtered = df.assign(
    churn_category=df['churn'].eq(1.0).map({True: 'Churn', False: 'Not Churned'})
)
make_piechart(filtered, 'churn_category')

<div style='background-color: pink; color: black; padding: 3%'>The dataset contains more entries of churned customers.</div>


## Discrete Data

In [None]:
make_barplot(df, 'age', custom_ticks=np.arange(0, 66, 5), additional=' (years)', unit='years')

In [None]:
make_boxplot(df, 'age')

<div style='background-color: pink; color: black; padding: 3%'>Most customers are aged 40-50, with age 50 being the most common. There is a very low number of customers aged 51 and above.</div>

In [None]:
make_barplot(df, 'tenure', custom_ticks=np.arange(0, 61, 3), additional=' (months)', unit='months')

In [None]:
make_boxplot(df, 'tenure')

<div style='background-color: pink; color: black; padding: 3%'>Customers tend to have longer tenures with customer departures being more common during the 1-5 month and 12-24 month tenure periods.</div>

In [None]:
make_barplot(df, 'usage_frequency', custom_ticks=np.arange(0, 31, 2), unit='times', additional=' (in a month)')

In [None]:
make_boxplot(df, 'usage_frequency')

<div style='background-color: pink; color: black; padding: 3%'>On average, customers tend to use company's services 16 times in a month, with a large variability in individual usage frequency suggesting diverse service consumption habits.</div>

In [None]:
make_barplot(df, 'support_calls', unit='calls', additional=' (in a month)')

In [None]:
make_boxplot(df, 'support_calls')

<div style='background-color: pink; color: black; padding: 3%'>On average, customers tend to make 3 support calls in a month. Customers tend to make 1 or 2 support calls per month, with the most common case being no support calls at all.</div>

In [None]:
make_barplot(df, 'payment_delay', custom_ticks=np.arange(0, 32, 3), unit='days', additional=' (in days)')

In [None]:
make_boxplot(df, 'payment_delay')

<div style='background-color: pink; color: black; padding: 3%'>On average, customers tend to delay their payments by 13 days, with a large variability in individual payment delay durations, suggesting undesirably diverse payment habits. Most customers pay within 20 days.</div>

In [None]:
make_barplot(df, 'last_interaction', custom_ticks=np.arange(0, 32, 3), unit='days', additional='')

In [None]:
make_boxplot(df, 'last_interaction')

<div style='background-color: pink; color: black; padding: 3%'>For most customers, the last interaction was 15 or fewer days ago, with an average of around 14 days.</div>

## Continuous Data

In [None]:
make_histogram(df, 'total_spend', bins=25, custom_ticks=np.arange(0, 1001, 100), unit='USD', additional=" on products or services")

In [None]:
make_boxplot(df, 'total_spend')

<div style='background-color: pink; color: black; padding: 3%'>
    On average, customers tend to spend around 631.62 USD, with large variability in customer spending. Most customers tend to spend more than 500 USD.
    <br><br>
    <b><span style='color: red'>If we look at the histogram, we can clearly see that the division between customers who spend more and those who spend less is around 500. We can use this to divide customers into high-value customers (total spending above 500) and low-value customers (total spending below 500).</span></b>
</div>

# Multivariate Analysis

## Do gender and churn rate have any relationship?

In [None]:
# Count customers per (gender, churn) pair; after unstack() each row is a
# gender and each column is a churn value.
gender_churn = df.groupby(['gender', 'churn']).size().unstack()

X = gender_churn.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (gender_churn.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title("Gender wise churn rate")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Yes, gender and churn rate have a relationship. <br><br>Female customers exhibit a slightly higher churn rate compared to male customers. The number of active (non-churned) male customers is nearly double that of female customers.</div>

## Are there any noticeable trends in payment delays among customers who churned?

In [None]:
# Count customers per (payment_delay, churn) pair; after unstack() each row is
# a payment-delay value and each column is a churn value.
filtered = df.groupby(['payment_delay', 'churn']).size().unstack()

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=90)
plt.xlabel("Customer payment delays in days")
plt.ylabel('Count')
plt.title("Churn rate based on payment delays")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Customers who are not churned tend to have higher payment delay days as compared with churned customers till day 20, after which churned customers have payment delays just over 10 times than that of not churned customers.</div>

## What is the distribution of usage frequency for churned vs. non-churned customers?

In [None]:
# Count customers per (usage_frequency, churn) pair; after unstack() each row
# is a usage-frequency value and each column is a churn value.
filtered = df.groupby(['usage_frequency', 'churn']).size().unstack()

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=90)
plt.xlabel("Customer's company services usage frequency")
plt.ylabel('Count')
plt.title("Churn rate based on usage frequency")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Churned customers tend to use the company's services significantly more than customers who have not churned.</div>

## Are customers with longer tenures less likely to churn?

In [None]:
def categorize_age(age):
    """Return the 10-month bucket label that contains ``age``.

    Despite its name, this notebook applies the function to the ``tenure``
    column, which is why the labels are expressed in months.

    Returns None when the value falls inside no bucket (for example NaN,
    negative numbers, values above 60, or fractions between two buckets).
    """
    buckets = (
        (0, 10, '0 to 10 months'),
        (11, 20, '11 to 20 months'),
        (21, 30, '21 to 30 months'),
        (31, 40, '31 to 40 months'),
        (41, 50, '41 to 50 months'),
        (51, 60, '51 to 60 months'),
    )
    for low, high, label in buckets:
        if low <= age <= high:
            return label
    return None  # nothing matched, e.g. NaN values

# Label every customer with a tenure bucket, then count customers per
# (bucket, churn) pair; after unstack() each row is a bucket and each column
# is a churn value.
filtered = (
    df.assign(tenure_segmentation=df['tenure'].apply(categorize_age))
    .groupby(['tenure_segmentation', 'churn'])
    .size()
    .unstack()
)

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=45)
plt.xlabel('Tenures')
plt.ylabel('Count')
plt.title("Churn rate based on tenures")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> No, customers with longer tenures are more likely to churn.</div>

## Do customers with more support calls tend to churn more?

In [None]:
# Count customers per (support_calls, churn) pair; after unstack() each row is
# a support-call count and each column is a churn value.
filtered = df.groupby(['support_calls', 'churn']).size().unstack()

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=45)
plt.xlabel('Customer Support Calls')
plt.ylabel('Count')
plt.title("Churn rate based on support calls made by customers")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Yes, customers with more support calls tend to churn more. <br><br>On the contrary, customers who are not churned tend to make much more 0 to 3 customer support calls than churned customers, after which churned customers make significantly more calls.</div>

## Does the subscription type influence the churn rate?

In [None]:
# Count customers per (subscription_type, churn) pair; after unstack() each
# row is a subscription type and each column is a churn value.
filtered = df.groupby(['subscription_type', 'churn']).size().unstack()

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=45)
plt.xlabel('Subscription Type')
plt.ylabel('Count')
plt.title("Churn rate based on subscription type")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> No, the subscription type does not influence customer churn rate. <br><br>Customers who are not churned tend to choose premium or standard subscription type slightly more than basic subscription.</div>

## Is there a pattern in contract length for customers who churned?

In [None]:
# Count customers per (contract_length, churn) pair; after unstack() each row
# is a contract length and each column is a churn value.
filtered = df.groupby(['contract_length', 'churn']).size().unstack()

X = filtered.index.tolist()
# Columns are taken by position: first column -> churn_0, second -> churn_1.
churn_0, churn_1 = (filtered.iloc[:, pos].tolist() for pos in (0, 1))
X_axis = np.arange(len(X))

# Two bars per group, drawn side by side around each tick position.
bar_width = 0.4
plt.bar(X_axis - bar_width / 2, churn_1, bar_width, label='Churn')
plt.bar(X_axis + bar_width / 2, churn_0, bar_width, label='Not Churn')

plt.xticks(X_axis, X, rotation=45)
plt.xlabel('Contract Length')
plt.ylabel('Count')
plt.title("Churn rate based on contract length")
plt.legend(loc='center right')
plt.grid(axis='y')
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Customers with quarterly contract lengths have the lowest churn rate, while those with monthly contract lengths exhibit the highest churn rate. Conversely, customers who do not churn overwhelmingly prefer annual and quarterly contracts over monthly contracts.</div>

## Is there a correlation between total spend and churn rate?

In [None]:
# Copy of the data with a readable churn label used as the hue of the plot.
filtered = df.assign(
    churn_segment=df['churn'].eq(1.0).map({True: 'Churn', False: 'Not Churned'})
)

# Stacked kernel-density estimate of total spend, one layer per churn segment.
sns.kdeplot(data=filtered, x="total_spend", hue="churn_segment", multiple="stack")
plt.show()

<div style='background-color: pink; color: black; padding: 3%'>=> Customers who churn tend to spend significantly more money than those who don't.<br><br>Both churned and non-churned customers share a common total spending distribution / total spending behavior.</div>

# Correlation

# Independent Features Correlation

In [None]:
independent_features_df = df.select_dtypes(include=['number']).copy().drop(columns=['churn'])

In [None]:
# Pairwise correlations between the numeric independent features.
corr_matrix = independent_features_df.corr()

# Boolean mask that is True on and above the diagonal, so that only the lower
# triangle of the matrix is drawn.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.2)
heatmap_axes = sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="coolwarm")
heatmap_axes.set_title("Independent Features Correlation Heatmap")
plt.show()

# Independent features correlation with prediction labels

In [None]:
# Correlation of each numeric feature with churn: take the 'churn' column of
# the correlation matrix and keep the rows up to and including 'last_interaction'.
numeric_corr = df.select_dtypes(include=['number']).corr()
correlation_data = numeric_corr['churn'].loc[:'last_interaction']

# Single-column heatmap of those correlations.
plt.figure(figsize=(5, 3))
sns.set(font_scale=1.2)
churn_axes = sns.heatmap(correlation_data.to_frame(), annot=True, cmap="coolwarm", cbar=True)
churn_axes.set_title("Correlation Heatmap between Independent Features and Churn")
plt.show()

# Duplicates

In [None]:
df[df.duplicated()]

**There are no duplicates**

# Null values

**Missing values were dropped before the EDA because there was only a single row with missing values, and removing it does not affect the results of the analysis.**

# Train-Test Split

In [None]:
y = df['churn']
X = df.drop(columns='churn')

In [None]:
# Hold out 20% of the rows for testing (fixed seed), and renumber every
# resulting piece from 0 so that each one has a clean 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=23)
)

In [None]:
X_train

In [None]:
X_test

# Validating Categorical Columns in Test Data

In [None]:
def validate_test_data_categorical_columns(train_df, test_df):
    """Report whether train and test frames agree on their categorical data.

    Both frames must have exactly the same set of categorical columns
    (``object`` or ``category`` dtype), and every such column must have the
    same set of non-missing unique values in both frames.

    The outcome is printed; nothing is returned. Checking stops at the first
    column whose values differ.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
    """
    categorical_kinds = ['object', 'category']
    train_df_categorical_columns = train_df.select_dtypes(include=categorical_kinds).columns.tolist()
    test_df_categorical_columns = test_df.select_dtypes(include=categorical_kinds).columns.tolist()

    # The column sets have to match exactly: with only a partial overlap, a
    # test-only column would raise a KeyError on the train frame below, and a
    # train-only column would never be checked.
    if set(train_df_categorical_columns) != set(test_df_categorical_columns):
        print('Train and test dataframes have different categorical columns')
        return

    for cat_col in test_df_categorical_columns:
        # Unique values of the column in each frame, ignoring missing values.
        train_col = {x for x in train_df[cat_col].unique().tolist() if not pd.isna(x)}
        test_col = {x for x in test_df[cat_col].unique().tolist() if not pd.isna(x)}

        if train_col != test_col:
            print(f'{cat_col} column has different unique values in train and test data:')
            print(f'Unique values in train data: {train_col}')
            print(f'Unique values in test data: {test_col}')
            return

    print('All categorical columns have consistent unique values in train and test data.')
    return
    
validate_test_data_categorical_columns(X_train, X_test)

# One Hot Encoding

In [None]:
encoder = OneHotEncoder(sparse_output=False)

encoder.fit(X_train[['gender', 'subscription_type', 'contract_length']])

In [None]:
feature_names = encoder.get_feature_names_out(['gender', 'subscription_type', 'contract_length'])
feature_names

In [None]:
# Encode the categorical columns of both splits with the already-fitted
# encoder, then wrap each encoded array in a DataFrame labelled with the
# encoder's output feature names.
categorical_columns = ['gender', 'subscription_type', 'contract_length']

train_categorical_one_encoded_data, test_categorical_one_encoded_data = (
    encoder.transform(split[categorical_columns]) for split in (X_train, X_test)
)

train_OHE_df, test_OHE_df = (
    pd.DataFrame(encoded, columns=feature_names)
    for encoded in (train_categorical_one_encoded_data, test_categorical_one_encoded_data)
)

In [None]:
train_OHE_df.head(3)

In [None]:
test_OHE_df.head(3)

In [None]:
X_train = X_train.drop(columns=['gender', 'subscription_type', 'contract_length'])
X_test = X_test.drop(columns=['gender', 'subscription_type', 'contract_length'])

In [None]:
X_train.head(3)

In [None]:
X_test.head(3)

In [None]:
X_train = pd.concat([X_train, train_OHE_df], axis=1)
X_test = pd.concat([X_test, test_OHE_df], axis=1)

In [None]:
X_train.head(3)

In [None]:
X_test.head(3)

In [None]:
# Saving the encoder to a file
with open('encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

In [None]:
# Example Usage

with open('encoder.pkl', 'rb') as file:
    loaded_encoder = pickle.load(file)
    
loaded_encoder.transform([['Male', 'Premium', 'Monthly']])

# PCA

In [None]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

In [None]:
# Printing the explained variance ratio
pca.explained_variance_ratio_

In [None]:
# Split the two PCA components into separate columns and pair them with the
# training labels in one frame.
first_component, second_component = X_pca[:, 0], X_pca[:, 1]

data = dict(Feature_1=first_component, Feature_2=second_component, Target=y_train)

pca_df = pd.DataFrame(data=data)
pca_df

In [None]:
# Scatter the two PCA components against each other, coloured by the target.
scatter_axes = sns.scatterplot(data=pca_df, x='Feature_1', y='Feature_2', hue='Target')

# Axis labels and title.
scatter_axes.set_xlabel('Component 1')
scatter_axes.set_ylabel('Component 2')
scatter_axes.set_title('PCA Components Scatter Plot with Hue')

# Legend and display.
scatter_axes.legend(title='Target')
plt.show()

# Baseline models

In [None]:
def print_evaluation_metrics(y_true, y_pred):
    """Print accuracy, precision, recall, confusion matrix and classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same samples.
    """
    # Compute all headline scores first, then print them together.
    headline_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    scores = [(title, metric(y_true, y_pred)) for title, metric in headline_metrics]
    for title, score in scores:
        print(f"{title}: {score:.2f}")
    print()

    print("Confusion Matrix:", confusion_matrix(y_true, y_pred), sep="\n")
    print()

    print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

In [None]:
def k_fold_cross_validation_with_metrics(classifier, X, y, k_folds=5):
    """Run stratified k-fold cross-validation and print the mean metrics.

    For every fold, ``classifier`` is fitted on the training part and scored
    on the held-out part with accuracy, precision and recall. The means of
    the three metrics across all folds are printed; nothing is returned.

    Args:
        classifier: Estimator providing ``fit`` and ``predict``.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series (rows are selected with ``.iloc``).
        k_folds: Number of folds.
    """
    # Shuffled stratified splitter with a fixed seed.
    splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    metric_functions = (accuracy_score, precision_score, recall_score)

    # One (accuracy, precision, recall) row per fold.
    fold_scores = []
    for fit_rows, eval_rows in splitter.split(X, y):
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        fold_truth = y.iloc[eval_rows]
        fold_prediction = classifier.predict(X.iloc[eval_rows])

        fold_scores.append([metric(fold_truth, fold_prediction) for metric in metric_functions])

    # Transpose the rows into one sequence per metric and average each.
    mean_accuracy, mean_precision, mean_recall = (
        np.mean(per_metric) for per_metric in zip(*fold_scores)
    )

    print("Mean Metrics Across Folds:")
    print(f"Mean Accuracy: {mean_accuracy:.2f}")
    print(f"Mean Precision: {mean_precision:.2f}")
    print(f"Mean Recall: {mean_recall:.2f}")

# Example usage:
# classifier = YourClassifier()  # Replace with your classifier of choice
# k_fold_cross_validation_with_metrics(classifier, X, y)


## Logistic Regression

In [None]:
model = LogisticRegression()

k_fold_cross_validation_with_metrics(model, X_train, y_train)

## Naive Bayes

In [None]:
# Gaussian Naive Bayes

gnb = GaussianNB()

k_fold_cross_validation_with_metrics(gnb, X_train, y_train)

In [None]:
#Multinomial Naive Bayes model

naive_bayes_model = MultinomialNB()

k_fold_cross_validation_with_metrics(naive_bayes_model, X_train, y_train)

## k-Nearest Neighbor

In [None]:
knn_classifier = KNeighborsClassifier()

k_fold_cross_validation_with_metrics(knn_classifier, X_train, y_train)

## Decision Trees

In [None]:
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

k_fold_cross_validation_with_metrics(decision_tree_classifier, X_train, y_train)

## Random Forest

In [None]:
rf_classifier = RandomForestClassifier(random_state=42)

k_fold_cross_validation_with_metrics(rf_classifier, X_train, y_train)

## Xgboost

In [None]:
xgb_classifier = xgb.XGBClassifier(random_state=42)

k_fold_cross_validation_with_metrics(xgb_classifier, X_train, y_train)

# Models Training

**Decision Trees, Random Forest and XGBoost all achieve a perfect score. I'll train them again, this time using the entire training dataset, and use the test dataset to validate the models. The focus will be on the best recall.**

In [None]:
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train, y_train)

In [None]:
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier.fit(X_train, y_train)

In [None]:
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Validating Models

In [None]:
# Testing decision trees

y_pred = decision_tree_classifier.predict(X_test)

print_evaluation_metrics(y_test, y_pred)

In [None]:
# Testing random forest

y_pred = random_forest_classifier.predict(X_test)

print_evaluation_metrics(y_test, y_pred)

In [None]:
# Testing xgboost

y_pred = xgb_classifier.predict(X_test)

print_evaluation_metrics(y_test, y_pred)

**Random Forest is the near-perfect model for the problem at hand, since my focus is on improving recall.**

In [None]:
with open("customer_churn_random_forest_model.pkl", 'wb') as model_file:
    pickle.dump(random_forest_classifier, model_file)

# Deployment

In [None]:
class CustomerChurnClassifier:
    """Predict churn for a single customer using a pickled model and encoder.

    Both the model and the one-hot encoder are loaded from pickle files when
    the object is created. ``predict`` validates its arguments, one-hot
    encodes the three categorical values and passes a single feature row to
    the model.
    """

    # Runtime types accepted for each annotated parameter type. NumPy scalars
    # are accepted alongside the built-in types, and a whole number is
    # accepted where a float is expected. bool is rejected separately.
    _ACCEPTED_TYPES = {
        int: (int, np.integer),
        float: (int, float, np.integer, np.floating),
        str: (str,),
    }

    def __init__(self, model_path, encoder_path):
        """Load the model and the encoder from the given pickle files.

        Args:
            model_path: Path of the pickled model.
            encoder_path: Path of the pickled encoder.
        """
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file, so only files from a trusted source must be loaded here.
        with open(model_path, 'rb') as file:
            self.model = pickle.load(file)

        with open(encoder_path, 'rb') as file:
            self.encoder = pickle.load(file)

    def predict(self, age: int, tenure: int, usage_frequency: int, support_calls: int, payment_delay: int, total_spend: float, last_interaction: int, gender: str, subscription_type: str, contract_length: str):
        """Return 'Will Churn' or "Won't Churn" for one customer.

        Raises:
            TypeError: If an argument does not have the expected type.
            ValueError: If gender, subscription_type or contract_length is not
                one of the accepted values.
        """
        # Checking input datatypes
        typed_arguments = {
            'age': (age, int),
            'tenure': (tenure, int),
            'usage_frequency': (usage_frequency, int),
            'support_calls': (support_calls, int),
            'payment_delay': (payment_delay, int),
            'total_spend': (total_spend, float),
            'last_interaction': (last_interaction, int),
            'gender': (gender, str),
            'subscription_type': (subscription_type, str),
            'contract_length': (contract_length, str),
        }
        for name, (value, expected_type) in typed_arguments.items():
            # bool is a subclass of int, so it has to be excluded explicitly.
            is_bool = isinstance(value, (bool, np.bool_))
            if is_bool or not isinstance(value, self._ACCEPTED_TYPES[expected_type]):
                raise TypeError(f"Error: Given {name} ({type(value).__name__}) is not of the expected type ({expected_type.__name__}).")

        # Checking gender, subscription_type, and contract_length values
        categorical_arguments = {
            'gender': (gender, ['Female', 'Male']),
            'subscription_type': (subscription_type, ['Standard', 'Basic', 'Premium']),
            'contract_length': (contract_length, ['Annual', 'Monthly', 'Quarterly']),
        }
        for name, (value, allowed_values) in categorical_arguments.items():
            if value not in allowed_values:
                raise ValueError(f"Error: Invalid {name} value '{value}'. Expected one of {allowed_values}.")

        # One Hot Encoding
        ohe_data = list(self.encoder.transform([[gender, subscription_type, contract_length]])[0])

        # Single feature row: the seven numeric values followed by the
        # one-hot encoded categorical values.
        numeric_data = [age, tenure, usage_frequency, support_calls, payment_delay, total_spend, last_interaction]
        to_predict_array = np.array(numeric_data + ohe_data).reshape((1, -1))

        prediction = self.model.predict(to_predict_array)[0]

        if prediction > 0.5:
            return 'Will Churn'
        return "Won't Churn"

In [None]:
customer_churn = CustomerChurnClassifier(
    model_path = 'customer_churn_random_forest_model.pkl', 
    encoder_path = 'encoder.pkl'
)

In [None]:
customer_churn.predict(
    age = 19, 
    tenure = 48, 
    usage_frequency = 7, 
    support_calls = 3, 
    payment_delay = 30, 
    total_spend = 787.0, 
    last_interaction = 29, 
    gender = 'Female', 
    subscription_type = 'Premium', 
    contract_length = 'Annual'
)

In [None]:
customer_churn.predict(
    age = 18, 
    tenure = 12, 
    usage_frequency = 3, 
    support_calls = 2, 
    payment_delay = 10, 
    total_spend = 525.8, 
    last_interaction = 5, 
    gender = 'Male', 
    subscription_type = 'Basic', 
    contract_length = 'Annual'
)