In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score,mean_squared_error,roc_curve,roc_auc_score,classification_report,r2_score,confusion_matrix,recall_score,precision_score

df=pd.read_csv("Churn_Modelling.csv")


In [2]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [3]:
df=df.drop(columns=["RowNumber", "CustomerId", "Surname"])


In [4]:
#Encoding in Geography Column
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
encoded=encoder.fit_transform(df[['Geography']]).toarray()
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())
df=pd.concat([df,encoder_df],axis=1)

In [5]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

In [6]:
df=df.drop(["Geography"],axis=1)

In [7]:
# Create a new feature: Balance-to-Salary Ratio
df['Balance_Salary_Ratio'] = df['Balance'] / df['EstimatedSalary']

In [8]:
df.columns


Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited',
       'Geography_France', 'Geography_Germany', 'Geography_Spain',
       'Balance_Salary_Ratio'],
      dtype='object')

In [9]:
df.drop(['Balance','EstimatedSalary'],axis=1,inplace=True)

In [10]:
df.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'Exited', 'Geography_France', 'Geography_Germany',
       'Geography_Spain', 'Balance_Salary_Ratio'],
      dtype='object')

In [13]:
from sklearn.model_selection import train_test_split
x=df.drop('Exited',axis=1)
y=df['Exited']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [14]:
x_train.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,NumOfProducts,HasCrCard,IsActiveMember,Geography_France,Geography_Germany,Geography_Spain,Balance_Salary_Ratio
5866,735,0,53,8,2,0,1,1.0,0.0,0.0,0.726558
1938,518,1,38,3,1,0,1,1.0,0.0,0.0,0.560414
4194,572,0,54,9,1,1,1,0.0,1.0,0.0,0.497428
6332,619,0,35,4,1,1,1,1.0,0.0,0.0,4.39855
1,608,0,41,1,1,0,1,0.0,0.0,1.0,0.744677


In [15]:
x_train.isnull().sum()
#x_train.info()

CreditScore             0
Gender                  0
Age                     0
Tenure                  0
NumOfProducts           0
HasCrCard               0
IsActiveMember          0
Geography_France        0
Geography_Germany       0
Geography_Spain         0
Balance_Salary_Ratio    0
dtype: int64

In [16]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [17]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,NumOfProducts,HasCrCard,IsActiveMember,Exited,Geography_France,Geography_Germany,Geography_Spain,Balance_Salary_Ratio
0,619,0,42,2,1,1,1,1,1.0,0.0,0.0,0.0
1,608,0,41,1,1,0,1,0,0.0,0.0,1.0,0.744677
2,502,0,42,8,3,1,0,1,1.0,0.0,0.0,1.401375
3,699,0,39,1,2,0,0,0,1.0,0.0,0.0,0.0
4,850,0,43,2,1,1,1,0,0.0,0.0,1.0,1.587055


In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [20]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

# Define parameter grids for each model
param_grids = {
    "LogisticRegression": {
        "penalty": ["l1", "l2", "elasticnet", None],
        "C": np.logspace(-4, 4, 20),
        "solver": ["liblinear", "saga"]
    },
    "SVC": {
        "C": np.logspace(-2, 2, 10),
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "gamma": ["scale", "auto"]
    },
    "KNeighborsClassifier": {
        "n_neighbors": range(1, 31),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"]
    },
    "DecisionTreeClassifier": {
        "criterion": ["gini", "entropy"],
        "max_depth": range(1, 20),
        "min_samples_split": range(2, 10),
        "min_samples_leaf": range(1, 10)
    },
    "RandomForestClassifier": {
        "n_estimators": [50, 100, 200, 500],
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": range(2, 10),
        "min_samples_leaf": range(1, 10)
    },
    "GradientBoostingClassifier": {
        "n_estimators": [50, 100, 200, 500],
        "learning_rate": [0.01, 0.1, 0.2, 0.3],
        "max_depth": range(1, 10),
        "subsample": [0.5, 0.7, 1.0]
    },
    "AdaBoostClassifier": {
        "n_estimators": [50, 100, 200, 500],
        "learning_rate": [0.01, 0.1, 0.5, 1.0]
    },
    "XGBClassifier": {
        "n_estimators": [50, 100, 200, 500],
        "learning_rate": [0.01, 0.1, 0.2, 0.3],
        "max_depth": range(1, 10),
        "subsample": [0.5, 0.7, 1.0]
    }
}

# Function for hyperparameter tuning
def hyperparameter_tuning(model_name, model, x_train, y_train, search_type="random", n_iter=50):
    """
    Perform hyperparameter tuning using RandomizedSearchCV or GridSearchCV.
    Parameters:
    - model_name: str, name of the model
    - model: sklearn estimator, the model to tune
    - x_train: np.array, training features
    - y_train: np.array, training labels
    - search_type: str, "random" or "grid" for RandomizedSearchCV or GridSearchCV
    - n_iter: int, number of iterations for RandomizedSearchCV (ignored for GridSearchCV)

    Returns:
    - best_model: sklearn estimator, the model with the best hyperparameters
    - best_params: dict, the best hyperparameters
    """
    param_grid = param_grids.get(model_name, None)
    if param_grid is None:
        raise ValueError(f"No parameter grid found for {model_name}.")

    # Define scorer
    scorer = make_scorer(f1_score, average="binary")

    # Initialize search
    if search_type == "random":
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=n_iter,
            scoring=scorer,
            cv=StratifiedKFold(n_splits=5),
            verbose=1,
            n_jobs=-1,
            random_state=42
        )
    elif search_type == "grid":
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring=scorer,
            cv=StratifiedKFold(n_splits=5),
            verbose=1,
            n_jobs=-1
        )
    else:
        raise ValueError("search_type must be 'random' or 'grid'.")

    # Fit search
    search.fit(x_train, y_train)

    # Return the best model and parameters
    return search, search.best_params_

# Example usage (replace with your dataset and models):
# from sklearn.ensemble import RandomForestClassifier
# best_model, best_params = hyperparameter_tuning("RandomForestClassifier", RandomForestClassifier(), x_train, y_train, search_type="random")
# print("Best Parameters:", best_params)


In [None]:
4,15701354,Boni,699,France,Female,39,1,0,2,0,0,93826.63,0

In [32]:

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score

# Define models
models = {
    "LogisticRegression": LogisticRegression(),
    "SVC": svm.SVC(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "XGBClassifier": XGBClassifier(eval_metric="logloss")
}

# Store results
results = []

# Iterate through models
for model_name, model in models.items():
    print(f"Tuning hyperparameters for {model_name}...")
    try:
        search, best_params = hyperparameter_tuning(model_name, model, x_train, y_train, search_type="random", n_iter=50)
        best_model = search.best_estimator_

        # Evaluate on test data
        y_pred = best_model.predict(x_test)
        y_prob = best_model.predict_proba(x_test)[:, 1]  # Get probability estimates for positive class

        f1 = f1_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)

        # Save results
        results.append({
            "model": model_name,
            "best_params": best_params,
            "f1_score": f1,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "roc_auc": roc_auc
        })

        print(f"{model_name} - Best F1 Score: {f1:.4f}\n")
    except Exception as e:
        print("Error")

# Sort and display the results by F1 score
def print_results():
    sorted_results = sorted(results, key=lambda x: x["f1_score"], reverse=True)
    print("\n\nModel Tuning Results:")
    for res in sorted_results:
        print(f"Model: {res['model']}, F1 Score: {res['f1_score']:.4f}")

#print_results()
print(results)


Tuning hyperparameters for LogisticRegression...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


80 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\anmol\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\anmol\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\anmol\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dua

LogisticRegression - Best F1 Score: 0.4884

Tuning hyperparameters for SVC...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Error
Tuning hyperparameters for KNeighborsClassifier...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
KNeighborsClassifier - Best F1 Score: 0.5189

Tuning hyperparameters for DecisionTreeClassifier...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
DecisionTreeClassifier - Best F1 Score: 0.5281

Tuning hyperparameters for RandomForestClassifier...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomForestClassifier - Best F1 Score: 0.5702

Tuning hyperparameters for GradientBoostingClassifier...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
GradientBoostingClassifier - Best F1 Score: 0.5431

Tuning hyperparameters for AdaBoostClassifier...
Fitting 5 folds for each of 16 candidates, totalling 80 fits




AdaBoostClassifier - Best F1 Score: 0.5798

Tuning hyperparameters for XGBClassifier...
Error
[{'model': 'LogisticRegression', 'best_params': {'solver': 'liblinear', 'penalty': 'l2', 'C': np.float64(0.0001)}, 'f1_score': 0.4884157795867251, 'accuracy': 0.7276666666666667, 'precision': 0.39553752535496955, 'recall': 0.6382978723404256, 'roc_auc': np.float64(0.7602411215068517)}, {'model': 'KNeighborsClassifier', 'best_params': {'weights': 'distance', 'n_neighbors': 4, 'metric': 'manhattan'}, 'f1_score': 0.518910741301059, 'accuracy': 0.788, 'precision': 0.48241912798874825, 'recall': 0.5613747954173486, 'roc_auc': np.float64(0.7715045568237947)}, {'model': 'DecisionTreeClassifier', 'best_params': {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 12, 'criterion': 'entropy'}, 'f1_score': 0.5280575539568345, 'accuracy': 0.7813333333333333, 'precision': 0.4711168164313222, 'recall': 0.6006546644844517, 'roc_auc': np.float64(0.76176200383783)}, {'model': 'RandomForestClassifier', 



In [21]:
results

NameError: name 'results' is not defined

In [23]:
from sklearn.ensemble import RandomForestClassifier
gb=RandomForestClassifier(n_estimators= 500,
   min_samples_split= 3,
   min_samples_leaf= 1,
   max_depth= None,
   criterion= 'entropy')
gb.fit(x_train,y_train)


In [24]:
y_pred=gb.predict(x_test)

In [25]:
mae=mean_squared_error(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
matrix=confusion_matrix(y_test,y_pred)
rocauc=roc_auc_score(y_test,y_pred)
clasrep=classification_report(y_test,y_pred)
score=r2_score(y_test,y_pred)
pres=precision_score(y_test,y_pred)
print("Mean absolute error", mae)
print("accuracy:",acc)
print("Recall:",recall)
print('confusion_matrix',matrix)
print('roc_auc_score:',rocauc)
print('classification_report:',clasrep)
print("R2 Score", score)
print('precison score:',pres)

Mean absolute error 0.164
accuracy: 0.836
Recall: 0.5677799607072691
confusion_matrix [[1801  190]
 [ 220  289]]
roc_auc_score: 0.7361752641306311
classification_report:               precision    recall  f1-score   support

           0       0.89      0.90      0.90      1991
           1       0.60      0.57      0.59       509

    accuracy                           0.84      2500
   macro avg       0.75      0.74      0.74      2500
weighted avg       0.83      0.84      0.83      2500

R2 Score -0.011427652333338978
precison score: 0.6033402922755741


In [26]:
mae=mean_squared_error(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
matrix=confusion_matrix(y_test,y_pred)
rocauc=roc_auc_score(y_test,y_pred)
clasrep=classification_report(y_test,y_pred)
score=r2_score(y_test,y_pred)
pres=precision_score(y_test,y_pred)
print("Mean absolute error", mae)
print("accuracy:",acc)
print("Recall:",recall)
print('confusion_matrix',matrix)
print('roc_auc_score:',rocauc)
print('classification_report:',clasrep)
print("R2 Score", score)
print('precison score:',pres)

Mean absolute error 0.164
accuracy: 0.836
Recall: 0.5677799607072691
confusion_matrix [[1801  190]
 [ 220  289]]
roc_auc_score: 0.7361752641306311
classification_report:               precision    recall  f1-score   support

           0       0.89      0.90      0.90      1991
           1       0.60      0.57      0.59       509

    accuracy                           0.84      2500
   macro avg       0.75      0.74      0.74      2500
weighted avg       0.83      0.84      0.83      2500

R2 Score -0.011427652333338978
precison score: 0.6033402922755741


In [27]:
import pickle
with open('churn_model.pkl', 'wb') as f:
    pickle.dump({"model": gb, "scaler": scaler}, f, protocol=pickle.HIGHEST_PROTOCOL)

