In [322]:
# Import libraries
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,roc_auc_score
)
import warnings
warnings.filterwarnings('ignore')

In [323]:
def load_and_explore_dataset(filepath):
    """Read the dataset from a CSV file and print a quick exploratory summary.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, unmodified.

    Raises
    ------
    KeyError
        If the file has no ``Exited`` column (needed for the target summary).
    """
    print('=' * 60)
    print("LOAD AND EXPLORE DATASET")
    print('=' * 60)

    df = pd.read_csv(filepath)

    print('shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing values:')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive Stats:')
    print(df.describe())
    print('\nDataset info:')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    # '\n' (not '\E', which is an invalid escape that prints a literal backslash).
    print('\nExited Distribution:')
    print(df['Exited'].value_counts())
    print('\nExited Percentage Distribution:')
    print(df['Exited'].value_counts(normalize=True) * 100)

    return df

In [324]:
def identify_features(df):
    """Return the column groups used to build the model.

    Parameters
    ----------
    df : pandas.DataFrame
        The loaded dataset. It is not inspected here; the groups are fixed
        lists of column names.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(numerical_features, categorical_features, features_to_drop)``.
    """
    print('=' * 60)
    print("IDETIFYING  OF FEATURES ")
    print('=' * 60)
    # Identifier-like columns that are removed before modelling.
    features_to_drop = [
        "CustomerId","Surname"
    ]

    numerical_features = [
        "CreditScore", "Age", "Tenure", 'EstimatedSalary',
        'Balance', "NumOfProducts", "ServiceRating"
    ]

    # IsActiveMember must be listed here: any column that is in neither list
    # is silently discarded by the ColumnTransformer (its default is
    # remainder='drop'), even though the saved feature list still requires it
    # at prediction time.
    categorical_features = [
        "Geography", "Gender", "HasCrCard", "IsActiveMember"
    ]
    print('\nFeatueres to drop', features_to_drop)
    print('\nNumerical Features', numerical_features)
    print('\nCategorical Features', categorical_features)
    return numerical_features, categorical_features, features_to_drop

In [325]:
def prepare_data(df, numerical_features, categorical_features, features_to_drop):
    """Separate the feature matrix from the ``Exited`` target.

    ``numerical_features`` and ``categorical_features`` are accepted but not
    used here; every column other than ``features_to_drop`` and ``Exited``
    ends up in the feature matrix.

    Returns
    -------
    tuple
        ``(X, y, feature_column_names)`` where the last item is a plain list
        of the columns of ``X`` in order.
    """
    rule = '=' * 60
    print(rule)
    print("PREPARE  DATASET ")
    print(rule)

    # Work on a copy without the unwanted columns; the input frame is untouched.
    modelling_frame = df.drop(columns=features_to_drop)

    # Target first, then everything that is left becomes the features.
    y = modelling_frame['Exited']
    X = modelling_frame.drop(columns=['Exited'])
    column_names = list(X.columns)

    summary = (
        ("\nFeatures Shape", X.shape),
        ("\nTarget Shape", y.shape),
        ("\nFeature columns list", column_names),
    )
    for label, value in summary:
        print(label, value)

    return X, y, column_names

In [326]:
def create_preprocessing_pipeline(numerical_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing step.

    Numerical columns go through a ``StandardScaler``; categorical columns go
    through a ``OneHotEncoder`` configured with ``drop='first'``,
    ``sparse_output=False`` and ``handle_unknown='ignore'``.

    No ``remainder`` is passed to the ``ColumnTransformer``, so its default
    handling applies to any column not named in either list.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    rule = '=' * 60
    print(rule)
    print("CREATING PREPROCESSING  PIPELINE ")
    print(rule)

    # Branch for numerical columns: scaling only.
    scaling_steps = [('scaler', StandardScaler())]

    # Branch for categorical columns: one-hot encoding only.
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    encoding_steps = [('onehot', encoder)]

    # Route each column group to its branch.
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=scaling_steps), numerical_features),
        ('cat', Pipeline(steps=encoding_steps), categorical_features),
    ])

In [327]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into stratified train and test partitions.

    Parameters
    ----------
    X, y
        Feature matrix and target; ``y`` is also used as the stratification key.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    rule = '=' * 60
    print(rule)
    print("SPLITTING DATA INTO TRAIN/TEST")
    print(rule)

    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = partitions

    # Report partition sizes and the class balance in each partition.
    report = (
        ('\nTrain set size', X_train.shape[0]),
        ('\nTest set size', X_test.shape[0]),
        ('\nTrain churn Distribution', y_train.value_counts()),
        ('\nTest Churn Distribution', y_test.value_counts()),
    )
    for label, value in report:
        print(label, value)

    return X_train, X_test, y_train, y_test

In [328]:
def create_model_pipeline(preprocessor):
    """Chain the preprocessor with a logistic-regression classifier.

    Parameters
    ----------
    preprocessor
        Transformer placed in the ``'preprocessor'`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``'preprocessor'`` and ``'classifier'``.
    """
    rule = '=' * 60
    print(rule)
    print("CREATING MODEL PIPELINE")
    print(rule)

    # Classifier settings are fixed here: balanced class weights, lbfgs solver.
    classifier = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced',
        solver='lbfgs',
    )
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    model_pipeline = Pipeline(steps)

    print('\nModel pipeline created succesfully')

    return model_pipeline

In [329]:
def train_model(model_pipeline, X_train, y_train):
    """Fit the pipeline and print its ten largest coefficients by magnitude.

    The coefficient report is best-effort: if the fitted pipeline does not
    expose the expected steps or attributes, a message is printed and the
    fitted pipeline is still returned.

    Returns
    -------
    The same ``model_pipeline`` object, after ``fit`` has been called on it.
    """
    rule = '=' * 60
    print(rule)
    print("TRAIN MODEL")
    print(rule)

    model_pipeline.fit(X_train, y_train)
    print('\nModel trained successfully')

    try:
        steps = model_pipeline.named_steps
        column_step = steps['preprocessor']

        # Column lists are read positionally: first transformer is numerical,
        # second is categorical.
        numeric_cols = column_step.transformers_[0][2]
        categorical_cols = column_step.transformers_[1][2]

        # Expanded names produced by the one-hot encoder.
        encoder = column_step.named_transformers_['cat'].named_steps['onehot']
        encoded_names = encoder.get_feature_names_out(categorical_cols)

        feature_names = [*numeric_cols, *encoded_names]

        # First (only) row of coefficients from the classifier.
        weights = steps['classifier'].coef_[0]
        print('\nTop 10 most importance fatures(by coefficent magnitude)')
        ranking = pd.DataFrame({
            'feature_columns': feature_names,
            'coefficient': weights,
        })
        ranking = ranking.sort_values('coefficient', key=abs, ascending=False)
        print(ranking.head(10).to_string(index=False))
    except Exception as err:
        print(f"Can not extract features name {err}")
    return model_pipeline

In [330]:
def evaluate_model(model_pipeline, X_train, X_test, y_train, y_test):
    """Score a fitted pipeline on the train and test sets and cross-validate it.

    Prints accuracy, ROC AUC and classification reports for both partitions,
    then runs 5-fold cross-validation (ROC AUC) on the training partition.

    Returns
    -------
    dict
        Keys: ``y_train_acc_score``, ``y_test_acc_Score``,
        ``y_train_roc_score``, ``y_test_roc_score``, ``y_train_pred``,
        ``y_test_pred``, ``y_train_proba``, ``y_test_proba``, ``cv_score``.
    """
    print('=' * 60)
    print("EVALUATING MODEL")
    print('=' * 60)

    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # Second predict_proba column is taken as the positive-class score for ROC AUC.
    y_train_proba = model_pipeline.predict_proba(X_train)[:,1]
    y_test_proba = model_pipeline.predict_proba(X_test)[:,1]

    # Metrics: always (y_true, y_pred), matching every other metric call here.
    y_train_acc_score = accuracy_score(y_train, y_train_pred)
    y_test_acc_Score = accuracy_score(y_test, y_test_pred)

    y_train_roc_score = roc_auc_score(y_train, y_train_proba)
    y_test_roc_score = roc_auc_score(y_test, y_test_proba)

    print("\nMETRICS EVALUATION")
    print(f"\nTrain accuracy score:, {float(y_train_acc_score):.4f}")
    print(f"\nTest accuracy score:, {float(y_test_acc_Score):.4f}")
    print(f"\nTrain ROC Score:, {float(y_train_roc_score):.4f}")
    print(f"\nTest ROC Score:, {float(y_test_roc_score):.4f}")

    print("\n CLASSIFICATION OF TRAIN SIZE")
    print("\n Classification report\n", classification_report(y_train, y_train_pred, target_names = ['Retained','Churned']))
    print("\n CLASSIFICATION OF TEST SIZE")
    print("\n Classification report\n", classification_report(y_test, y_test_pred, target_names = ['Retained','Churned']))

    # Cross-validation refits the pipeline on each fold of the training data.
    cv_score = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
    print('\nCross Validation (5-folds)')
    # The scorer is 'roc_auc', so label the values accordingly (they are not R2).
    print(f'ROC AUC scores per fold: {cv_score}')
    # Same precision as the standard deviation so the two can be compared.
    print(f'Cross Validation mean: {cv_score.mean():.4f}')
    print(f"Cross Validation STD : {cv_score.std():.4f}")

    metrics = {
        "y_train_acc_score": y_train_acc_score,
        "y_test_acc_Score": y_test_acc_Score,
        "y_train_roc_score": y_train_roc_score,
        "y_test_roc_score": y_test_roc_score,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        'y_train_proba':y_train_proba,
        "y_test_proba": y_test_proba,
        "cv_score":cv_score,
    }
    return metrics

In [331]:
def save_model_artifact(model_pipeline, feature_columns):
    """Persist the pipeline and its feature-column list with joblib.

    Writes ``churned_model_pipeline.pkl`` and ``feature_columns.pkl`` using
    relative paths, printing a confirmation after each file.
    """
    rule = '=' * 60
    print(rule)
    print("SAVING MODEL ARTIFACT")
    print(rule)

    # (object, destination file, confirmation message), written in this order.
    artifacts = (
        (model_pipeline, 'churned_model_pipeline.pkl', 'churned model pipeline saved successfully'),
        (feature_columns, 'feature_columns.pkl', 'feature columns saved successfully'),
    )
    for payload, destination, confirmation in artifacts:
        joblib.dump(payload, destination)
        print(confirmation)

    print(rule)
    print(" MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [342]:
def predict_churn (customer_data, model_pipeline=None, feature_columns=None):
    """Predict churn for a single customer.

    Parameters
    ----------
    customer_data : dict
        Mapping of feature name to value for one customer.
    model_pipeline : optional
        Fitted model exposing ``predict`` and ``predict_proba``. When omitted,
        it is loaded from ``churned_model_pipeline.pkl``.
    feature_columns : optional
        Ordered feature names the model expects. When omitted, they are loaded
        from ``feature_columns.pkl``.

    Returns
    -------
    dict
        ``prediction`` ('Churned' or 'Retained'), ``churn_probability``,
        ``retained_probability`` and ``risk_level`` ('High' above 0.7 churn
        probability, 'Medium' above 0.4, otherwise 'Low').

    Raises
    ------
    ValueError
        If any expected feature is absent from ``customer_data``.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts from a trusted location.
    if model_pipeline is None:
        model_pipeline = joblib.load("churned_model_pipeline.pkl")
    if feature_columns is None:
        feature_columns = joblib.load("feature_columns.pkl")

    customer_df = pd.DataFrame([customer_data])

    # Report every missing feature at once rather than only the first.
    missing = [feature for feature in feature_columns if feature not in customer_df.columns]
    if missing:
        raise ValueError(f"Missing required feature(s): {', '.join(map(str, missing))}")
    # Reorder to the column order used at training time.
    customer_df = customer_df[list(feature_columns)]

    prediction = model_pipeline.predict(customer_df)[0]
    probability = model_pipeline.predict_proba(customer_df)[0]

    # predict_proba columns follow the sorted class labels, so for a 0/1
    # target index 0 is "retained" (class 0) and index 1 is "churned" (class 1).
    retained_probability = float(probability[0])
    churn_probability = float(probability[1])

    # Risk is graded on the churn probability alone.
    if churn_probability > 0.7:
        risk_level = 'High'
    elif churn_probability > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    result = {
        'prediction': 'Churned' if prediction == 1 else "Retained",
        'churn_probability': churn_probability,
        'retained_probability': retained_probability,
        'risk_level': risk_level,
    }

    return result

In [343]:
def test_predict():
    """Run ``predict_churn`` on one hard-coded sample customer and print the result."""
    # Sample record; keys are the raw feature names expected by predict_churn.
    customer_data = dict(
        CreditScore=650,
        Geography='France',
        Gender='Male',
        Age=35,
        Tenure=5,
        Balance=10000.0,
        NumOfProducts=2,
        HasCrCard='Yes',
        IsActiveMember='Yes',
        EstimatedSalary=80000.0,
        ServiceRating=4,
    )

    result = predict_churn(customer_data)

    # (printed label, key in the result dict), in output order.
    report_fields = (
        ("Prediction", 'prediction'),
        ("Churned Probability", 'churn_probability'),
        ("Retained Probability", 'retained_probability'),
        ("Risk Level", 'risk_level'),
    )
    for label, key in report_fields:
        print(label, result[key])

In [344]:
def main():
    """Run the full workflow: load, prepare, train, evaluate, save, then a sample prediction.

    Reads ``cleaned_bank_churned_dataset.csv`` from the working directory.
    """
    # Load the data and decide which columns play which role.
    df = load_and_explore_dataset('cleaned_bank_churned_dataset.csv')
    numerical_features, categorical_features, features_to_drop = identify_features(df)

    # Build features/target, the preprocessing step and the train/test split.
    X, y, feature_columns = prepare_data(
        df, numerical_features, categorical_features, features_to_drop
    )
    preprocessor = create_preprocessing_pipeline(numerical_features, categorical_features)
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Fit and evaluate; the returned metrics dict is not used further here.
    model_pipeline = train_model(create_model_pipeline(preprocessor), X_train, y_train)
    evaluate_model(model_pipeline, X_train, X_test, y_train, y_test)

    # Persist the artifacts, then exercise them with the sample prediction.
    save_model_artifact(model_pipeline, feature_columns)
    test_predict()

In [345]:
# Entry point: run the whole workflow when this cell/module is executed
# as the main program (which is the case inside a notebook kernel).
if __name__ == "__main__" :
    main()

LOAD AND EXPLORE DATASET
shape of the dataset:
(9997, 14)

Check for missing values:
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
EstimatedSalary    0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
Exited             0
ServiceRating      0
dtype: int64

First five rows:
   CustomerId   Surname  CreditScore Geography  Gender   Age  Tenure  \
0    15634602  Hargrave          619    France  Female  42.0       2   
1    15647311      Hill          608     Spain  Female  41.0       1   
2    15619304      Onio          502    France  Female  42.0       8   
3    15701354      Boni          699    France  Female  39.0       1   
4    15737888  Mitchell          850     Spain  Female  43.0       2   

   EstimatedSalary    Balance  NumOfProducts HasCrCard IsActiveMember  Exited  \
0        101348.88       0.00              1       Yes            Yes       