In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # Changed from pickle to joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

#%% Data Loading
# Read the semicolon-separated CSV and expose the raw 'y' column under the
# more descriptive name 'Subscribed' (used as the target further down).
data = pd.read_csv(
    r"C:\Users\Yash\OneDrive\Desktop\Deployment_Streamkit\bank-full.csv",
    sep=';',
).rename(columns={'y': 'Subscribed'})

#%% Data Preparation
# Re-type every object-dtype column as a pandas categorical.
object_columns = data.select_dtypes(include=['object']).columns
data[object_columns] = data[object_columns].astype('category')

#%% Train-Test Split
# Separate the 'Subscribed' target from the predictors, then hold out 30%
# of the rows for testing. The fixed seed keeps the split reproducible.
y = data['Subscribed']
X = data.drop(columns='Subscribed')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

#%% Preprocessing Setup
# Unfitted transformers; each one is fitted on the training split later on
# and then persisted alongside the model.
education_encoder, target_encoder = LabelEncoder(), LabelEncoder()
ohe = OneHotEncoder(
    drop='first',             # drop one indicator column per feature
    sparse_output=False,      # return a dense array
    handle_unknown='ignore',  # do not raise on categories unseen during fit
)
scaler = MinMaxScaler()

#%% Feature Engineering
# Encode the target: labels are learned from the training split only and
# the same mapping is then applied to the test split.
y_train = target_encoder.fit_transform(y_train)
y_test = target_encoder.transform(y_test)

# Label-encode 'education' in place, again fitting on the training split only.
X_train['education'] = education_encoder.fit_transform(X_train['education'])
X_test['education'] = education_encoder.transform(X_test['education'])

# Every categorical/object column of the original feature frame except
# 'education' (handled above) is one-hot encoded.
categorical_columns = (
    X.select_dtypes(include=['category', 'object'])
    .columns
    .drop('education')
)

# Fit the one-hot encoder on the training split, then reuse it for the test split.
train_encoded = ohe.fit_transform(X_train[categorical_columns])
test_encoded = ohe.transform(X_test[categorical_columns])
ohe_feature_names = ohe.get_feature_names_out(categorical_columns)

# Wrap the encoded arrays in DataFrames that keep the original row index,
# so they line up with the remaining columns when concatenated.
X_train_cat = pd.DataFrame(
    train_encoded,
    columns=ohe_feature_names,
    index=X_train.index,
)
X_test_cat = pd.DataFrame(
    test_encoded,
    columns=ohe_feature_names,
    index=X_test.index,
)

# Swap the raw categorical columns for their one-hot counterparts.
X_train = pd.concat(
    [X_train.drop(columns=categorical_columns), X_train_cat],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=categorical_columns), X_test_cat],
    axis=1,
)

#%% Feature Scaling
# Min-max scale every numeric column; the scaler is fitted on the training
# split and the fitted scaler is then applied to the test split.
numerical_columns = X_train.select_dtypes(include=['number']).columns
scaled_train = scaler.fit_transform(X_train[numerical_columns])
scaled_test = scaler.transform(X_test[numerical_columns])
X_train[numerical_columns] = scaled_train
X_test[numerical_columns] = scaled_test

#%% Model Training
# Random forest with a fixed seed; fit() returns the estimator itself, so
# construction and training can be chained.
model = RandomForestClassifier(random_state=25).fit(X_train, y_train)

#%% Model Evaluation
# Report per-class precision/recall/F1 on the held-out test split.
print("Model Evaluation:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

#%% Save Artifacts
# Persist the model together with every fitted transformer, written to the
# current working directory in the order listed.
artifacts = {
    'model.joblib': model,
    'education_encoder.joblib': education_encoder,
    'target_encoder.joblib': target_encoder,
    'onehot_encoder.joblib': ohe,
    'scaler.joblib': scaler,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("All artifacts saved successfully!")

Model Evaluation:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     11990
           1       0.64      0.40      0.49      1574

    accuracy                           0.90     13564
   macro avg       0.78      0.69      0.72     13564
weighted avg       0.89      0.90      0.89     13564

All artifacts saved successfully!
