In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Directory that holds train.csv and test.csv. Set the BANK_DATA_DIR
# environment variable to point somewhere else; the default keeps the
# location this notebook was originally written against.
DATA_DIR = Path(os.environ.get("BANK_DATA_DIR", r"D:\Projects\archive"))

# Raw frames: later cells derive encoded copies from these.
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")


In [9]:
# Per-column count of null entries in the training frame
missing_train = train_df.isna().sum()
print("Missing values in training data:")
print(missing_train)


# One-hot encode every object-dtype column, dropping the first level of each
categorical_cols = train_df.select_dtypes(include=["object"]).columns
train_df_encoded = pd.get_dummies(
    train_df,
    columns=categorical_cols,
    drop_first=True,
)


Missing values in training data:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


# Scale numerical features

In [4]:
from sklearn.preprocessing import StandardScaler

# Numeric columns are taken from the raw training frame rather than from the
# encoded one: get_dummies leaves them untouched, so the selection is the
# same, but the raw frame is never overwritten by this cell.
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns

# Fit on the raw training values only. Fitting on `train_df_encoded` would
# make this cell non-idempotent: on a second run the scaler would be fitted
# on already-standardised data and the test-set transform that reuses
# `scaler` would no longer match the training features.
scaler = StandardScaler().fit(train_df[numerical_cols])
train_df_encoded[numerical_cols] = scaler.transform(train_df[numerical_cols])


In [5]:
# Per-column count of null entries in the test frame
missing_test = test_df.isnull().sum()
print("Missing values in testing data:")
print(missing_test)


# Encode categorical variables, keeping every level. drop_first is
# deliberately NOT used here: it would drop the first level *present in the
# test data*, which is not necessarily the level dropped from the training
# data, and the reindex below would then zero-fill a real category.
test_df_encoded = pd.get_dummies(test_df, columns=categorical_cols)

# Align test data columns with training data: levels the training encoding
# dropped (or never saw) are discarded, levels missing from the test data
# are added as all-zero columns.
test_df_encoded = test_df_encoded.reindex(columns=train_df_encoded.columns, fill_value=0)

# Scale numerical features (using the same scaler as training data)
test_df_encoded[numerical_cols] = scaler.transform(test_df_encoded[numerical_cols])


Missing values in testing data:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


# 3. Model Building and Evaluation

In [6]:
# Training split: `y_yes` is the target, every other column is a feature
y_train = train_df_encoded["y_yes"]
X_train = train_df_encoded.drop(columns=["y_yes"])

# Test split, built the same way from the aligned test frame
y_test = test_df_encoded["y_yes"]
X_test = test_df_encoded.drop(columns=["y_yes"])


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Gradient-boosting dependency, imported before first use
import xgboost as xgb

# Fixed seed for the estimators that use randomness, so that a fresh
# Restart & Run All reproduces the same metrics.
RANDOM_STATE = 42


def fit_and_report(label, model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    notebook namespace.

    Parameters
    ----------
    label : str
        Text placed in front of " Classification Report:" in the output.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    The predictions returned by ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{label} Classification Report:")
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    return predictions


# Logistic Regression
log_reg = LogisticRegression()
y_pred_log_reg = fit_and_report("Logistic Regression", log_reg)

# Random Forest (seeded: tree construction is randomised)
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)
y_pred_rf = fit_and_report("Random Forest", rf_clf)

# Support Vector Machine (SVM)
svm_clf = SVC()
y_pred_svm = fit_and_report("SVM", svm_clf)

# Gradient Boosting (e.g., XGBoost)
xgb_clf = xgb.XGBClassifier(random_state=RANDOM_STATE)
y_pred_xgb = fit_and_report("XGBoost", xgb_clf)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.98      0.95      4000
        True       0.65      0.33      0.44       521

    accuracy                           0.90      4521
   macro avg       0.78      0.65      0.69      4521
weighted avg       0.89      0.90      0.89      4521

Confusion Matrix:
[[3905   95]
 [ 348  173]]
Random Forest Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      4000
        True       1.00      1.00      1.00       521

    accuracy                           1.00      4521
   macro avg       1.00      1.00      1.00      4521
weighted avg       1.00      1.00      1.00      4521

Confusion Matrix:
[[4000    0]
 [   0  521]]
SVM Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.98      0.95      4000
        True       0.77      0.40      0

In [8]:
# Training accuracy
train_accuracy_rf = rf_clf.score(X_train, y_train)
print(f"Random Forest Training Accuracy: {train_accuracy_rf:.2f}")

# Test accuracy
test_accuracy_rf = rf_clf.score(X_test, y_test)
print(f"Random Forest Test Accuracy: {test_accuracy_rf:.2f}")

# Leakage diagnostic: a perfect test score is only meaningful if the test
# rows were not seen during training. Count how many raw test rows also
# occur, column for column, in the raw training data. The training frame is
# de-duplicated first so each test row matches at most once.
overlap = test_df.merge(train_df.drop_duplicates(), how="left", indicator=True)
n_test_rows_in_train = int((overlap["_merge"] == "both").sum())
print(f"Test rows also present in training data: {n_test_rows_in_train} of {len(test_df)}")


Random Forest Training Accuracy: 1.00
Random Forest Test Accuracy: 1.00
