In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [3]:
# Load the dataset.
# The CSV location can be overridden with the BANK_MARKETING_DATA environment
# variable so the notebook is runnable on machines other than the one it was
# written on; when the variable is unset the original path is used unchanged.
DATA_PATH = os.environ.get(
    'BANK_MARKETING_DATA',
    '/Users/gracewang/dev/DSI_Team_Project_Bank_Marketing/data/processed/df_processed.csv',
)
df = pd.read_csv(DATA_PATH)

# Separate features (X) and target (y).
# The target column name is defined once so the two lookups cannot drift apart.
TARGET_COLUMN = 'y_yes'  # Assuming 'y_yes' is the target column
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]


In [4]:
# Hold out 20% of the rows for testing; the split is stratified on y and
# seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [5]:
# Build the Random Forest classifier with a fixed seed
rf = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    random_state=42,
)

# Fit the forest on the training split
rf.fit(X_train, y_train)


In [6]:
# Run the fitted forest on the held-out test features to get predicted labels
y_pred = rf.predict(X=X_test)


In [7]:
# Compare the predictions against the true test labels three ways:
# confusion matrix, per-class classification report, and overall accuracy.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print each result under its heading, in the order computed above
for heading, result in (
    ("Confusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
    ("\nAccuracy:", accuracy),
):
    print(heading)
    print(result)


Confusion Matrix:
[[7777  208]
 [ 641  417]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.67      0.39      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.80      0.68      0.72      9043
weighted avg       0.89      0.91      0.90      9043


Accuracy:
0.9061152272475949


In [8]:
# Pair every feature column with the importance score from the fitted forest
importances = rf.feature_importances_
feature_names = X.columns
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Rank the features from most to least important
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)


Feature Importance:
                Feature  Importance
3              duration    0.279500
1               balance    0.105684
0                   age    0.101885
2                   day    0.090871
25     poutcome_success    0.054275
5                 pdays    0.041734
4              campaign    0.040839
22          housing_yes    0.022018
6              previous    0.021405
34            month_mar    0.013542
19  education_secondary    0.012969
17      marital_married    0.011494
20   education_tertiary    0.011280
15       job_technician    0.011279
37            month_oct    0.011217
10       job_management    0.010883
35            month_may    0.010754
27             loan_yes    0.010350
28            month_aug    0.010298
33            month_jun    0.009688
18       marital_single    0.009553
32            month_jul    0.009547
7       job_blue-collar    0.009394
36            month_nov    0.008492
30            month_feb    0.008424
26     poutcome_unknown    0.008140
38      