In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Load the dataset
df = pd.read_csv('/Users/gracewang/dev/DSI_Team_Project_Bank_Marketing/data/processed/df_processed1.csv')  

# Separate features (X) and target (y)
X = df.drop(columns=['y'])  
y = df['y']


In [3]:
from imblearn.over_sampling import SMOTE

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Applying SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Now, X_train_resampled and y_train_resampled can be used for model training

In [4]:
# Initialize the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimators: Number of trees

# Train the model
rf.fit(X_train_resampled, y_train_resampled)


In [5]:
# Predict the target for the test set
y_pred = rf.predict(X_test)


In [6]:
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Generate the classification report
class_report = classification_report(y_test, y_pred)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

# Display results
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("\nAccuracy:")
print(accuracy)


Confusion Matrix:
[[11083   883]
 [  614   984]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     11966
           1       0.53      0.62      0.57      1598

    accuracy                           0.89     13564
   macro avg       0.74      0.77      0.75     13564
weighted avg       0.90      0.89      0.89     13564


Accuracy:
0.8896343261574756


In [7]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   2.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   2.8s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   2.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   5.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   5.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   5.3s
[CV] END max_depth=10, min_sample

In [8]:
# Assuming grid_search is the GridSearchCV object and it has been fit already
best_rf_model = grid_search.best_estimator_

# Get feature importances
importances = best_rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Display the feature importance
print("Feature Importance:")
print(feature_importance_df)



Feature Importance:
                Feature  Importance
3              duration    0.368049
4              campaign    0.096252
1               balance    0.061738
22          housing_yes    0.060333
0                   age    0.056474
2                   day    0.052809
25     poutcome_success    0.030501
5                 pdays    0.029562
6              previous    0.028048
35            month_may    0.027804
27             loan_yes    0.023144
26     poutcome_unknown    0.014424
32            month_jul    0.013509
28            month_aug    0.012976
7       job_blue-collar    0.012529
17      marital_married    0.011693
33            month_jun    0.011250
36            month_nov    0.009944
31            month_jan    0.008160
19  education_secondary    0.006436
15       job_technician    0.006287
30            month_feb    0.006150
18       marital_single    0.005486
20   education_tertiary    0.005451
13         job_services    0.005112
10       job_management    0.004521
23    co