In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import urllib.request
from zipfile import ZipFile
from io import BytesIO

In [6]:
# Address of the zipped "bank-additional" dataset on the UCI repository.
# The literal is split across two lines purely for readability; the adjacent
# string pieces are concatenated into a single URL.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00222/bank-additional.zip"
)

In [16]:
# Sanity-check the load by printing the first five rows of the frame.
# NOTE(review): `bank_data` is assigned by the download cell (In [7]), which
# appears *after* this cell in the notebook; on a fresh kernel this cell must
# be run after that one or it raises NameError.
print("First few rows of the dataset:")
print(bank_data.head(n=5))

First few rows of the dataset:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.c

In [7]:
# Fetch the zip archive into memory, open the full CSV member inside it and
# parse that member into `bank_data`, using ';' as the field separator.
# All three resources are managed by one `with` statement so they are closed
# in reverse order as soon as the frame has been read.
with urllib.request.urlopen(url) as response, \
        ZipFile(BytesIO(response.read())) as zip_ref, \
        zip_ref.open('bank-additional/bank-additional-full.csv') as file:
    bank_data = pd.read_csv(file, sep=';')

In [8]:
# Split the frame into predictors and target. The target is taken to be the
# 'y' column (subscription to a term deposit).
# NOTE(review): the UCI description of this dataset reportedly warns that the
# `duration` column is only known once the call outcome is known -- confirm
# whether it should also be dropped from X for a realistic model.
X = bank_data.drop(columns='y')

# LabelEncoder assigns integer codes in sorted label order: 'no' -> 0, 'yes' -> 1.
y = LabelEncoder().fit_transform(bank_data['y'])


In [9]:
# One-hot encode the categorical predictors. With drop_first=True the first
# level of each categorical column is dropped, so k levels become k-1
# indicator columns.
X = pd.get_dummies(data=X, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Create the base RandomForestClassifier that the grid search will clone and tune.
# random_state is pinned to the same seed used for train_test_split: without it
# every forest draws different bootstrap samples and feature subsets, so the
# selected hyperparameters and the reported metrics change from run to run.
rf_classifier = RandomForestClassifier(random_state=42)

# Candidate values for each tuned hyperparameter; the grid search tries every
# combination (3 * 3 * 3 * 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [12]:
# Use GridSearchCV for hyperparameter tuning: every combination in param_grid
# is scored by 5-fold cross-validated accuracy on the training split.
# The grid holds 81 candidates, i.e. 405 forest fits plus the final refit, so
# n_jobs=-1 spreads the fits over all available CPU cores. Parallelism only
# changes wall-clock time, not which candidates or folds are evaluated.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [13]:
# Pull out the parameter combination that the grid search selected as best
# and report it.
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}


In [14]:
# Take the estimator the grid search kept for the winning parameters and use
# it to label the held-out test rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [15]:
# Score the held-out predictions: overall accuracy first, then the per-class
# precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# One print call with a newline separator emits the heading and the report
# exactly as two consecutive print calls would.
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy: 0.9166059723233795

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      7303
           1       0.69      0.48      0.56       935

    accuracy                           0.92      8238
   macro avg       0.81      0.72      0.76      8238
weighted avg       0.91      0.92      0.91      8238

