# Task 2
customers house loan prediction

In [34]:
import os
import pandas as pd


In [35]:
original_data = pd.read_csv('data/customers_loan.csv')
del original_data['Unnamed: 0']
original_data.head()

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


In [36]:
original_data.dtypes

loan_status       object
Principal          int64
terms              int64
effective_date    object
due_date          object
age                int64
education         object
Gender            object
dtype: object

In [37]:
original_data["education"].value_counts()
# so apply lable encoding to education column

education
High School or Below    151
college                 149
Bechalor                 44
Master or Above           2
Name: count, dtype: int64

In [38]:
original_data["Gender"].value_counts()
# so that apply binay encoding to gender



Gender
male      294
female     52
Name: count, dtype: int64

In [39]:
# check null values
original_data.isnull().sum()

loan_status       0
Principal         0
terms             0
effective_date    0
due_date          0
age               0
education         0
Gender            0
dtype: int64

In [40]:
original_data["loan_status"].value_counts()
# so there is a class imbalance in the target variable

loan_status
PAIDOFF       260
COLLECTION     86
Name: count, dtype: int64

### Feature engineering

In [41]:
updated_df = original_data.copy()
# convert dates
updated_df['effective_date'] = pd.to_datetime(updated_df['effective_date'],format='%m/%d/%Y')
updated_df['due_date'] = pd.to_datetime(updated_df['due_date'],format='%m/%d/%Y')
updated_df.dtypes

loan_status               object
Principal                  int64
terms                      int64
effective_date    datetime64[ns]
due_date          datetime64[ns]
age                        int64
education                 object
Gender                    object
dtype: object

In [42]:
updated_df['DaysBetween'] = (updated_df['due_date'] - updated_df['effective_date']).dt.days
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29
...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59


In [43]:
# Extract year and month from both 'effective_date' and 'due_date'
# updated_df['Effective_Year'] = updated_df['effective_date'].dt.year
updated_df['Effective_Month'] = updated_df['effective_date'].dt.month

# updated_df['Due_Year'] = updated_df['due_date'].dt.year
updated_df['Due_Month'] = updated_df['due_date'].dt.month

# Display the updated dataframe
updated_df


Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,9,10
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,9,10
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,9,9
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,9,10
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,9,10
...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,9,9
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,9,10
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,9,9
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,9,11


In [44]:
# process ages
max_young = updated_df['age'].quantile(0.3)
max_middle_aged = updated_df['age'].quantile(0.6)

# Categorize based on these cutoffs
def categorize_age(age):
    if age <= max_young:
        return 'Young'
    elif age <= max_middle_aged:
        return 'Middle-aged'
    else:
        return 'Old'

# Apply the function to create age_bin categories
updated_df['age_bin'] = updated_df['age'].apply(categorize_age)

# One-hot encoding for the new age bins
age_dummies = pd.get_dummies(updated_df['age_bin'], prefix='is')

# Concatenate the one-hot encoded columns to the original dataframe
updated_df = pd.concat([updated_df, age_dummies], axis=1)
updated_df['is_Young'] = updated_df['is_Young'].astype(int)
updated_df['is_Middle-aged'] = updated_df['is_Middle-aged'].astype(int)
updated_df['is_Old'] = updated_df['is_Old'].astype(int)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month,age_bin,is_Middle-aged,is_Old,is_Young
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,9,10,Old,0,1,0
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,9,10,Old,0,1,0
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,9,9,Young,0,0,1
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,9,10,Middle-aged,1,0,0
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,9,10,Middle-aged,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,9,9,Middle-aged,1,0,0
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,9,10,Young,0,0,1
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,9,9,Old,0,1,0
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,9,11,Middle-aged,1,0,0


In [45]:
# apply label encoding to education column
education_mapping = {
    'High School or Below': 0,
    'college': 1,
    'Bechalor': 2,
    'Master or Above': 3
}

# Apply the mapping to the 'education' column
updated_df['education_encoded'] = updated_df['education'].map(education_mapping)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month,age_bin,is_Middle-aged,is_Old,is_Young,education_encoded
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,9,10,Old,0,1,0,0
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,9,10,Old,0,1,0,2
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,9,9,Young,0,0,1,1
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,9,10,Middle-aged,1,0,0,1
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,9,10,Middle-aged,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,9,9,Middle-aged,1,0,0,0
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,9,10,Young,0,0,1,0
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,9,9,Old,0,1,0,1
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,9,11,Middle-aged,1,0,0,1


In [46]:
# apply binary encoding to gender
updated_df['Gender_encoded'] = updated_df['Gender'].apply(lambda x: 1 if x == 'male' else 0)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month,age_bin,is_Middle-aged,is_Old,is_Young,education_encoded,Gender_encoded
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,29,9,10,Old,0,1,0,0,1
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,29,9,10,Old,0,1,0,2,0
2,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,14,9,9,Young,0,0,1,1,1
3,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,29,9,10,Middle-aged,1,0,0,1,0
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,29,9,10,Middle-aged,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,800,15,2016-09-11,2016-09-25,32,High School or Below,male,14,9,9,Middle-aged,1,0,0,0,1
342,COLLECTION,1000,30,2016-09-11,2016-10-10,25,High School or Below,male,29,9,10,Young,0,0,1,0,1
343,COLLECTION,800,15,2016-09-12,2016-09-26,39,college,male,14,9,9,Old,0,1,0,1,1
344,COLLECTION,1000,30,2016-09-12,2016-11-10,28,college,male,59,9,11,Middle-aged,1,0,0,1,1


In [47]:
# min max norm :: Principal	terms    age	DaysBetween
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
columns_to_normalize = ['Principal', 'terms', 'age', 'DaysBetween']
scaler = MinMaxScaler()
updated_df[columns_to_normalize] = scaler.fit_transform(updated_df[columns_to_normalize])
updated_df


Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month,age_bin,is_Middle-aged,is_Old,is_Young,education_encoded,Gender_encoded
0,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.818182,High School or Below,male,0.433962,9,10,Old,0,1,0,0,1
1,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.454545,Bechalor,female,0.433962,9,10,Old,0,1,0,2,0
2,PAIDOFF,1.000000,0.347826,2016-09-08,2016-09-22,0.272727,college,male,0.150943,9,9,Young,0,0,1,1,1
3,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.303030,college,female,0.433962,9,10,Middle-aged,1,0,0,1,0
4,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.333333,college,male,0.433962,9,10,Middle-aged,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,0.714286,0.347826,2016-09-11,2016-09-25,0.424242,High School or Below,male,0.150943,9,9,Middle-aged,1,0,0,0,1
342,COLLECTION,1.000000,1.000000,2016-09-11,2016-10-10,0.212121,High School or Below,male,0.433962,9,10,Young,0,0,1,0,1
343,COLLECTION,0.714286,0.347826,2016-09-12,2016-09-26,0.636364,college,male,0.150943,9,9,Old,0,1,0,1,1
344,COLLECTION,1.000000,1.000000,2016-09-12,2016-11-10,0.303030,college,male,1.000000,9,11,Middle-aged,1,0,0,1,1


In [48]:
# apply binary encoding to loan_status
updated_df['loan_status_encoded'] = updated_df['loan_status'].apply(lambda x: 1 if x == 'PAIDOFF' else 0)
updated_df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,DaysBetween,Effective_Month,Due_Month,age_bin,is_Middle-aged,is_Old,is_Young,education_encoded,Gender_encoded,loan_status_encoded
0,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.818182,High School or Below,male,0.433962,9,10,Old,0,1,0,0,1,1
1,PAIDOFF,1.000000,1.000000,2016-09-08,2016-10-07,0.454545,Bechalor,female,0.433962,9,10,Old,0,1,0,2,0,1
2,PAIDOFF,1.000000,0.347826,2016-09-08,2016-09-22,0.272727,college,male,0.150943,9,9,Young,0,0,1,1,1,1
3,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.303030,college,female,0.433962,9,10,Middle-aged,1,0,0,1,0,1
4,PAIDOFF,1.000000,1.000000,2016-09-09,2016-10-08,0.333333,college,male,0.433962,9,10,Middle-aged,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,COLLECTION,0.714286,0.347826,2016-09-11,2016-09-25,0.424242,High School or Below,male,0.150943,9,9,Middle-aged,1,0,0,0,1,0
342,COLLECTION,1.000000,1.000000,2016-09-11,2016-10-10,0.212121,High School or Below,male,0.433962,9,10,Young,0,0,1,0,1,0
343,COLLECTION,0.714286,0.347826,2016-09-12,2016-09-26,0.636364,college,male,0.150943,9,9,Old,0,1,0,1,1,0
344,COLLECTION,1.000000,1.000000,2016-09-12,2016-11-10,0.303030,college,male,1.000000,9,11,Middle-aged,1,0,0,1,1,0


In [49]:
selected_features = updated_df[["loan_status_encoded",'Principal', 'terms', 'age', 'DaysBetween', 'education_encoded', "Gender_encoded", 'is_Young', 'is_Middle-aged', 'is_Old', 'Effective_Month',  'Due_Month']]
selected_features.head()

Unnamed: 0,loan_status_encoded,Principal,terms,age,DaysBetween,education_encoded,Gender_encoded,is_Young,is_Middle-aged,is_Old,Effective_Month,Due_Month
0,1,1.0,1.0,0.818182,0.433962,0,1,0,0,1,9,10
1,1,1.0,1.0,0.454545,0.433962,2,0,0,0,1,9,10
2,1,1.0,0.347826,0.272727,0.150943,1,1,1,0,0,9,9
3,1,1.0,1.0,0.30303,0.433962,1,0,0,1,0,9,10
4,1,1.0,1.0,0.333333,0.433962,1,1,0,1,0,9,10


In [50]:
selected_features.dtypes

loan_status_encoded      int64
Principal              float64
terms                  float64
age                    float64
DaysBetween            float64
education_encoded        int64
Gender_encoded           int64
is_Young                 int64
is_Middle-aged           int64
is_Old                   int64
Effective_Month          int32
Due_Month                int32
dtype: object

In [51]:
# apply oversampling to balance the classes
from imblearn.over_sampling import RandomOverSampler
X = selected_features.drop('loan_status_encoded', axis=1)
y = selected_features['loan_status_encoded']
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)
resampled_counts = y_resampled.value_counts()
resampled_counts

loan_status_encoded
1    260
0    260
Name: count, dtype: int64

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
# train test split
X = df_resampled.drop(columns=['loan_status_encoded'])
y = df_resampled['loan_status_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model on the training data
logreg_model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Add predictions to the test_data for inspection
test_data['Predicted_Outcome'] = y_pred

# Print first few rows of test data with predictions
print(test_data.head())


Accuracy: 0.5962
Confusion Matrix:
[[34 14]
 [28 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.71      0.62        48
           1       0.67      0.50      0.57        56

    accuracy                           0.60       104
   macro avg       0.61      0.60      0.59       104
weighted avg       0.61      0.60      0.59       104

     Principal     terms       age  DaysBetween  education_encoded  \
275        1.0  1.000000  0.575758     0.433962                  1   
93         1.0  1.000000  0.303030     0.433962                  1   
6          1.0  1.000000  0.303030     0.433962                  1   
167        1.0  0.347826  0.666667     0.150943                  1   
90         1.0  1.000000  0.272727     1.000000                  1   

     Gender_encoded  is_Young  is_Middle-aged  is_Old  Effective_Month  \
275               1         0               0       1                9   
93                1        

### Random forest

In [54]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)  # You can tune n_estimators

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Add predictions to the test_data for inspection
test_data['Predicted_Outcome'] = y_pred

# Print first few rows of test data with predictions
print(test_data.head())


Accuracy: 0.7596
Confusion Matrix:
[[41  7]
 [18 38]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.85      0.77        48
           1       0.84      0.68      0.75        56

    accuracy                           0.76       104
   macro avg       0.77      0.77      0.76       104
weighted avg       0.78      0.76      0.76       104

     Principal     terms       age  DaysBetween  education_encoded  \
275        1.0  1.000000  0.575758     0.433962                  1   
93         1.0  1.000000  0.303030     0.433962                  1   
6          1.0  1.000000  0.303030     0.433962                  1   
167        1.0  0.347826  0.666667     0.150943                  1   
90         1.0  1.000000  0.272727     1.000000                  1   

     Gender_encoded  is_Young  is_Middle-aged  is_Old  Effective_Month  \
275               1         0               0       1                9   
93                1        

In [55]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the SVM model
svm_model = SVC(random_state=42, kernel='rbf')  # You can tune kernel type ('linear', 'rbf', 'poly', etc.)

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Predict outcomes for the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Add predictions to the test_data for inspection
test_data['Predicted_Outcome'] = y_pred

# Print first few rows of test data with predictions
print(test_data.head())


Accuracy: 0.5769
Confusion Matrix:
[[38 10]
 [34 22]]
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.79      0.63        48
           1       0.69      0.39      0.50        56

    accuracy                           0.58       104
   macro avg       0.61      0.59      0.57       104
weighted avg       0.61      0.58      0.56       104

     Principal     terms       age  DaysBetween  education_encoded  \
275        1.0  1.000000  0.575758     0.433962                  1   
93         1.0  1.000000  0.303030     0.433962                  1   
6          1.0  1.000000  0.303030     0.433962                  1   
167        1.0  0.347826  0.666667     0.150943                  1   
90         1.0  1.000000  0.272727     1.000000                  1   

     Gender_encoded  is_Young  is_Middle-aged  is_Old  Effective_Month  \
275               1         0               0       1                9   
93                1        

### Parameater optimization for RF

In [57]:
import pandas as pd
import mlflow
import mlflow.sklearn
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import itertools

# Define hyperparameter space for Random Forest
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Create all possible combinations of hyperparameters
param_keys = param_distributions.keys()
param_values = param_distributions.values()
all_combinations = list(itertools.product(*param_values))

# Initialize a list to collect results
results = []

# Iterate over each combination of hyperparameters
for i, combination in enumerate(all_combinations):
    params = dict(zip(param_keys, combination))

    # Initialize the RandomForestClassifier with the current set of parameters
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        bootstrap=params['bootstrap'],
        random_state=42
    )

    # Train the model
    rf_model.fit(X_train, y_train)

    # Predict outcomes for the test set
    y_pred = rf_model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Create a unique run name
    run_name = f"rf-run-{int(time.time() * 1000)}-{i}"

    # Log metrics and hyperparameters to MLflow for this run
    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        mlflow.log_param("n_estimators", params['n_estimators'])
        mlflow.log_param("max_depth", params['max_depth'])
        mlflow.log_param("min_samples_split", params['min_samples_split'])
        mlflow.log_param("min_samples_leaf", params['min_samples_leaf'])
        mlflow.log_param("bootstrap", params['bootstrap'])
        
        # Log metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # Save the model for this run
        mlflow.sklearn.log_model(rf_model, "random_forest_model")

    # Append results to the results list
    results.append({
        'run_name': run_name,
        'n_estimators': params['n_estimators'],
        'max_depth': params['max_depth'],
        'min_samples_split': params['min_samples_split'],
        'min_samples_leaf': params['min_samples_leaf'],
        'bootstrap': params['bootstrap'],
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Create a DataFrame from the results for further inspection
results_df = pd.DataFrame(results)

results_df




Unnamed: 0,run_name,n_estimators,max_depth,min_samples_split,min_samples_leaf,bootstrap,accuracy,precision,recall,f1_score
0,rf-run-1727206559904-0,100,,2,1,True,0.759615,0.775431,0.759615,0.758881
1,rf-run-1727206561366-1,100,,2,1,False,0.769231,0.794458,0.769231,0.767433
2,rf-run-1727206562840-2,100,,2,2,True,0.663462,0.676488,0.663462,0.662434
3,rf-run-1727206564260-3,100,,2,2,False,0.682692,0.696277,0.682692,0.681723
4,rf-run-1727206565749-4,100,,5,1,True,0.701923,0.705996,0.701923,0.702337
...,...,...,...,...,...,...,...,...,...,...
67,rf-run-1727206663336-67,300,20.0,2,2,False,0.692308,0.708392,0.692308,0.690940
68,rf-run-1727206665017-68,300,20.0,5,1,True,0.692308,0.697607,0.692308,0.692649
69,rf-run-1727206666633-69,300,20.0,5,1,False,0.730769,0.748252,0.730769,0.729573
70,rf-run-1727206668336-70,300,20.0,5,2,True,0.653846,0.668531,0.653846,0.652308


In [58]:
results_df.sort_values('f1_score', ascending=False)
# rf-run-1727206561366-1	100	NaN	2	1	False	0.769231	0.794458	0.769231	0.767433

Unnamed: 0,run_name,n_estimators,max_depth,min_samples_split,min_samples_leaf,bootstrap,accuracy,precision,recall,f1_score
1,rf-run-1727206561366-1,100,,2,1,False,0.769231,0.794458,0.769231,0.767433
65,rf-run-1727206660040-65,300,20.0,2,1,False,0.769231,0.794458,0.769231,0.767433
49,rf-run-1727206633832-49,300,,2,1,False,0.769231,0.794458,0.769231,0.767433
17,rf-run-1727206584854-17,100,20.0,2,1,False,0.769231,0.794458,0.769231,0.767433
16,rf-run-1727206583314-16,100,20.0,2,1,True,0.759615,0.775431,0.759615,0.758881
...,...,...,...,...,...,...,...,...,...,...
14,rf-run-1727206580397-14,100,10.0,5,2,True,0.644231,0.656700,0.644231,0.643144
6,rf-run-1727206568679-6,100,,5,2,True,0.644231,0.660577,0.644231,0.642088
22,rf-run-1727206592202-22,100,20.0,5,2,True,0.644231,0.660577,0.644231,0.642088
38,rf-run-1727206616702-38,200,10.0,5,2,True,0.644231,0.660577,0.644231,0.642088
