In [105]:
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#libraries for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score  #useful for evaluating performance of probabilistic classifiers
from imblearn.over_sampling import SMOTE 
#classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier

In [106]:
df = pd.read_csv('modified_bail_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Crime_Type,Charges_Filed,Time_Served_Months,Prior_Criminal_History,Risk_of_Flight,Influence_on_Trial,Bail_Decision,Socio_Economic_Status
0,0,Cyber Crime,"['IPC 379', 'IPC 307', 'IPC 302']",33,0,0.945674,0.115675,Rejected,Low
1,1,Drug-related,"['Narcotics Act', 'Narcotics Act']",7,0,0.408927,0.946264,Granted,Low
2,2,Economic,['Cyber Laws'],41,0,0.66197,0.733362,Granted,High
3,3,Drug-related,['IPC 307'],8,0,0.464855,0.637499,Granted,High
4,4,Drug-related,"['Economic Offenses Act', 'Narcotics Act', 'IP...",37,1,0.630589,0.645278,Granted,Low


In [107]:
X = df.drop(columns=['Bail_Decision','Time_Served_Months','Risk_of_Flight','Prior_Criminal_History'],axis=1)

In [108]:
y = df['Bail_Decision']

In [109]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns


numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(sparse_output =False)
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [110]:
X = preprocessor.fit_transform(X)

In [111]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1600, 357), (400, 357))

In [112]:
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

In [113]:
if not isinstance(X_train, np.ndarray):
    X_train =  X_train.toarray()
if not isinstance(X_train, np.ndarray):
    X_test = X_test.toarray()

In [114]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

Creating an Evalutation Functionn to give all metrics after model training

In [115]:
def evaluate_model(true, predicted, predicted_proba):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, pos_label=1)
    recall = recall_score(true, predicted, pos_label=1)
    f1 = f1_score(true, predicted, pos_label=1)
    roc_auc = roc_auc_score(true, predicted_proba) if predicted_proba  is not None else None
    return accuracy, precision, recall, f1, roc_auc

In [116]:
param_grids = {
    "Random Forest Classifier": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 10]
    },
    "Logistic Regression": {
        'C': [0.01, 0.1, 1.0],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    },
    "SVC": {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear']
    }
}

In [118]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "LightGBM": LGBMClassifier(),
    "Naive bayes Classifier": GaussianNB(),
    "Extra Tree Classifier": ExtraTreesClassifier(),
    "Bagging Classifier": BaggingClassifier(),
    "Ridge Classifier": RidgeClassifier(),
    "Neural Network MLPClassifier": MLPClassifier()
}
model_list = []
f1_list = []

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    if model_name in param_grids:
        grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='f1', n_jobs=-1)
        model = grid_search.fit(X_train, y_train).best_estimator_
    else:
        model.fit(X_train, y_train)
    
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    print(f"Cross-validated Fq Score (Training): {scores.mean():.4f}")

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_train_proba = model.predict_proba(X_train)[:,1]
        y_test_proba = model.predict_proba(X_test)[:,1]
    else:
        y_train_proba = None
        y_test_proba = None

    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1, model_train_roc_auc = evaluate_model(y_train, y_train_pred, y_train_proba)
    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1, model_test_roc_auc = evaluate_model(y_test, y_test_pred,y_test_proba)
    
    print(model_name)
    model_list.append(model_name)

    print("Model performance for Training set")
    print("-Accuracy: {:.4f}".format(model_train_accuracy))
    print("-Precision: {:.4f}".format(model_train_precision))
    print("-Recall: {:.4f}".format(model_train_recall))
    print("-f1 Score: {:.4f}".format(model_train_f1))
    if model_train_roc_auc is not None:
        print("-ROC AUC: {:.4f}".format(model_train_roc_auc))
    
    print('----------------------------------')

    print("Model performance for Test set")
    print("-Accuracy: {:.4f}".format(model_test_accuracy))
    print("-Precision: {:.4f}".format(model_test_precision))
    print("-Recall: {:.4f}".format(model_test_recall))
    print("-f1 Score: {:.4f}".format(model_test_f1))
    if model_test_roc_auc is not None:
        print("-ROC AUC {:.4f}".format(model_test_roc_auc))
    f1_list.append(model_test_f1)

    print('='*35)
    print('\n')




Training Logistic Regression...
Cross-validated Fq Score (Training): 0.5403
Logistic Regression
Model performance for Training set
-Accuracy: 0.6574
-Precision: 0.6441
-Recall: 0.7034
-f1 Score: 0.6725
-ROC AUC: 0.7341
----------------------------------
Model performance for Test set
-Accuracy: 0.4625
-Precision: 0.4696
-Recall: 0.5373
-f1 Score: 0.5012
-ROC AUC 0.4704


Training Random Forest Classifier...
Cross-validated Fq Score (Training): 0.5239
Random Forest Classifier
Model performance for Training set
-Accuracy: 0.8814
-Precision: 0.8563
-Recall: 0.9165
-f1 Score: 0.8854
-ROC AUC: 0.9612
----------------------------------
Model performance for Test set
-Accuracy: 0.4550
-Precision: 0.4601
-Recall: 0.4876
-f1 Score: 0.4734
-ROC AUC 0.4605


Training Decision Tree Classifier...
Cross-validated Fq Score (Training): 0.5230
Decision Tree Classifier
Model performance for Training set
-Accuracy: 1.0000
-Precision: 1.0000
-Recall: 1.0000
-f1 Score: 1.0000
-ROC AUC: 1.0000
-------------



Cross-validated Fq Score (Training): 0.5317
Ada Boost Classifier
Model performance for Training set
-Accuracy: 0.6162
-Precision: 0.6375
-Recall: 0.5387
-f1 Score: 0.5840
-ROC AUC: 0.6756
----------------------------------
Model performance for Test set
-Accuracy: 0.4700
-Precision: 0.4654
-Recall: 0.3682
-f1 Score: 0.4111
-ROC AUC 0.4665


Training LightGBM...
[LightGBM] [Info] Number of positive: 826, number of negative: 826
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 556
[LightGBM] [Info] Number of data points in the train set: 1652, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 661, number of negative: 660
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead o



Cross-validated Fq Score (Training): 0.5066
Neural Network MLPClassifier
Model performance for Training set
-Accuracy: 0.9292
-Precision: 0.9127
-Recall: 0.9492
-f1 Score: 0.9306
-ROC AUC: 0.9839
----------------------------------
Model performance for Test set
-Accuracy: 0.4750
-Precision: 0.4780
-Recall: 0.4876
-f1 Score: 0.4828
-ROC AUC 0.4715




