In [238]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer,roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


Loading dataset

In [239]:
df = pd.read_csv('modified_bail_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Crime_Type,Charges_Filed,Time_Served_Months,Prior_Criminal_History,Risk_of_Flight,Influence_on_Trial,Bail_Decision,Socio_Economic_Status
0,0,Cyber Crime,"['IPC 379', 'IPC 307', 'IPC 302']",33,0,0.945674,0.115675,Rejected,Low
1,1,Drug-related,"['Narcotics Act', 'Narcotics Act']",7,0,0.408927,0.946264,Granted,Low
2,2,Economic,['Cyber Laws'],41,0,0.66197,0.733362,Granted,High
3,3,Drug-related,['IPC 307'],8,0,0.464855,0.637499,Granted,High
4,4,Drug-related,"['Economic Offenses Act', 'Narcotics Act', 'IP...",37,1,0.630589,0.645278,Granted,Low


In [240]:
df['Bail_Decision'] = LabelEncoder().fit_transform(df['Bail_Decision'])

In [241]:
X = df.drop(['Unnamed: 0', 'Bail_Decision', 'Charges_Filed'], axis = 1)
y = df['Bail_Decision']

In [242]:
categorical_features = ['Crime_Type', 'Socio_Economic_Status']
numerical_features = ['Time_Served_Months', 'Prior_Criminal_History', 'Risk_of_Flight', 'Influence_on_Trial']

In [243]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features)
    ],
    remainder='passthrough'
)

In [244]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [245]:
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, C=0.1))
])

In [246]:
gb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.05, max_depth=3))
])

In [247]:
rf_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, max_depth=5, min_samples_split=20))
])

In [248]:
voting_clf = VotingClassifier(
    estimators=[
        ('log_reg', log_reg),
        ('gb_clf', gb_clf),
        ('rf_clf', rf_clf)
    ],
    voting='soft'
)

In [249]:
voting_clf.fit(X_train, y_train)

In [250]:
training_accuracy = voting_clf.score(X_train, y_train)
testing_accuracy = voting_clf.score(X_test, y_test)

training_accuracy, testing_accuracy

(0.6714285714285714, 0.54)

In [251]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(voting_clf, X_train, y_train, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Mean cross-validation score: {scores.mean()}')

Cross-validation scores: [0.53928571 0.5        0.54642857 0.51071429 0.49642857]
Mean cross-validation score: 0.5185714285714285
