In [634]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer,roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


In [635]:
df = pd.read_csv('modified_bail_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Crime_Type,Charges_Filed,Time_Served_Months,Prior_Criminal_History,Risk_of_Flight,Influence_on_Trial,Bail_Decision,Socio_Economic_Status
0,0,Cyber Crime,"['IPC 379', 'IPC 307', 'IPC 302']",33,0,0.945674,0.115675,Rejected,Low
1,1,Drug-related,"['Narcotics Act', 'Narcotics Act']",7,0,0.408927,0.946264,Granted,Low
2,2,Economic,['Cyber Laws'],41,0,0.66197,0.733362,Granted,High
3,3,Drug-related,['IPC 307'],8,0,0.464855,0.637499,Granted,High
4,4,Drug-related,"['Economic Offenses Act', 'Narcotics Act', 'IP...",37,1,0.630589,0.645278,Granted,Low


In [636]:
df['Bail_Decision'] = LabelEncoder().fit_transform(df['Bail_Decision'])

In [637]:
X = df.drop(['Unnamed: 0', 'Bail_Decision', 'Charges_Filed'], axis = 1)
y = df['Bail_Decision']

In [638]:
categorical_features = ['Crime_Type', 'Socio_Economic_Status']
numerical_features = ['Time_Served_Months', 'Prior_Criminal_History', 'Risk_of_Flight', 'Influence_on_Trial']

In [639]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features)
    ],
    remainder='passthrough'
)

In [640]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [641]:
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, C=0.1))
])

In [642]:
gb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.05, max_depth=3))
])

In [643]:
rf_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, max_depth=5, min_samples_split=20))
])

In [644]:
ada_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42, n_estimators=50, learning_rate=1.0))
])

In [645]:
voting_clf = VotingClassifier(
    estimators=[
        ('gb_clf', gb_clf),
        ('log_reg', log_reg)
    ],
    voting='soft'
)

In [646]:
voting_clf1 = VotingClassifier(
    estimators=[
        ('rf_clf', rf_clf),
        ('ada_clf', ada_clf)
    ],
    voting='soft'
)

In [647]:
final_voting_clf = VotingClassifier(
    estimators=[
        ('voting_clf', voting_clf),
        ('voting_clf1', voting_clf1)
    ],
    voting='soft'
)

In [648]:
final_voting_clf.fit(X_train, y_train)



In [649]:
training_accuracy = final_voting_clf.score(X_train, y_train)
testing_accuracy = final_voting_clf.score(X_test, y_test)

training_accuracy, testing_accuracy

(0.6742857142857143, 0.5383333333333333)

In [650]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(final_voting_clf, X_train, y_train, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Mean cross-validation score: {scores.mean()}')



Cross-validation scores: [0.54642857 0.5        0.54642857 0.50714286 0.49285714]
Mean cross-validation score: 0.5185714285714285




In [652]:
import joblib
joblib.dump(final_voting_clf, 'final_voting_clf_model.pkl')

['final_voting_clf_model.pkl']