In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import joblib

# 1. Load the preprocessed data
df = pd.read_csv('Crime_processed_no_outliers.csv')  # Ensure your data file is in the correct path

# 2. Define categorical and numerical columns explicitly
# (Update these lists based on your actual dataset columns)
categorical_cols = ['IUCR', 'Primary Type','Location Description']  # example categorical columns
numerical_cols = ['Latitude','Longitude','Beat']

X = df.drop('Arrest', axis=1)
y = df['Arrest']

# 3. Balance classes with under-sampling BEFORE split (important)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = X,y

print(f"Original data shape: X={X.shape}, y={y.shape}")
print(f"Resampled data shape: X={X_resampled.shape}, y={y_resampled.shape}")

# 4. Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# 5. Create preprocessing pipeline
from sklearn.preprocessing import OrdinalEncoder

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
])


# 6. Create full pipeline with preprocessor + Random Forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(
        random_state=42,
        class_weight='balanced',
        n_estimators=500,
        max_depth=50,
        min_samples_split=20,
        min_samples_leaf=10
        ))
])


# 7. Train the pipeline
pipeline.fit(X_train, y_train)

threshold = 0.7

# Train predictions with custom threshold
y_train_probs = pipeline.predict_proba(X_train)[:, 1]
y_train_pred_thresh = (y_train_probs >= threshold).astype(int)

# Test predictions with custom threshold
y_test_probs = pipeline.predict_proba(X_test)[:, 1]
y_test_pred_thresh = (y_test_probs >= threshold).astype(int)

# 8. Cross-validation scores (using the full resampled dataset)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_resampled, y_resampled, cv=cv, scoring='accuracy')
print("Random Forest Cross-validation Accuracy Scores:", cv_scores)
print("Random Forest Mean CV Accuracy:", cv_scores.mean())

# 9. Training evaluation with custom threshold
print("\nRandom Forest Training Evaluation with Custom Threshold:")
print(classification_report(y_train, y_train_pred_thresh))

# 10. Testing evaluation with custom threshold
print("\nRandom Forest Testing Evaluation with Custom Threshold:")
print(classification_report(y_test, y_test_pred_thresh))

joblib.dump(pipeline, 'random_forest_pipeline.pkl')

Original data shape: X=(232146, 11), y=(232146,)
Resampled data shape: X=(232146, 11), y=(232146,)
Random Forest Cross-validation Accuracy Scores: [0.87632996 0.87882573 0.8757673  0.87749036 0.87615499]
Random Forest Mean CV Accuracy: 0.8769136689140018

Random Forest Training Evaluation with Custom Threshold:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    164280
           1       0.84      0.58      0.69     21436

    accuracy                           0.94    185716
   macro avg       0.90      0.78      0.83    185716
weighted avg       0.94      0.94      0.93    185716


Random Forest Testing Evaluation with Custom Threshold:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     41071
           1       0.79      0.52      0.62      5359

    accuracy                           0.93     46430
   macro avg       0.86      0.75      0.79     46430
weighted avg       0.92      0.

['random_forest_pipeline.pkl']