In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
import joblib

# 1. Load data
df = pd.read_csv('Crime_processed_no_outliers.csv')

# 2. Features and target
top_features = ['IUCR', 'Primary Type', 'Longitude', 'Latitude', 'Day', 'Hour']
X = df[top_features]
y = df['Arrest']

# 3. Categorical and numerical columns
categorical_cols = ['IUCR', 'Primary Type', 'Day']
numerical_cols = ['Longitude', 'Latitude', 'Hour']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ]
)


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        random_state=42,
        criterion='entropy',
        class_weight='balanced',
        max_depth=5,
        min_samples_split=20,
        min_samples_leaf=10
    ))
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Fit pipeline
pipeline.fit(X_train, y_train)

threshold = 0.6

y_train_probs = pipeline.predict_proba(X_train)[:, 1]
y_test_probs = pipeline.predict_proba(X_test)[:, 1]


y_train_pred_thresh = (y_train_probs >= threshold).astype(int)
y_test_pred_thresh = (y_test_probs >= threshold).astype(int)

print(f"Classification Report on TRAIN data (threshold={threshold}):")
print(classification_report(y_train, y_train_pred_thresh))

print(f"\nClassification Report on TEST data (threshold={threshold}):")
print(classification_report(y_test, y_test_pred_thresh))


print(f"Train Precision: {precision_score(y_train, y_train_pred_thresh):.4f}")
print(f"Train Recall:    {recall_score(y_train, y_train_pred_thresh):.4f}")
print(f"Train F1-score:  {f1_score(y_train, y_train_pred_thresh):.4f}")

print(f"Test Precision:  {precision_score(y_test, y_test_pred_thresh):.4f}")
print(f"Test Recall:     {recall_score(y_test, y_test_pred_thresh):.4f}")
print(f"Test F1-score:   {f1_score(y_test, y_test_pred_thresh):.4f}")


train_acc = accuracy_score(y_train, y_train_pred_thresh)
test_acc = accuracy_score(y_test, y_test_pred_thresh)
print(f"\nTrain Accuracy (threshold={threshold}): {train_acc:.4f}")
print(f"Test Accuracy  (threshold={threshold}): {test_acc:.4f}")

joblib.dump(pipeline, 'decision_tree_pipeline.pkl')




Classification Report on TRAIN data (threshold=0.6):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    164280
           1       0.95      0.33      0.49     21436

    accuracy                           0.92    185716
   macro avg       0.93      0.66      0.72    185716
weighted avg       0.92      0.92      0.90    185716


Classification Report on TEST data (threshold=0.6):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     41071
           1       0.95      0.33      0.50      5359

    accuracy                           0.92     46430
   macro avg       0.93      0.67      0.73     46430
weighted avg       0.92      0.92      0.90     46430

Train Precision: 0.9466
Train Recall:    0.3309
Train F1-score:  0.4904
Test Precision:  0.9482
Test Recall:     0.3350
Test F1-score:   0.4950

Train Accuracy (threshold=0.6): 0.9206
Test Accuracy  (threshold=0.6): 0.9211


['decision_tree_pipeline.pkl']