In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

# --- Data loading, feature selection and train/test split -------------------
# Read the pre-processed crime records from the CSV next to this notebook.
DATA_PATH = 'Crime_processed_no_outliers.csv'
TARGET_COLUMN = 'Arrest'
df = pd.read_csv(DATA_PATH)

# Column groups consumed by the preprocessing step: the categorical group is
# one-hot encoded there, the numerical group is standardised.
categorical_cols = ['IUCR', 'Primary Type', 'Day']
numerical_cols = ['Longitude', 'Latitude', 'Hour']

# Model inputs (order kept as originally listed) and the label to predict.
top_features = ['IUCR', 'Primary Type', 'Longitude', 'Latitude', 'Day', 'Hour']
X = df.loc[:, top_features]
y = df[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on y -- consider `stratify=y` if
# the label turns out to be imbalanced; confirm against the class counts.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- Preprocessing + model pipeline ------------------------------------------
# Numerical columns are standardised with StandardScaler.
numeric_scaler = StandardScaler()

# Categorical columns are one-hot encoded. `drop='first'` removes one indicator
# column per feature; `handle_unknown='ignore'` makes categories that were not
# seen during fit encode as all zeros at transform time instead of raising.
category_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numerical_cols),
        ('cat', category_encoder, categorical_cols),
    ]
)

# NOTE(review): scikit-learn's documentation recommends 'sag'/'saga' over
# 'liblinear' for large datasets -- confirm the dataset size before changing.
classifier = LogisticRegression(
    max_iter=1000,
    random_state=42,
    solver='liblinear',
)

# Chain preprocessing and the classifier so both are fitted (and later
# persisted) as a single estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('LogisticRegression', classifier),
    ]
)


# --- Fit, evaluate and persist -----------------------------------------------
def _print_evaluation(heading, matrix_heading, y_true, y_pred):
    """Print the classification report and confusion matrix for one split."""
    print(heading)
    print(classification_report(y_true, y_pred))
    print(matrix_heading)
    print(confusion_matrix(y_true, y_pred))


# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

# Score the data the model was fitted on.
y_train_pred = pipeline.predict(X_train)
_print_evaluation(
    "\n=== Training Evaluation ===",
    "Confusion Matrix (Train):",
    y_train,
    y_train_pred,
)

# Score the held-out split.
y_test_pred = pipeline.predict(X_test)
_print_evaluation(
    "\n=== Testing Evaluation ===",
    "Confusion Matrix (Test):",
    y_test,
    y_test_pred,
)

# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipeline, 'logistic_regression_pipeline.pkl')

KeyboardInterrupt: 