In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import joblib
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import numpy as np

# Load the raw arrest records.
data = pd.read_csv("/content/NYPD_ARREST_DATA.csv")

# Model inputs and the categorical column to predict.
features = ['Latitude', 'Longitude']
target_crime = 'OFNS_DESC'

# Round both coordinates to 3 decimals so nearby arrests fall on the same
# (Latitude, Longitude) pair when grouped below.
for coord in features:
    data[coord] = data[coord].round(3)

# Integer-encode the offence description; the fitted encoder is kept so that
# encoded values can be mapped back to text.
label_encoder_crime = LabelEncoder()
data[target_crime] = label_encoder_crime.fit_transform(data[target_crime])

# Count arrests per rounded coordinate pair and attach that count to every row.
crime_density = (
    data.groupby(features)
    .size()
    .reset_index(name='Crime_Count')
)
data = data.merge(crime_density, on=features, how='left')

# Risk = the pair's arrest count as a percentage of the largest count.
# NOTE(review): this target is computed from all rows before any train/test
# split and is fully determined by the two features - confirm that is intended.
largest_count = data['Crime_Count'].max()
data['Crime_Risk_Percentage'] = data['Crime_Count'] / largest_count * 100

In [15]:
# Persist the fitted label encoder to disk.
joblib.dump(value=label_encoder_crime, filename='LABEL_ENCODER_CRIME.pkl')

['LABEL_ENCODER_CRIME.pkl']

In [3]:
# Regression pipeline for the continuous crime-risk percentage:
# mean-impute missing coordinates, standardise them, then fit a random forest.
model_crime_risk = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), features)
        ]
    )),
    # Seeded with 42, like the classifier and the train/test splits, so that
    # repeated runs build the same forest and report the same metrics.
    ('regressor', RandomForestRegressor(random_state=42))
])

# Classification pipeline for the encoded offence type: median-impute missing
# coordinates (no scaling step) and fit a depth-limited decision tree.
model_crime_type = Pipeline([
    (
        'preprocessor',
        ColumnTransformer([
            (
                'num',
                Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
                features,
            ),
        ]),
    ),
    # A single decision tree is used here to save memory.
    ('classifier', DecisionTreeClassifier(random_state=42, max_depth=10)),
])

# Feature matrix plus one target per model.
X = data[features]
y_crime_risk = data['Crime_Risk_Percentage']
y_crime_type = data[target_crime]

# Hold out 20% of the rows for testing; both splits use the same seed.
(X_train_risk, X_test_risk,
 y_train_risk, y_test_risk) = train_test_split(
    X, y_crime_risk, random_state=42, test_size=0.2
)
(X_train_type, X_test_type,
 y_train_type, y_test_type) = train_test_split(
    X, y_crime_type, random_state=42, test_size=0.2
)

In [4]:
# Preview the first three rows of the frame.
data.iloc[:3]

Unnamed: 0,ARREST_DATE,PD_DESC,OFNS_DESC,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,Crime_Count,Crime_Risk_Percentage
0,01/26/2019,SEXUAL ABUSE,78,F,M,25,45-64,M,BLACK,40.801,-73.941,11140,36.485114
1,01/06/2016,RAPE 3,76,F,K,67,25-44,M,BLACK,40.649,-73.95,838,2.744571
2,11/15/2018,RAPE 1,76,F,K,77,25-44,M,BLACK,40.675,-73.93,3424,11.214096


In [5]:
# Rows whose risk target is missing carry no label to learn from, so they are
# left out of the fit instead of being assigned the mean risk. This matches
# the evaluation cell, which also ignores rows with a missing target, and it
# leaves y_train_risk itself unmodified.
risk_label_known = y_train_risk.notna()
model_crime_risk.fit(X_train_risk[risk_label_known], y_train_risk[risk_label_known])

In [6]:
import numpy as np
# Predict only for the test rows currently held in y_test_risk. On a first run
# that is the whole test split; on a re-run (after the filter below has already
# shortened y_test_risk) it keeps predictions and targets the same length, so
# the cell can be executed repeatedly.
y_pred_risk = model_crime_risk.predict(X_test_risk.loc[y_test_risk.index])

# Keep only rows where both the true and the predicted risk are present.
valid_indices = y_test_risk.notna() & ~np.isnan(y_pred_risk)
y_test_risk = y_test_risk[valid_indices]
y_pred_risk = y_pred_risk[valid_indices]

print("Crime Risk Model Evaluation:")
print(f"Mean Squared Error: {mean_squared_error(y_test_risk, y_pred_risk)}")

Crime Risk Model Evaluation:
Mean Squared Error: 6.694904477886395e-05


In [7]:
# One-row query frame with the same two columns the models were trained on.
sample_input = pd.DataFrame(
    [{'Latitude': 40.648650, 'Longitude': -73.950336}]
)

# predict() returns one value per input row; unpack the only one.
(crime_risk_percentage,) = model_crime_risk.predict(sample_input)
print(f"Predicted Crime Risk Percentage: {crime_risk_percentage:.2f}%")

Predicted Crime Risk Percentage: 2.74%


In [8]:
# Train the crime-type classifier on its training split.
model_crime_type.fit(X=X_train_type, y=y_train_type)

In [9]:
# Score the held-out rows. zero_division=0 reports 0 (without warnings) for
# metrics whose denominator is zero, e.g. classes that are never predicted.
y_pred_type = model_crime_type.predict(X_test_type)
print("\nCrime Type Model Evaluation:")
crime_type_report = classification_report(
    y_true=y_test_type, y_pred=y_pred_type, zero_division=0
)
print(crime_type_report)


Crime Type Model Evaluation:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00      2736
           2       0.00      0.00      0.00        26
           3       0.00      0.00      0.00        29
           4       0.00      0.00      0.00      1075
           5       0.00      0.00      0.00       138
           6       0.00      0.00      0.00       363
           7       0.18      0.21      0.19    102532
           8       0.00      0.00      0.00      1622
           9       0.21      0.00      0.00     14538
          10       0.00      0.00      0.00        26
          11       0.00      0.00      0.00        72
          12       0.00      0.00      0.00      3860
          13       0.17      0.00      0.00     28875
          14       0.32      0.01      0.02     39717
          15       0.28      0.88      0.42    216142
          16       0.14      0.00      0.00     392

In [13]:
# predict_proba columns follow the fitted classifier's classes_ order. That
# order only coincides with the encoded labels 0..n-1 when every class was
# present in the training split, so column positions are mapped through
# classes_ before being decoded back to offence names.
crime_type_probabilities = model_crime_type.predict_proba(sample_input)[0]
top3_columns = crime_type_probabilities.argsort()[::-1][:3]  # sorted once, highest first
top3_encoded = model_crime_type.classes_[top3_columns]
predicted_crime_types = label_encoder_crime.inverse_transform(top3_encoded)

print("\nTop 3 Likely Crime Types:")
for rank, (crime_type, probability) in enumerate(
    zip(predicted_crime_types, crime_type_probabilities[top3_columns]), start=1
):
    print(f"{rank}. {crime_type}: {probability * 100:.2f}%")


Top 3 Likely Crime Types:
1. DANGEROUS DRUGS: 15.95%
2. ASSAULT 3 & RELATED OFFENSES: 13.38%
3. OTHER TRAFFIC INFRACTION: 5.99%


In [14]:
# Regression metrics for the crime-risk model on the filtered test rows.
mse = mean_squared_error(y_test_risk, y_pred_risk)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_risk, y_pred_risk)

# Spread of the true target, for judging the scale of the error. The Series'
# own vectorised max/min replace the Python builtins, which walk the Series
# one element at a time.
y_range = y_test_risk.max() - y_test_risk.min()
y_variance = np.var(y_test_risk)

# Print Results
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"Target Range: {y_range:.4f}")
print(f"Target Variance: {y_variance:.4f}")

Mean Squared Error (MSE): 0.0001
Root Mean Squared Error (RMSE): 0.0082
R-squared (R²): 1.0000
Target Range: 99.9967
Target Variance: 266.2285


In [12]:
# Write both fitted pipelines to disk with compression level 9.
joblib.dump(value=model_crime_risk, filename='AJU_MODEL_CRIME_RISK.pkl', compress=9)

joblib.dump(value=model_crime_type, filename='AJU_MODEL_CRIME_TYPE.pkl', compress=9)

['AJU_MODEL_CRIME_TYPE.pkl']

In [21]:
import os
def split_large_pkl(file_path, output_dir, chunk_size=25):
    """Split a file into sequentially numbered chunk files.

    Chunks are written to ``output_dir`` as ``<stem>_PART<k>.pkl`` (k starts
    at 1), where ``<stem>`` is the base name of ``file_path`` without its
    extension. Concatenating the parts in order reproduces the original bytes.
    An empty input file produces no chunk files.

    Args:
        file_path: Path of the file to split.
        output_dir: Directory for the chunk files; created if missing.
        chunk_size: Maximum size of each chunk in MiB. May be fractional; the
            resulting byte count is truncated to a whole number.

    Raises:
        ValueError: If ``chunk_size`` amounts to less than one byte.
    """
    chunk_size_bytes = int(chunk_size * 1024 * 1024)  # MiB -> whole bytes
    if chunk_size_bytes <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    # Stream one chunk at a time so memory use is bounded by the chunk size
    # rather than by the size of the whole file.
    with open(file_path, 'rb') as source:
        chunk_reader = iter(lambda: source.read(chunk_size_bytes), b'')
        for idx, chunk in enumerate(chunk_reader, start=1):
            chunk_file = os.path.join(output_dir, f"{base_name}_PART{idx}.pkl")
            with open(chunk_file, 'wb') as part:
                part.write(chunk)
            print(f"Saved chunk: {chunk_file}")
# Usage: break the saved crime-risk model into parts of at most 25 MB each.
# NOTE(review): the directory name is spelled 'CRINE'; kept unchanged in case
# other code reads from that path - confirm before renaming.
split_large_pkl(
    file_path='AJU_MODEL_CRIME_RISK.pkl',
    output_dir='CRINE_RISK_CHUNKS',
    chunk_size=25,
)


Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART1.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART2.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART3.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART4.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART5.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART6.pkl
Saved chunk: CRINE_RISK_CHUNKS/AJU_MODEL_CRIME_RISK_PART7.pkl
