# Predicción de categorías delictivas

In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



## Carga de datos

In [2]:
crimes_afluency_ft = pd.read_csv('./data_crimes_afluency_line_type.csv', 
                 sep=',', engine='python', encoding="UTF-8", index_col=False)

In [3]:
crimes_afluency_ft.head(3)

Unnamed: 0,line,type,weekday,id_crime_category,afluency_distribution,crime_frequency
0,1,Intermedia,1,5,11130.083333,1.0
1,1,Intermedia,1,6,11130.083333,10.0
2,1,Intermedia,1,7,11130.083333,0.0


In [4]:
crimes_afluency_ft.columns.values

array(['line', 'type', 'weekday', 'id_crime_category',
       'afluency_distribution', 'crime_frequency'], dtype=object)

## Preprocesamiento

Establecemos columnas categóricas a codificar con OneHotEncoder, y por otro lado las numéricas que serán sometidas a reescalamiento

In [5]:
# Columnas categóricas a codificar con OneHotEncoder
categorical_cols = ['line', 'type', 'weekday', 'id_crime_category']

# Columnas numéricas a escalar con StandardScaler
numeric_cols = ['afluency_distribution']

Definimos pipeline con el preprocesamiento deseado

In [6]:
# Definir el preprocesamiento con ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_cols),
        ('scaler', StandardScaler(), numeric_cols)
    ])

Filtramos dataset para categoría delictiva específica
- 5: ROBO A TRANSEUNTE EN VÍA PÚBLICA CON Y SIN VIOLENCIA
- 6: DELITO DE BAJO IMPACTO
- 7: ROBO DE VEHÍCULO CON Y SIN VIOLENCIA
- 11: ROBO A NEGOCIO CON VIOLENCIA
- 13: ROBO A PASAJERO A BORDO DEL METRO CON Y SIN VIOLENCIA

In [7]:
crimes_afluency_ft = crimes_afluency_ft[crimes_afluency_ft['id_crime_category'] == 6]

## Training with 80% of data

In [8]:
target = 'crime_frequency'

In [9]:
X = crimes_afluency_ft.drop([target], axis = 1)
y = crimes_afluency_ft[target]

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
def regression_metrics(y_real, y_pred):
    mae = mean_absolute_error(y_real, y_pred)
    mse = mean_squared_error(y_real, y_pred)
    rmse = mean_squared_error(y_real, y_pred, squared=False)
    r2 = r2_score(y_real, y_pred)
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R²:", r2)

## Regression models

### MultiLayer Perceptron Regressor

In [12]:
from sklearn.neural_network import MLPRegressor

In [13]:
mlpr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', MLPRegressor(activation = 'tanh', alpha = 0.0007, learning_rate_init = 0.0001, random_state=1, max_iter=500, verbose = True))
    ])

In [14]:
mlpr.fit(X_train, y_train)

Iteration 1, loss = 10.39983813
Iteration 2, loss = 8.35197502
Iteration 3, loss = 7.06099972
Iteration 4, loss = 6.19531408
Iteration 5, loss = 5.55695458
Iteration 6, loss = 5.07054044
Iteration 7, loss = 4.70670686
Iteration 8, loss = 4.44476394
Iteration 9, loss = 4.26288071
Iteration 10, loss = 4.14232031
Iteration 11, loss = 4.06264491
Iteration 12, loss = 4.00944943
Iteration 13, loss = 3.97157478
Iteration 14, loss = 3.94325906
Iteration 15, loss = 3.91969738
Iteration 16, loss = 3.89889223
Iteration 17, loss = 3.87861480
Iteration 18, loss = 3.85958946
Iteration 19, loss = 3.83998814
Iteration 20, loss = 3.82086158
Iteration 21, loss = 3.80102000
Iteration 22, loss = 3.78117113
Iteration 23, loss = 3.76045681
Iteration 24, loss = 3.73954457
Iteration 25, loss = 3.71758303
Iteration 26, loss = 3.69581880
Iteration 27, loss = 3.67359765
Iteration 28, loss = 3.65078526
Iteration 29, loss = 3.62768821
Iteration 30, loss = 3.60545994
Iteration 31, loss = 3.58099210
Iteration 32, lo

Iteration 253, loss = 2.71689045
Iteration 254, loss = 2.71772506
Iteration 255, loss = 2.71688517
Iteration 256, loss = 2.71712438
Iteration 257, loss = 2.71684630
Iteration 258, loss = 2.71641399
Iteration 259, loss = 2.71640542
Iteration 260, loss = 2.71646363
Iteration 261, loss = 2.71600092
Iteration 262, loss = 2.71616074
Iteration 263, loss = 2.71623364
Iteration 264, loss = 2.71619800
Iteration 265, loss = 2.71594452
Iteration 266, loss = 2.71564206
Iteration 267, loss = 2.71561946
Iteration 268, loss = 2.71559031
Iteration 269, loss = 2.71561742
Iteration 270, loss = 2.71528507
Iteration 271, loss = 2.71491392
Iteration 272, loss = 2.71529589
Iteration 273, loss = 2.71525501
Iteration 274, loss = 2.71506397
Iteration 275, loss = 2.71521670
Iteration 276, loss = 2.71483441
Iteration 277, loss = 2.71438101
Iteration 278, loss = 2.71459205
Iteration 279, loss = 2.71465101
Iteration 280, loss = 2.71463082
Iteration 281, loss = 2.71452385
Iteration 282, loss = 2.71439588
Iteration 

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['line', 'type', 'weekday',
                                                   'id_crime_category']),
                                                 ('scaler', StandardScaler(),
                                                  ['afluency_distribution'])])),
                ('regressor',
                 MLPRegressor(activation='tanh', alpha=0.0007,
                              learning_rate_init=0.0001, max_iter=500,
                              random_state=1, verbose=True))])

In [15]:
y_pred = mlpr.predict(X_val)

In [16]:
regression_metrics(y_val, y_pred)

MAE: 1.5346818102959159
MSE: 5.565966114570813
RMSE: 2.359229983399417
R²: 0.6462276215893921


### SGD Regressor

In [17]:
from sklearn.linear_model import SGDRegressor

In [18]:
sgdr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SGDRegressor(max_iter=1000, tol=1e-3))
    ])

In [19]:
sgdr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['line', 'type', 'weekday',
                                                   'id_crime_category']),
                                                 ('scaler', StandardScaler(),
                                                  ['afluency_distribution'])])),
                ('regressor', SGDRegressor())])

In [20]:
y_pred = sgdr.predict(X_val)

In [21]:
regression_metrics(y_val, y_pred)

MAE: 2.0410096284762145
MSE: 7.979481634855251
RMSE: 2.8247976272390294
R²: 0.49282476063651337


### Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

In [23]:
gbr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=0))
    ])

In [24]:
gbr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['line', 'type', 'weekday',
                                                   'id_crime_category']),
                                                 ('scaler', StandardScaler(),
                                                  ['afluency_distribution'])])),
                ('regressor', GradientBoostingRegressor(random_state=0))])

In [25]:
y_pred = gbr.predict(X_val)

In [26]:
regression_metrics(y_val, y_pred)

MAE: 1.579872727124761
MSE: 5.763568150831782
RMSE: 2.4007432496691066
R²: 0.6336680513534507


### Random Forest Regressor

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
rfr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(max_depth=None, random_state=0))
    ])

In [29]:
rfr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['line', 'type', 'weekday',
                                                   'id_crime_category']),
                                                 ('scaler', StandardScaler(),
                                                  ['afluency_distribution'])])),
                ('regressor', RandomForestRegressor(random_state=0))])

In [30]:
y_pred = rfr.predict(X_val)

In [31]:
regression_metrics(y_val, y_pred)

MAE: 1.6563728772244866
MSE: 6.548465884143069
RMSE: 2.55899704652879
R²: 0.5837800117558469


### SVR

In [32]:
from sklearn.svm import SVR

In [33]:
svr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='rbf'))
    ])

In [34]:
svr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['line', 'type', 'weekday',
                                                   'id_crime_category']),
                                                 ('scaler', StandardScaler(),
                                                  ['afluency_distribution'])])),
                ('regressor', SVR())])

In [35]:
y_pred = svr.predict(X_val)

In [36]:
regression_metrics(y_val, y_pred)

MAE: 1.5006908728970973
MSE: 5.671840022329213
RMSE: 2.3815625169894687
R²: 0.6394982841503332


## Validación de resultados con el mejor modelo

In [37]:
from tabulate import tabulate

In [38]:
y_val_preds = gbr.predict(X_val)

In [39]:
res = list()
for i in range(len(y_val_preds)):
    y_pred = y_val_preds[i]
    y_real = y_val.iloc[i]
    error = abs(y_pred - y_real)
    res.append([round(y_real, 2), round(y_pred, 2), round(error, 2)])

In [40]:
print(tabulate(res, headers = ['Real y', 'Predicted y', 'Abs. Error'], tablefmt = 'fancy_grid', numalign = 'center'))

╒══════════╤═══════════════╤══════════════╕
│  Real y  │  Predicted y  │  Abs. Error  │
╞══════════╪═══════════════╪══════════════╡
│    0     │     1.11      │     1.11     │
├──────────┼───────────────┼──────────────┤
│    7     │     6.35      │     0.65     │
├──────────┼───────────────┼──────────────┤
│    16    │     13.54     │     2.46     │
├──────────┼───────────────┼──────────────┤
│    0     │     3.18      │     3.18     │
├──────────┼───────────────┼──────────────┤
│    2     │     8.48      │     6.48     │
├──────────┼───────────────┼──────────────┤
│    2     │     1.08      │     0.92     │
├──────────┼───────────────┼──────────────┤
│    1     │     4.74      │     3.74     │
├──────────┼───────────────┼──────────────┤
│    2     │     2.22      │     0.22     │
├──────────┼───────────────┼──────────────┤
│    11    │     9.18      │     1.82     │
├──────────┼───────────────┼──────────────┤
│    0     │     0.34      │     0.34     │
├──────────┼───────────────┼────

## Simulating data

In [41]:
line_values = crimes_afluency_ft.line.unique()
type_station_values = crimes_afluency_ft.type.unique()
weekday_values = crimes_afluency_ft.weekday.unique()
crime_category_values = crimes_afluency_ft.id_crime_category.unique()
afluency_dist_values = crimes_afluency_ft.afluency_distribution.unique()

In [42]:
def random_sample(arr):
    return np.random.choice(arr)

In [43]:
def simulated_data(unique_values, col_names, n = 50):
    data = [[random_sample(arr) for arr in unique_values] for i in range(n)]
    return pd.DataFrame(data, columns = col_names)

In [44]:
unique_values = [line_values, type_station_values, weekday_values, crime_category_values, afluency_dist_values]

In [45]:
features_names = list(crimes_afluency_ft.columns)[:-1]
features_names

['line', 'type', 'weekday', 'id_crime_category', 'afluency_distribution']

In [46]:
simulated_df = simulated_data(unique_values, features_names, n = 100)

In [47]:
simulated_df

Unnamed: 0,line,type,weekday,id_crime_category,afluency_distribution
0,1,Terminal,0,6,20988.166667
1,8,Terminal / Transbordo,1,6,6991.000000
2,A,Terminal,3,6,16206.000000
3,A,Terminal,1,6,11810.500000
4,4,Terminal,2,6,34746.000000
...,...,...,...,...,...
95,7,Terminal / Transbordo,2,6,13366.000000
96,A,Transbordo,2,6,13551.000000
97,8,Terminal,4,6,8374.000000
98,9,Transbordo,6,6,4352.666667


In [48]:
y_sim_preds = gbr.predict(simulated_df)

In [49]:
simulated_df_preds = pd.concat([simulated_df, pd.Series(y_sim_preds, name = 'pred_crime_frequency')], axis = 1)

In [50]:
simulated_df_preds[(simulated_df_preds['line'] == '2')]

Unnamed: 0,line,type,weekday,id_crime_category,afluency_distribution,pred_crime_frequency
6,2,Transbordo,6,6,7666.714286,4.230584
9,2,Terminal,5,6,27045.0,0.221582
23,2,Transbordo,2,6,30984.25,8.78663
82,2,Intermedia,5,6,18352.166667,12.073677
87,2,Transbordo,5,6,28831.333333,8.37968


In [51]:
validation_df_preds = pd.concat([X_val.reset_index(), pd.Series(y_val_preds, name = 'pred_crime_frequency')], axis = 1)

In [52]:
validation_df_preds[(validation_df_preds['line'] == '2')]

Unnamed: 0,index,line,type,weekday,id_crime_category,afluency_distribution,pred_crime_frequency
2,68041,2,Intermedia,5,6,26160.625000,13.544258
12,143651,2,Transbordo,5,6,15677.000000,6.011029
29,203641,2,Intermedia,4,6,13698.312500,12.142034
52,214251,2,Transbordo,1,6,18296.833333,7.587454
157,149051,2,Transbordo,4,6,0.000000,5.339413
...,...,...,...,...,...,...,...
8679,80246,2,Terminal,3,6,92905.500000,0.522069
8710,5846,2,Terminal,2,6,104438.000000,0.522069
8711,51251,2,Transbordo,5,6,27386.000000,7.959806
8727,19641,2,Intermedia,1,6,28233.375000,14.097487


## Export model

In [53]:
import pickle

In [54]:
with open('crimes_regressor.pkl', 'wb') as f:
    pickle.dump(gbr, f)