In [21]:
from modeling import categorize_arrests, split_last
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import numpy as np

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
df = pd.read_sql_query("""
                        SELECT latitude, 
                                longitude, 
                                ap_t_high100, 
                                n_arrests
                        FROM manhattan_loc_d_ar_wea 
                        ;"""
                        , 'postgresql:///walk')

In [10]:
categorized_df = categorize_arrests(df)

In [11]:
# add combined lat/long location feature
categorized_df['latlong'] = (categorized_df['latitude'].astype(str) 
                            + categorized_df['longitude'].astype(str))

In [31]:
X_train, y_train, X_eval, y_eval = split_last(categorized_df, target_col='n_arrests',
                                                  sort_col=None, cut=.8)

In [32]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24889479 entries, 0 to 24889478
Data columns (total 4 columns):
latitude        float64
longitude       float64
ap_t_high100    int64
latlong         object
dtypes: float64(2), int64(1), object(1)
memory usage: 759.6+ MB


In [33]:
y_train.info()

AttributeError: 'Series' object has no attribute 'info'

## Nieve Log Loss as baseline

In [19]:
preds = np.zeros(y_train.shape)
preds[0]=1

In [20]:
log_loss(y_train, preds)

1.0406075360295546

## Modeling based on temp, lat/long

In [22]:
# train test split to allow for evaluation w/out the time needed for cross validation,
# also avoiding overfitting on evaluation data set via repeated trials
X_train, y_train, X_test, y_test = train_test_split(X_train, y_train)

### Random Forest w/out balanced class weights

In [23]:
rfc_unbalanced = RandomForestClassifier(n_estimators=100, max_depth=20)

In [28]:
unbal_column_transformer = ColumnTransformer( 
    transformers=[
    ('ohe', OneHotEncoder(categories='auto'), ['latlong']),
    ('pass', 'passthrough', ['latitude', 'longitude', 'ap_t_high100']),
    ], remainder='drop')

In [25]:
pipe_unbalanced = Pipeline([
                            ('preprocessor', unbal_column_transformer),
                            ('model', rfc_unbalanced)
                        ])

In [30]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6222370 entries, 22618179 to 193229
Data columns (total 4 columns):
latitude        float64
longitude       float64
ap_t_high100    int64
latlong         object
dtypes: float64(2), int64(1), object(1)
memory usage: 237.4+ MB


In [27]:
pipe_unbalanced.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
train_probas = pipe.predict_proba(X_train)
test_probas = pipe.predict_proba(X_test)

In [None]:
print(f"training log loss: {log_loss(y_train, train_probas)}") 
print(f"test log loss: {log_loss(y_test, test_probas)}")

In [None]:
test_data = categorized_df.head(2000).copy().drop(columns='n_arrests')
test_data['ap_t_high100'] = 5000
test_data

In [None]:
preds = pipe_unbalanced.predict_proba(test_data)

In [None]:
probas = pd.Series(preds[:, 1], name='Probability')

In [None]:
probas.hist()

In [None]:
probas.max()

In [None]:
joblib_pipeline(pipe_unbalanced, file_name='pipeline_no_balance_all_dates.joblib')