In [271]:
from model2 import *
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [255]:
df = pd.read_sql_query("""
                        SELECT latitude, 
                                longitude, 
                                ap_t_high100, 
                                n_arrests
                        FROM manhattan_loc_d_ar_wea 
                        ;"""
                        , 'postgresql:///walk')

In [256]:
categorized_df = categorize_arrests(df)

In [257]:
    # add combined lat/long location feature
    categorized_df['latlong'] = (categorized_df['latitude'].astype(str) 
                                + categorized_df['longitude'].astype(str))
    X_train, y_train, X_eval, y_eval = split_last(categorized_df, target_col='n_arrests',
                                                  sort_col=None, cut=.8)

    

## Nieve Log Loss as baseline

In [None]:
preds = np.zeros(y_train.shape)
preds[0]=1

In [None]:
log_loss(y_train, preds)

## Random Forest w/ OneHotEncoded latlong, latitude, longitude, daily high temp

In [259]:
column_transformer = ColumnTransformer( 
    transformers=[
    ('ohe', OneHotEncoder(categories='auto'), ['latlong']),
    ('ss', 'passthrough', ['latitude', 'longitude', 'ap_t_high100']),
    ], remainder='drop')

In [260]:
rfc = RandomForestClassifier(n_estimators=10, max_depth=20,
                            class_weight='balanced', max_features=2000)

In [261]:
pipe = Pipeline([
    ('preprocessor', column_transformer),
    ('model', rfc)
])

In [264]:
# log_loss_cvs(pipe, X_train, y_train) 

## Evaluation w/ temp, lat/long

In [265]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ohe',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['latlong']),
                                                 ('

In [266]:
train_probas = pipe.predict_proba(X_train)
test_probas = pipe.predict_proba(X_eval)
test_predict = pipe.predict(X_eval)

In [267]:
print(f"training log loss: {log_loss(y_train, train_probas)}") 
print(f"test neg log loss: {log_loss(y_eval, test_probas)}")
print(f"confusion matrix: \n{confusion_matrix(y_eval, test_predict)}")
print(f"accuracy: {accuracy_score(y_eval, test_predict)}")
print(f"F1: {f1_score(y_eval, test_predict)}")

training log loss: 0.44835417606880607
test neg log loss: 0.4555676287549664
confusion matrix: 
[[518571 158280]
 [  3275   7882]]
accuracy: 0.7651844164602737
F1: 0.08890192252381303


In [272]:
joblib_pipeline(pipe)