In [1]:
import sqlalchemy
import pandas as pd
import numpy as np

In [2]:
n_lats = (40.92-40.498)/.001 +1
lats = np.linspace(40.498, 40.92, int(n_lats))

n_longs = ((74.254)-73.681)/.001 + 1
longs = np.linspace(-74.254, -73.681, int(n_longs))

In [4]:
from weather import get_dates_list
from datetime import datetime

# get most recent date from NYC's year to date arrests
final_date_df = pd.read_csv('../nyc_ytd.csv', 
                    usecols=['ARREST_DATE'],
                    nrows=2)
final_date = datetime.strptime(final_date_df.loc[0, 'ARREST_DATE'], '%m/%d/%Y')

initial_date = datetime.strptime('01/01/2006', '%m/%d/%Y')

dates = get_dates_list(initial_date, final_date)


In [None]:
from itertools import product

# lat_long_df = pd.DataFrame(product(lats, longs))
lat_long_arr = np.array(list(product(lats, longs, dates)))

In [None]:
lat_long_arr.shape

## MVP

In [1]:
# from sqlalchemy import create_engine
# engine = create_engine('postgresql:///walk')

In [2]:
df = pd.read_sql_query("""
    SELECT * 
    FROM location_day_arrests_weather
    WHERE date > '2019-04-01';
    """, 'postgresql:///walk')

In [3]:
from sys import getsizeof
getsizeof(df)

2326918399

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = df.drop(columns=['n_arrests'])
y = df['n_arrests']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import time

In [149]:
class DateTransformer():
    """Transforms df so that there are columns representing the features of the date.
    """
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = pd.Series([pd.to_datetime(x, format="%Y-%m-%d") for x in X['date']])
        X_transf = pd.DataFrame()
        X_transf['year'] = [x.year for x in X]
        X_transf['month'] = [x.month for x in X]
        X_transf['day'] = [x.day for x in X]
        X_transf['weekday'] = [x.weekday() for x in X]
        return X_transf

In [150]:
x = X_train.loc[0, 'date']
pd.to_datetime(x, format="%Y-%m-%d")

Timestamp('2019-04-02 00:00:00')

In [312]:
class TimeOfDayTransformer():    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print(X.head())
        print(X.info())
        print(list(X)[0:5])
        structtime = [time.gmtime(x) for x in X]
        X_transf = [(x.tm_hour*60 + x.tm_min) for x in structtime]
        return X_transf

In [313]:
class StringTransformer():    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return np.array(X.to_string()).reshape(-1, 1)

In [314]:
# X_train.isna().sum()

In [315]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6006888 entries, 6454250 to 6723365
Data columns (total 19 columns):
latitude                 float64
longitude                float64
date                     object
ap_t_high100             int64
ap_t_low100              int64
cloud                    int64
humidity                 int64
icon                     object
moon_phase               int64
precip_inten_max10000    int64
precip_proba100          int64
precipType               object
pressure                 int64
sunriseTime              int64
sunsetTime               int64
uvIndex                  int64
wind_gust100             int64
precip_accum100          object
ozone10                  int64
dtypes: float64(2), int64(13), object(4)
memory usage: 1.1+ GB


In [316]:
X_train.head(5).T

Unnamed: 0,6454250,2366346,1388385,2158270,3123603
latitude,40.649,40.65,40.732,40.825,40.673
longitude,-73.956,-73.843,-73.897,-73.894,-73.773
date,2019-06-18,2019-04-30,2019-04-18,2019-04-27,2019-05-09
ap_t_high100,7362,5950,5996,5835,5848
ap_t_low100,6771,5203,5642,4594,5433
cloud,97,92,74,53,80
humidity,93,87,73,51,67
icon,rain,rain,partly-cloudy-day,rain,partly-cloudy-day
moon_phase,55,87,48,78,18
precip_inten_max10000,1321,285,3,286,0


In [317]:
X_train.columns

Index(['latitude', 'longitude', 'date', 'ap_t_high100', 'ap_t_low100', 'cloud',
       'humidity', 'icon', 'moon_phase', 'precip_inten_max10000',
       'precip_proba100', 'precipType', 'pressure', 'sunriseTime',
       'sunsetTime', 'uvIndex', 'wind_gust100', 'precip_accum100', 'ozone10'],
      dtype='object')

In [318]:
impute_zero_features = ['precip_accum100']
impute_zero_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

impute_mean_features = ['ozone10']
impute_mean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

date_transformer = Pipeline(steps=[
    ('date_expansion', DateTransformer()),
    ('ohe', OneHotEncoder(categories='auto'))
])

categorical_features = ['icon', 'precipType']
categorical_transformer = Pipeline(steps=[
    ('string', StringTransformer()),
    ('ohe', OneHotEncoder(categories='auto'))
])

time_of_day_features = ['sunriseTime', 'sunsetTime']
time_of_day_transformer = Pipeline(steps=[
    ('time', TimeOfDayTransformer()),
    ('scaler', StandardScaler())
])

other_numeric_features = ['latitude', 'longitude', 'ap_t_high100',
                         'ap_t_low100', 'cloud', 'humidity', 'moon_phase',
                          'precip_inten_max10000', 'precip_proba100', 'pressure',
                         'uvIndex', 'wind_gust100',]

preprocessor = ColumnTransformer( transformers=[
    ('missing0', impute_zero_transformer, impute_zero_features),
    ('missing_mean', impute_mean_transformer, impute_mean_features),
    ('date', date_transformer, ['date']),
    ('categoricals', categorical_transformer, categorical_features),
    ('sunrise', time_of_day_transformer, ['sunriseTime']),
    ('sunset', time_of_day_transformer, ['sunsetTime']),
    ('ss', StandardScaler(), other_numeric_features)
])

target_preprocessor = ColumnTransformer( transformers=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))
])



In [319]:
mini_X_train = X_train.head(1000).reset_index()

In [320]:
preprocessor.fit_transform(mini_X_train)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


   sunriseTime
0   1560849922
1   1556618200
2   1555582402
3   1556359236
4   1557395158
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
sunriseTime    1000 non-null int64
dtypes: int64(1)
memory usage: 7.9 KB
None
['sunriseTime']


TypeError: an integer is required (got type str)

In [322]:
time.gmtime(mini_X_train['sunriseTime'][0])

time.struct_time(tm_year=2019, tm_mon=6, tm_mday=18, tm_hour=9, tm_min=25, tm_sec=22, tm_wday=1, tm_yday=169, tm_isdst=0)

  (0, 0)	1.0
