# Build training data and engineer new features

In [1]:
import numpy as np
import pandas as pd
import psycopg2 as pg
import datetime as dt
import ast
import os
import string
import re
import pickle
import yaml
import math
import matplotlib.pyplot as plt

from lib.geography import haversine, build_highway_features_two_way, build_highway_features_one_way
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

### Inputs and connection string

In [5]:
# model configurations

file_args = yaml.load(open('./conf/pipeline_args.yml','r'))

In [3]:
# create db connection objects

conn_str_file = './conf/db_conn_str.txt'

pg_conn_str = open(conn_str_file, 'r').read()
conn = pg.connect(pg_conn_str)
cur = conn.cursor()

OperationalError: could not connect to server: Connection refused (0x0000274D/10061)
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused (0x0000274D/10061)
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?


### Load SQL table to memory via random sampling

In [4]:
cur.execute("""
SELECT count(*) FROM segments_time_level_selected WHERE NOT level_mean IS NULL
""")

results = cur.fetchone()
positive_count = results[0]
print('Number of segments selected: '+str(positive_count))

Number of segments selected: 1720515


In [12]:
negative_positive_ratio = 4

In [13]:
sampled_df = pd.read_sql("""(SELECT * FROM segments_time_level_selected WHERE NOT level_mean IS NULL) UNION 
                        (SELECT * FROM segments_time_level_selected WHERE level_mean IS NULL ORDER BY RANDOM() LIMIT """ + 
                        str(positive_count*negative_positive_ratio) + ');',
                        con=conn)

In [14]:
sampled_df.head()

Unnamed: 0,segment_id,street,road_type,lat1,lon1,lat2,lon2,time_id,date,day_of_week,month,time,level_min,level_max,level_mean,level_count
0,18,Adams Ave,2,32.762914,-117.128252,32.763334,-117.126578,3,2017-06-07,2,6,11:30:00,,,,
1,18,Adams Ave,2,32.762914,-117.128252,32.763334,-117.126578,7,2017-04-13,3,4,11:00:00,3.0,3.0,3.0,1.0
2,18,Adams Ave,2,32.762914,-117.128252,32.763334,-117.126578,8,2017-02-16,3,2,20:30:00,,,,
3,18,Adams Ave,2,32.762914,-117.128252,32.763334,-117.126578,17,2017-04-10,0,4,16:30:00,,,,
4,18,Adams Ave,2,32.762914,-117.128252,32.763334,-117.126578,25,2017-03-26,6,3,02:30:00,,,,


### Checkpoint (so I don't have to run SQL query again)

In [None]:
pickle.dump(sampled_df, open('./data/train_test/sampled_df.pkl', 'wb'), protocol=4)

#### Reload sampled_df

In [6]:
sampled_df = pickle.load(open('./data/train_test/sampled_df.pkl', 'rb'))

In [7]:
sampled_df['segment_id'].nunique()

6687

In [19]:
sampled_df['date'].nunique()

KeyError: 'datetime'

## Process the data:
1. replace all na with zero
1. add 'level_binary' column
1. set 'time' column
1. add number of days since earliest date
1. add number of minutes since midnight
1. encode categorical data to numeric using sklearn's labelencoder
1. One hot encode road_type

In [48]:
#Sort by date
sampled_df.sort_values('date')

Unnamed: 0,segment_id,street,road_type,lat1,lon1,lat2,lon2,time_id,date,day_of_week,month,time,level_min,level_max,level_mean,level_count
5461221,113063,SR-15 S,3,32.714173,-117.118001,32.713381,-117.118226,3915,2017-02-08,2,2,16:00:00,,,,
4331775,88723,Exit 3A: I-8 E,4,32.761832,-117.164269,32.761950,-117.164001,3777,2017-02-08,2,2,18:00:00,4.0,4.0,4.000000,3.0
5297154,110051,Camino del Rio West,6,32.752876,-117.204823,32.752997,-117.204840,2251,2017-02-08,2,2,20:30:00,2.0,2.0,2.000000,1.0
2685929,55481,Linda Vista Rd,7,32.764677,-117.197472,32.765102,-117.197123,884,2017-02-08,2,2,23:30:00,,,,
3012949,61479,Exit 15: N Park Way / University Ave,4,32.750655,-117.123520,32.750059,-117.123438,399,2017-02-08,2,2,18:30:00,4.0,4.0,4.000000,3.0
924759,21611,Linda Vista Rd,7,32.768653,-117.192299,32.768816,-117.191700,2923,2017-02-08,2,2,19:00:00,2.0,2.0,2.000000,1.0
6952085,142385,W Mission Bay Dr,7,32.760837,-117.229255,32.761125,-117.229729,1021,2017-02-08,2,2,23:00:00,,,,
1155400,25820,Front St,7,32.720894,-117.164763,32.719876,-117.164755,2923,2017-02-08,2,2,19:00:00,,,,
5806658,119779,Upas St,2,32.741104,-117.131931,32.741104,-117.132018,2251,2017-02-08,2,2,20:30:00,,,,
3733337,75562,Florida Dr,2,32.725538,-117.142779,32.726712,-117.142760,2923,2017-02-08,2,2,19:00:00,,,,


In [49]:
#Replace na values with zeros for assumption of no congestion
level_cols = [c for c in sampled_df.columns if c.startswith('level')]

for c in level_cols:
    sampled_df[c].fillna(0, inplace=True)

In [50]:
# create 'target' column and set it to appropriate value based on input
print('creating level_binary column...')

sampled_df['level_binary'] = 0
sampled_df['level_binary'][sampled_df['level_mean'] != 0] = 1

creating level_binary column...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [51]:
# add date_idx for number of days since earliest date
print('adding date_idx for number of days since earliest date...')
td = pd.to_datetime(sampled_df['date']) - pd.to_datetime(sampled_df.date.min())
date_idx_vals = (td / np.timedelta64(1, 'D')).astype(int)
sampled_df['date_idx'] = date_idx_vals

adding date_idx for number of days since earliest date...


In [52]:
# add time_idx for number of minutes since midnight
print('adding time_idx for number of minutes since midnight...')
time_idx_vals = list(map(lambda t: t.hour*60 + t.minute, sampled_df['time'].values))
sampled_df['time_idx'] = time_idx_vals

adding time_idx for number of minutes since midnight...


In [53]:
# encode categorical data using label encoder - do not encode date and time
print('encoding categorical columns as numeric...')

le = preprocessing.LabelEncoder()

for col in ['day_of_week','month']:
    print('processing {} column'.format(col))
    sampled_df[col] = le.fit_transform(sampled_df[col])

encoding categorical columns as numeric...
processing day_of_week column
processing month column


In [54]:
# one hot encode the road_type column. this is necessary because they are not necessarily numeric columns

sampled_df = pd.concat([sampled_df,pd.get_dummies(sampled_df['road_type'],prefix='road_type')],axis=1)
sampled_df.drop(columns=['road_type'], inplace=True)

# Add features

## Create features for major highways

### Query all unique street names

In [4]:
cur.execute('ROLLBACK')

NameError: name 'cur' is not defined

In [85]:
cur.execute("""
SELECT street, SUM(seg_length)*10000, count(*) FROM segments 
GROUP BY street
""")
results = cur.fetchall()
with open('./data/street_names/street_names.txt','w') as f:
    for item in sorted(results):
        f.write(item[0]+'\t'+str(item[1])+'\t'+str(item[2])+'\n')

In [17]:
cur.execute("""
SELECT street, SUM(seg_length)*10000, count(*) FROM segments 
WHERE NOT street LIKE '%Ave' AND NOT street LIKE '%Rd' 
AND NOT street LIKE '%St' AND NOT street LIKE '%Dr' AND NOT street LIKE '%Way'
AND NOT street LIKE '%Cir' AND NOT street LIKE '%Pl' AND NOT street LIKE '%Blvd'
AND NOT street LIKE '%Trl' AND NOT street LIKE '%Street' AND NOT street LIKE '%Av'
AND NOT street LIKE '%Ln' AND NOT street LIKE '%Tr' AND NOT street LIKE '%Ter'
AND NOT street LIKE '%Ct' AND NOT street LIKE '%Pkwy' AND NOT street LIKE '%Drive'
GROUP BY street
""")
results = cur.fetchall()
with open('./data/street_names/street_names_no_suffix.txt','w') as f:
    for item in sorted(results):
        f.write(item[0]+'\t'+str(item[1])+'\t'+str(item[2])+'\n')

In [18]:
cur.execute("""
SELECT street, SUM(seg_length)*10000, count(*) FROM segments
WHERE street LIKE '%N' OR street LIKE '%E' OR street LIKE '%S' OR street LIKE '%W'
GROUP BY street
""")
results = cur.fetchall()
with open('./data/street_names/street_names_nesw.txt','w') as f:
    for item in sorted(results):
        f.write(item[0]+'\t'+str(item[1])+'\t'+str(item[2])+'\n')

### Query major streets with high traffic

In [86]:
high_traffic_df = sampled_df.loc[sampled_df['level_max'] > 2].groupby('street')['street'].count().sort_values(ascending=False)[:200]
high_traffic_df.to_csv('./data/train_test/processed_training_data.pkl')

### Create fields for major highways (and streets)

In [26]:
highways = ['I-5 N','I-8 E','I-15 N','I-805 N','SR-15 N','SR-52 E','SR-54 E',
           'SR-56 E','SR-67 N','SR-75 N','SR-94 E','SR-125 N','SR-163 N','SR-905 E',
           'I-5 S','I-8 W','I-15 S','I-805 S','SR-15 S','SR-52 W','SR-54 W',
           'SR-56 W','SR-67 S','SR-75 S','SR-94 W','SR-125 S','SR-163 S','SR-905 W']

In [None]:
if highway_features_two_way = True:
    sampled_df = build_highway_features_two_way(sampled_df)

In [None]:
if highway_features_two_way = False:
    sampled_df = build_highway_features_one_way(sampled_df)

## Add average of previous time to segments

Averaging segments based on previous day of week and time. There are a variety of ways to do this including using different time buckets but sticking with this method for simplicity for now.

In [56]:
level_max_average = 'all'

In [58]:
# Sort by segment_id, day_of_week, time_idx to group segments within the same DoW and time bucket
#sampled_df = sampled_df.sort_values(['segment_id','day_of_week','time_idx','date']).copy(deep=True)
#sampled_df = sampled_df.reset_index()

if level_max_average == 'time series':
    sampled_df['level_max_average'] = sampled_df.groupby(['segment_id','day_of_week','time_idx'])['level_max'].cumsum() \
                            /(np.floor(sampled_df['date_idx']/7.0)+1)
    

if level_max_average == 'all':
    ndow = sampled_df.groupby(['day_of_week'])['date'].nunique()
    overall_level_max_average = sampled_df.groupby(['segment_id','day_of_week','time_idx'])['level_max'].sum()
    overall_level_max_average = overall_level_max_average.reset_index()
    overall_level_max_average['level_max_average'] = overall_level_max_average.apply(lambda x: x['level_max']/ndow[x['day_of_week']], axis=1)
    sampled_df = pd.merge(sampled_df, overall_level_max_average, how = 'left', on=['segment_id','day_of_week','time_idx'])


In [62]:
sampled_df.sort_values('level_max_average', ascending=False)

Unnamed: 0,index,segment_id,street,lat1,lon1,lat2,lon2,time_id,date,day_of_week,...,date_idx,time_idx,road_type_1,road_type_2,road_type_3,road_type_4,road_type_6,road_type_7,level_max_y,level_max_average
7796896,7796109,158569,to SR-163 N,32.725009,-117.155072,32.725138,-117.155198,1016,2017-06-08,3,...,120,900,0,0,0,1,0,0,72.0,4.0
3810855,3811034,77233,to SR-163 N,32.728364,-117.155284,32.728909,-117.155058,4128,2017-06-02,4,...,114,690,0,0,0,1,0,0,72.0,4.0
6162622,6163121,126757,to I-8 E,32.757798,-117.203912,32.758087,-117.203629,3182,2017-04-18,1,...,69,960,0,0,0,1,0,0,68.0,4.0
7111184,7111896,145232,to I-8 E / Morena Blvd,32.758955,-117.200735,32.759102,-117.200509,4126,2017-03-07,1,...,27,1020,0,0,0,1,0,0,68.0,4.0
6162621,6163118,126757,to I-8 E,32.757798,-117.203912,32.758087,-117.203629,3165,2017-04-11,1,...,62,960,0,0,0,1,0,0,68.0,4.0
6118171,6119228,125921,to I-8 E / Morena Blvd,32.758724,-117.201036,32.758955,-117.200735,5426,2017-02-28,1,...,20,990,0,0,0,1,0,0,68.0,4.0
6118172,6118519,125921,to I-8 E / Morena Blvd,32.758724,-117.201036,32.758955,-117.200735,2682,2017-03-07,1,...,27,990,0,0,0,1,0,0,68.0,4.0
6118173,6118261,125921,to I-8 E / Morena Blvd,32.758724,-117.201036,32.758955,-117.200735,1732,2017-03-14,1,...,34,990,0,0,0,1,0,0,68.0,4.0
1336683,1336224,29603,to I-8 E,32.756866,-117.204484,32.757289,-117.204272,1357,2017-02-23,3,...,15,930,0,0,0,1,0,0,72.0,4.0
1336682,1336752,29603,to I-8 E,32.756866,-117.204484,32.757289,-117.204272,3459,2017-02-16,3,...,8,930,0,0,0,1,0,0,72.0,4.0


## Add Events

### Add Collisions

In [110]:
collisions_df = pd.read_sql('SELECT * FROM collisions', con=conn)

In [111]:
collisions_df.sort_values(by='PRIMARY_RD').groupby('PRIMARY_RD')['PRIMARY_RD'].count()\
.to_csv('./data/street_names/collision_df_street_names.txt')

In [12]:
collisions_df.columns

Index(['CASE_ID', 'ACCIDENT_YEAR', 'PROC_DATE', 'JURIS', 'COLLISION_DATE',
       'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'DAY_OF_WEEK',
       'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE',
       'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER',
       'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION',
       'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY',
       'CALTRANS_DISTRICT', 'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX',
       'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION', 'SIDE_OF_HWY',
       'TOW_AWAY', 'COLLISION_SEVERITY', 'NUMBER_KILLED', 'NUMBER_INJURED',
       'PARTY_COUNT', 'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL',
       'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION',
       'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION',
       'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING',
       'CONTROL_DEVICE', 'CHP_ROAD_TYPE', 'PEDESTRIAN_

### Add Non-Padres events

In [12]:
events_df = pd.read_sql('SELECT * FROM events', con=conn)
events_df['event_type'].unique()

array(['ATHLETIC', 'FESTIVAL', 'FARMERS', 'PARADES', 'EXHIBITS', 'MUSEUM',
       'CONCERTS'], dtype=object)

In [13]:
def clean_event_title(e):
    e_clean = e.translate(str.maketrans("","",string.punctuation)).replace(' ','_')
    return e_clean

In [19]:
# get events from dataframe
events_df = pd.read_sql('SELECT * FROM events', con=conn)
events_df['event_start'] = pd.to_datetime(events_df['event_start'])
events_df['event_end'] = pd.to_datetime(events_df['event_end'])

# modify str
events_df['exp_attendance'] = events_df['exp_attendance'].map(lambda x: re.sub('[^0-9]','',x))
events_df['exp_attendance'].loc[events_df['exp_attendance'] == ''] = np.NaN
events_df['exp_attendance'] = events_df['exp_attendance'].astype('float64')

# subset to events larger than event_attendance_threshold
events_of_interest = events_df[(events_df['exp_attendance']>=file_args['event_attendance_threshold']) & 
                                  events_df['event_type'].isin(['ATHLETIC','FESTIVAL','CONCERTS'])]

# add datetime column to data
sampled_df['datetime'] = sampled_df[['date','time']].apply(lambda row: dt.datetime.combine(row['date'], row['time']), axis=1)

# add event columns to data
event_features = sampled_df[['datetime']].copy()

# add columns for events
sampled_df['event_festival'] = 0
sampled_df['event_athletic'] = 0
sampled_df['event_concerts'] = 0
sampled_df['distance_festival'] = 0.0
sampled_df['distance_athletic'] = 0.0
sampled_df['distance_concerts'] = 0.0
sampled_df['exp_attendance'] = 0

# set values for padres_event column to 1 if event occurring
event_start_window_before = 2
event_start_window_after = 0.5
event_end_window_before = 0.5
event_end_window_after = 1

for index, row in events_of_interest.iterrows():
    
    # set active before/after game start time
    start = row['event_start'] - dt.timedelta(hours=event_start_window_before)
    end = row['event_start'] + dt.timedelta(hours=event_start_window_after)
    events_start_df = sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end)]
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'distance_'+row['event_type'].lower()] = \
                        events_start_df.apply(lambda x: max(1/haversine(row['latitude'],row['longitude'],x['lat1'],x['lon1']), x['distance_'+row['event_type'].lower()]), axis=1)
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end) & (sampled_df['distance_'+row['event_type'].lower()]>=1),'event_'+row['event_type'].lower()] = 1
    
    # set active before/after game end time
    start = row['event_end'] - dt.timedelta(hours=event_end_window_before)
    end = row['event_end'] + dt.timedelta(hours=event_end_window_after)
    events_end_df = sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end)]
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'distance_'+row['event_type'].lower()] = \
                        events_end_df.apply(lambda x: max(1/haversine(row['latitude'],row['longitude'],x['lat1'],x['lon1']), x['distance_'+row['event_type'].lower()]),axis=1)
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end) & (sampled_df['distance_'+row['event_type'].lower()]>=1), 'event_'+row['event_type'].lower()] = 1

# drop added datetime column
sampled_df.drop('datetime', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [20]:
sampled_df.loc[sampled_df['event_athletic'] == 1]

Unnamed: 0,segment_id,street,lat1,lon1,lat2,lon2,time_id,date,day_of_week,month,...,road_type_4,road_type_6,road_type_7,event_festival,event_athletic,event_concerts,distance_festival,distance_athletic,distance_concerts,exp_attendance
3625,64,W Broadway,32.715696,-117.165612,32.715701,-117.166489,2761,2017-06-04,6,4,...,0,0,1,0,1,0,0.000000,2.052882,0.0,0
3685,64,W Broadway,32.715696,-117.165612,32.715701,-117.166489,2979,2017-06-04,6,4,...,0,0,1,0,1,0,0.000000,2.052882,0.0,0
3839,64,W Broadway,32.715696,-117.165612,32.715701,-117.166489,3407,2017-06-04,6,4,...,0,0,1,0,1,0,0.000000,2.052882,0.0,0
3876,64,W Broadway,32.715696,-117.165612,32.715701,-117.166489,3510,2017-06-04,6,4,...,0,0,1,0,1,0,0.000000,2.052882,0.0,0
12746,214,1st Ave,32.724058,-117.163879,32.724058,-117.163878,2761,2017-06-04,6,4,...,0,0,0,0,1,0,0.000000,1.769154,0.0,0
12792,214,1st Ave,32.724058,-117.163879,32.724058,-117.163878,2979,2017-06-04,6,4,...,0,0,0,0,1,0,0.000000,1.769154,0.0,0
12897,214,1st Ave,32.724058,-117.163879,32.724058,-117.163878,3529,2017-06-04,6,4,...,0,0,0,0,1,0,0.090536,1.769154,0.0,0
13623,216,I-5 S,32.728239,-117.168790,32.725948,-117.166723,1103,2017-06-04,6,4,...,0,0,0,0,1,0,0.000000,3.402300,0.0,0
14095,216,I-5 S,32.728239,-117.168790,32.725948,-117.166723,2761,2017-06-04,6,4,...,0,0,0,0,1,0,0.000000,3.402300,0.0,0
14150,216,I-5 S,32.728239,-117.168790,32.725948,-117.166723,2979,2017-06-04,6,4,...,0,0,0,0,1,0,0.000000,3.402300,0.0,0


### Add Padres games

In [18]:
# get padres from database
padres_df = pd.read_sql('SELECT * FROM padres_games', con=conn)
padres_df['game_start'] = pd.to_datetime(padres_df['game_start'])
padres_df['game_end'] = pd.to_datetime(padres_df['game_end'])

# add datetime column to data
sampled_df['datetime'] = sampled_df[['date','time']].apply(lambda row: dt.datetime.combine(row['date'], row['time']), axis=1)

# add padres_game column to data
sampled_df['event_padres'] = 0
sampled_df['distance_padres'] = 0.0

# set values for padres_event column to 1 if padres game was occurring
padres_start_window_before = 2
padres_start_window_after = 0.5
padres_end_window_before = 0.5
padres_end_window_after = 1

petco_park_coordinates = (32.7076,-117.1570)

for index, row in padres_df.iterrows():
    # set active before/after game start time
    start = row['game_start'] - dt.timedelta(hours=padres_start_window_before)
    end = row['game_start'] + dt.timedelta(hours=padres_start_window_after)
    padres_sampled_df = sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end)]
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'event_padres'] = 1
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'distance_padres'] = \
                padres_sampled_df.apply(lambda x: haversine(32.707,-117.1570,x['lat1'],x['lon1']), axis=1)
        
    # set active before/after game end time
    start = row['game_end'] - dt.timedelta(hours=padres_end_window_before)
    end = row['game_end'] + dt.timedelta(hours=padres_end_window_after)
    padres_sampled_df = sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end)]
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'event_padres'] = 1
    sampled_df.loc[(sampled_df['datetime']>=start) & (sampled_df['datetime']<=end), 'distance_padres'] = \
                padres_sampled_df.apply(lambda x: haversine(32.707,-117.1570,x['lat1'],x['lon1']), axis=1)

# drop added datetime column
sampled_df.drop('datetime', axis=1, inplace=True)

## Delete unneeded features

In [None]:
# define features and target
print('subsetting data to date, time, features and target...')
features = ['date_idx','time_idx','day_of_week','month','segment_id','road_type','lat1','lon1','lat2','lon2']
targets = [c for c in sampled_df.columns if c.startswith('level')]

### create data structure with params and data and write to pickle file

In [24]:
# write clustered training data to database
if file_args['write_sql_db']:
    sampled_df.to_sql(name='processed_training_data', con=conn, 
                  if_exists='replace', index=False, chunksize=1000)

In [26]:
# write processed data to pickle file
if file_args['write_pickle_file']:
    pickle.dump(sampled_df, open('./data/train_test/processed_training_data.pkl', 'wb'), protocol=4)
    print('pickle file dump complete...')

pickle file dump complete...
