In [165]:
import pandas as pd
import numpy as np

import datetime as dt

import holidays
from calendra.asia import Qatar
from calendra.asia import HongKong

from sklearn.pipeline import Pipeline

In [163]:
import pandas as pd
import numpy as np
import datetime as dt

class DataRetrieval():
    
    #def __init__(self):

    
    def get_calls_data(self, filepath='../data/Calls_Table_data.csv', delimiter='\t'):
        """Retrieves call data from filepath"""
        df = pd.read_csv(filepath, delimiter='\t', encoding='utf-16')
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)        
        return df
    
    def get_weather_data(self, filepath='../data/historical_weather.csv'):
        """Retrieves weather data from filepath"""
        df = pd.read_csv(filepath)
        df['date'] = pd.to_datetime(df['DATE']).dt.date
        weather_hist = (df[['date', 'TMAX', 'PRCP', 'SNOW']]
                        .rename(columns={'PRCP':'precip', 'TMAX':'temp_max',
                                         'PRCP':'precip', 'SNOW':'snow'}))
        weather_hist['precip^2'] = weather_hist['precip']**2
        weather_hist['snow^2'] = weather_hist['snow']**2      
        return weather_hist
    
    def get_seahawks_schedule(self, filepath='../data/seahawks_schedule.csv'):
        """
        Retrieves Seahawks game schedule from filepath
        
        Will need to rewrite scraper used to retrieve this data
        """
        df_in = pd.read_csv(filepath)
        df_dropna = df_in.copy()[df_in['Opp'].notna()].reset_index(drop=True)
        df = df_dropna.copy()[df_dropna['Opp'] != 'Bye Week'].reset_index(drop=True)
        
        
        df['date'] = ''
        for i in range(len(df)):
            df['date'][i] = (f"{df.iloc[i]['Date']}, {df.iloc[i]['year']}")
            
        df['date'] = pd.to_datetime(df['date']).dt.date
        
        df['seahawks_game'] = ''
        for i in range(len(df)):
            if type(df.iloc[i]['Unnamed: 10']) == str:
                if df.iloc[i]['Week'] == 'SuperBowl':
                    df['seahawks_game'][i] = 'SuperBowl'
                elif df.iloc[i]['Week'] in ['Wild Card', 'Division','Conf. Champ.']:
                    df['seahawks_game'][i] = 'away_Playoffs'
                else:
                    df['seahawks_game'][i] = 'away_Regular'
            else:     
                if df.iloc[i]['Week'] == 'SuperBowl':
                    df['seahawks_game'][i] = 'SuperBowl'
                elif df.iloc[i]['Week'] in ['Wild Card', 'Division','Conf. Champ.']:
                    df['seahawks_game'][i] = 'home_Playoffs'
                else:
                    df['seahawks_game'][i] = 'home_Regular'
        return df[['date', 'seahawks_game']].copy()
    
    def get_huskies_schedule(self, filepath='../data/huskies_schedule.csv'):
        """
        Retrieves Huskies game schedule from filepath

        Will need to rewrite scraper used to retrieve this data
        """
        df_in = pd.read_csv(filepath)
        df = df_in.copy()
        df['date'] = pd.to_datetime(df['Date']).dt.date

        df['huskies_game'] = ''
        for i in range(len(df)):
            if type(df.iloc[i]['Unnamed: 6']) == str:
                df['huskies_game'][i] = 'away'
            else:
                df['huskies_game'][i] = 'home'
        return df[['date','huskies_game']].copy()

    def get_sounders_schedule(self, filepath='../data/sounders_schedule.csv'):
        """
        Retrieves Sounders FC game schedule from filepath

        Will write instructions for getting game data
        """
        df_in = pd.read_csv(filepath)
        df = df_in.copy()
        df['date'] = pd.to_datetime(df_in['date_dd_mm_yy']).dt.date
        
        df['sounders_game'] = ''
        for i in range(len(df)):
            if df['home_team'][i].split()[0] == 'Seattle':
                df['sounders_game'][i] = 'home'
            else:
                df['sounders_game'][i] = 'away'
        return df[['date','sounders_game']].drop_duplicates()



In [502]:
import pandas as pd
import numpy as np
import datetime as dt


class CountCalls():
    """Counts calls by date either by city or neighborhood"""
    
    #def __init__(self):
    
    def fit(self, dataframe, how='city'):
        self.dataframe = dataframe
        self.how = how
        return self
    
    def transform(self):
        
        if self.how == 'city':
            df = self.dataframe[['ORIG_TIME_QUEUED', 'EVENT']].copy()
            df['date'] = pd.to_datetime(df['ORIG_TIME_QUEUED']).dt.date
            df.drop('ORIG_TIME_QUEUED', axis=1, inplace=True)
            return df.groupby('date').count().rename(columns={'EVENT':'num_calls'}).reset_index()
        
        else:
            df = self.dataframe[['NEIGHBORHOOD', 'ORIG_TIME_QUEUED', 'EVENT']].copy()
            df['date'] = pd.to_datetime(df['ORIG_TIME_QUEUED']).dt.date
            df.drop('ORIG_TIME_QUEUED', axis=1, inplace=True)
            counts = df.groupby(['NEIGHBORHOOD', 'date']).count().rename(columns=
                                                                         {'NEIGHBORHOOD':'neighborhood',
                                                                          'EVENT':'num_calls'}).reset_index()
            
            neighborhoods = list(counts['NEIGHBORHOOD'].unique())
            num_days = int(np.timedelta64((max(counts['date']) - min(counts['date'])), 'D')/np.timedelta64(1,'D'))+1
            start = pd.to_datetime(min(counts['date']))
            neighboor_arr = np.array([(neighborhoods*num_days)])
            neighboor_arr = neighboor_arr.flatten()
            dates = [(start + np.timedelta64(i,'D')) for i in range(num_days)]*len(neighborhoods)
            
            df2 = pd.DataFrame({"dt_time": dates})
            df2['date'] = df2["dt_time"].dt.date
            df2['neighborhood'] = neighboor_arr
            df3 = pd.merge(df2, counts, how='outer', left_on=['date','neighborhood'],
                           right_on=['date','NEIGHBORHOOD']).fillna(0)
            return df3[['date', 'neighborhood', 'num_calls']]
        

class FeaturizeCalls():
    """Clean incoming df to fit into model"""
    
    #def __init__(self):
    
    def fit(self, dataframe):
        self.dataframe = dataframe
        return self

    
    def transform(self):
        """tranform and clean incoming training or test"""
        df = self.dataframe.copy()      

        num_days = int(np.timedelta64((max(df['date']) 
                                        - min(df['date'])), 'D')/np.timedelta64(1,'D'))
        #day_seq won't work on neighborhoods
        df['day_seq'] = np.arange(num_days+1)
        df['dt_time'] = pd.to_datetime(df['date'])
        df['date'] = df['dt_time'].dt.date
        df['year'] = df['dt_time'].dt.year
        df['month'] = df['dt_time'].dt.month
        df['day'] = df['dt_time'].dt.day
        df['day_of_week'] = df['dt_time'].dt.weekday
        df['month_day'] = df['dt_time'].dt.strftime('%m/%d')
        df['month_weekday'] = df['dt_time'].dt.strftime('%b_%a')
        df['month'] = df['dt_time'].dt.strftime('%m/%d')        
        return df

#use new classes for dummies in order to use Pipeline

class HolidayDummies():
    
    def fit(self, holiday_dict):
        self.holiday_dict = holiday_dict
        return self

    def transform(self):
        _holidays = []
        for date in sorted(self.holiday_dict.keys()):
            for name in self.holiday_dict[date]:
                _holidays.append([date, name])
        _holidays = pd.DataFrame(_holidays, columns=['date', 'holiday'])
        return pd.get_dummies(_holidays.set_index('date')).reset_index()
    
    
class EventDummies():
    
    def fit(self, event_dict=None):
        self.event_dict = event_dict            
            
        events_dict = ({
            'Pride Parade' : ['6/30/2019', '6/24/2018', '6/25/2017', '6/26/2016', '6/28/2015',
                              '6/29/2014', '6/30/2013', '6/24/2012', '6/26/2011', '6/27/2010'],
            'Seafair' : ['8/2/2019', '8/3/2019', '8/4/2019', '8/3/2018', '8/4/2018', '8/5/2018',
                         '8/4/2017', '8/5/2017', '8/6/2017', '8/5/2016', '8/6/2016', '8/7/2016',
                         '7/31/2015', '8/1/2015', '8/2/2015', '8/1/2014', '8/2/2014', '8/3/2014',
                         '8/2/2013', '8/3/2013', '8/4/2013', '8/3/2012', '8/4/2012', '8/5/2012',
                         '8/5/2011', '8/6/2011', '8/7/2011', '8/6/2010', '8/7/2010', '8/8/2010' ],
            'Soltice Parade': ['6/30/2019', '6/16/2018', '6/17/2017', '6/18/2016', '6/20/2015',
                               '6/21/2014', '6/22/2013', '6/16/2012', '6/18/2011', '6/19/2010'],
            'Womens March' : ['1/19/2019', '1/20/2018', '1/21/2017'],})
        
        if self.event_dict == None:
            self.event_dict = events_dict
        return self
    
    def transform(self):
        _events = []
        for event in self.event_dict.keys():
            for day in self.event_dict[event]:
                _events.append([dt.datetime.strptime(day, '%m/%d/%Y'), event])
                
        _events = pd.DataFrame(_events, columns=['date','local_event'])
        _events['date'] = _events['date'].dt.date
        return pd.get_dummies(_events.set_index('date')).reset_index()
        

class JoinDataFrames():        

    def fit(self, dataframe, new_data):
        self.dataframe = dataframe
        self.new_data = new_data
        return self
    
    def transform(self):
        return self.dataframe.join(self.new_data.set_index('date'), on='date')

In [476]:
len(df2), len(neighboor_arr), len(dates)

(188505, 188505, 188505)

In [23]:
import pandas as pd
import numpy as np
import datetime as dt

import holidays
from calendra.asia import Qatar
from calendra.asia import HongKong

class SeattleHolidays:
    
    
    class CustomHolidays(holidays.US):
        def _populate(self, year=2019, start_year=2009, end_year=2030):
            # Populate the holiday list with the default US holidays
            holidays.US._populate(self, year)
            # Example: Add Ninja Turtle Day
            #self[dt.date(year, 7, 13)] = "Ninja Turtle Day"
            for year in range(start_year, end_year):
                # Add Valentine's day
                self[dt.date(year, 2, 14)] = "Valentines Day"
                # Add St Patricks Day
                self[dt.date(year, 3, 17)] = "St Patricks Day"
                # Add Easter
                self[holidays.easter(year=year)] = "Easter"
                # Add Good Friday
                self[holidays.easter(year=year)  -  dt.timedelta(days=2)] = "Good Friday"
                # Add Christmas Eve
                self[dt.date(year, 12, 24)] = "Christmas Eve"
                # Add New Years Eve
                self[dt.date(year, 12, 31)] = "New Years Eve"
                # Add Chinese New Year
                chinese = HongKong()
                for date, label in chinese.get_chinese_new_year(year):
                    self[date] = label


    class IslamicHolidays(holidays.HolidayBase):
        qatar_holidays = Qatar()
        def _populate(self, year=2019, start_year=2009, end_year=2030):
            # Populate the holiday list with blank base holidays
            holidays.HolidayBase._populate(self, year)
            for year in range(start_year, end_year):
                days = qatar_holidays.get_calendar_holidays(year)
                # Add Ramadan
                for i in range(1, len(days)):
                    if qatar_holidays.get_calendar_holidays(year)[i][1] == 'Start of ramadan':
                        for day in range(30):
                            self[qatar_holidays.get_calendar_holidays(year)[i][0] 
                                 + dt.timedelta(days=day-1)] = "Ramadan"
                    else:
                        self[qatar_holidays.get_calendar_holidays(2018)[1][0] 
                             - dt.timedelta(days=2)] = qatar_holidays.get_calendar_holidays(year)[i][1]


    class JewishHolidays(holidays.HolidayBase):
        def retrieve_data(self, filepath):
            df = pd.read_csv(filepath)
            return df

        def get_holidays(self, paths_list):
            df = self.retrieve_data(paths_list[0])
            for filepath in paths_list[1 : ]:
                cal = self.retrieve_data(filepath)
                df = pd.concat([df, cal])
            return df

        def _populate(self, year=2019, paths_list=['../data/hebcal_2010_usa.csv',
                                                   '../data/hebcal_2015_usa.csv', 
                                                   '../data/hebcal_2020_usa.csv',
                                                   '../data/hebcal_2025_usa.csv'],
                      start_year=2009, end_year=2030):
            hebcal = self.get_holidays(paths_list)
            hebcal.reset_index(drop=True, inplace=True)
            hebcal['date'] = pd.to_datetime(hebcal["Start Date"]).dt.date
            # Populate the holiday list with blank base holidays
            holidays.HolidayBase._populate(self, year)
            for year in range(start_year, end_year):
                for i in range(len(hebcal)):
                    self[hebcal['date'][i]] = hebcal['Subject'][i]


In [208]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ('counter', CountCalls()),
    ('feturizer', FeaturizeCalls()),
    ('holiday_dummifier', HolidayDummies()),
    ('event_dummifier', EventDummies()),
    ('data_joiner', JoinDataFrames())
])


In [427]:
neighborhoods = list(test['NEIGHBORHOOD'].unique())
num_days = int(np.timedelta64((max(test['date']) - min(test['date'])), 'D')/np.timedelta64(1,'D'))+1
start = pd.to_datetime(min(test['date']))
neighboor_arr = np.array([(neighborhoods*num_days)])
neighboor_arr = neighboor_arr.flatten()
dates = [(start + np.timedelta64(i,'D')) for i in range(num_days)]*len(neighborhoods)


df = pd.DataFrame({"dt_time": dates})
df['date'] = df["dt_time"].dt.date
df['neighborhood'] = neighboor_arr


z = pd.merge(df, test, how='outer', left_on=['date','neighborhood'], right_on=['date','NEIGHBORHOOD']).fillna(0)

z[['date', 'neighborhood', 'num_calls']]

In [503]:
retriever = DataRetrieval()

In [504]:
calls = retriever.get_calls_data()

In [505]:
len(calls)

96208

In [506]:
calls

Unnamed: 0,BEAT,CALL_CODE,CALL_DESC,CALL GROUP,YEAR,MAP COLOR,CALL VOLUME,Current Year Mark,Select View,Boundary Selection,...,Neighborho,precinct (MCPP MAP.shp),Precinct,sector (BEAT MAP.shp),st area sh (MCPP MAP.shp),St Area Sh,st area sh (BEAT MAP.shp),st length (MCPP MAP.shp),St Length,st length (BEAT MAP.shp)
0,R3,082,DV - DOMESTIC VIOLENCE (ARREST DISCRETIONARY),DOMESTIC DISTURBANCE/VIOLENCE,2010,S,1,Previous Year,Monthly View,COLUMBIA CITY,...,COLUMBIA CITY,S,S,R,1.295459e+07,5.095194e+08,8.914460e+07,15714.863481,141935.732322,54424.916209
1,N2,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,N,1,Previous Year,Monthly View,BITTERLAKE,...,BITTERLAKE,N,N,N,9.231481e+07,9.711400e+08,4.819913e+07,43678.575571,135039.609019,31293.433337
2,N3,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,N,1,Previous Year,Monthly View,NORTHGATE,...,NORTHGATE,N,N,N,1.326893e+08,9.711400e+08,3.775017e+07,51398.038115,135039.609019,26476.939081
3,F2,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,SW,1,Previous Year,Monthly View,SOUTH DELRIDGE,...,SOUTH DELRIDGE,SW,SW,F,3.577942e+06,5.470075e+08,3.222655e+07,8996.661825,128227.674069,25020.762799
4,Q3,080,DV - DOMESTIC THREATS BY PHONE OR WRITING,DOMESTIC DISTURBANCE/VIOLENCE,2010,W,1,Previous Year,Monthly View,SLU/CASCADE,...,SLU/CASCADE,W,W,Q,3.165731e+07,3.903963e+08,2.505669e+07,29369.502273,115256.614048,23207.296025
5,W1,087,DV - ENFORCE COURT ORDER (ARREST MANDATED),DOMESTIC DISTURBANCE/VIOLENCE,2010,SW,1,Previous Year,Monthly View,ALKI,...,ALKI,SW,SW,W,5.775714e+07,5.470075e+08,1.492775e+08,55869.551220,128227.674069,68345.256069
6,B3,085,DV - SERVICE OF COURT ORDER,DOMESTIC DISTURBANCE/VIOLENCE,2010,N,1,Previous Year,Monthly View,FREMONT,...,FREMONT,N,N,B,2.820270e+07,9.711400e+08,5.961233e+07,27946.255412,135039.609019,36973.284197
7,R2,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,S,1,Previous Year,Monthly View,CLAREMONT/RAINIER VISTA,...,CLAREMONT/RAINIER VISTA,S,S,R,1.278125e+07,5.095194e+08,4.714219e+07,18261.885153,141935.732322,37970.778527
8,S3,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,S,1,Previous Year,Monthly View,RAINIER BEACH,...,RAINIER BEACH,S,S,S,1.919211e+07,5.095194e+08,6.056041e+07,21134.006142,141935.732322,50181.257862
9,S1,081,"DV - ARGUMENTS, DISTURBANCE (NO ARREST)",DOMESTIC DISTURBANCE/VIOLENCE,2010,S,1,Previous Year,Monthly View,MID BEACON HILL,...,MID BEACON HILL,S,S,S,5.094619e+07,5.095194e+08,5.988522e+07,38538.828754,141935.732322,41643.418137


In [507]:
counter = CountCalls()

In [508]:
counter.fit(calls, 'neighborhood')

<__main__.CountCalls at 0xac3ac60b8>

In [488]:
df = calls[['NEIGHBORHOOD', 'ORIG_TIME_QUEUED', 'EVENT']].copy()
df['date'] = pd.to_datetime(df['ORIG_TIME_QUEUED']).dt.date
df.drop('ORIG_TIME_QUEUED', axis=1, inplace=True)

In [499]:
counts = df.groupby(['NEIGHBORHOOD', 'date']).count().rename(columns=
                                                             {'NEIGHBORHOOD':'neighborhood',
                                                              'EVENT':'num_calls'}).reset_index()
num_days = int(np.timedelta64((max(counts['date']) - min(counts['date'])), 'D')/np.timedelta64(1,'D'))+1
start = pd.to_datetime(min(counts['date']))

In [500]:
neighborhoods = list(counts['NEIGHBORHOOD'].unique())
neighboor_arr = np.array([(neighborhoods*num_days)])

In [501]:
len(counts), len(df), num_days, len(neighboor_arr)


(65781, 96208, 3195, 1)

96208

In [None]:
def _transform(self):

        df = self.dataframe[['NEIGHBORHOOD', 'ORIG_TIME_QUEUED', 'EVENT']].copy()
        df['date'] = pd.to_datetime(df['ORIG_TIME_QUEUED']).dt.date
        df.drop('ORIG_TIME_QUEUED', axis=1, inplace=True)
        counts = df.groupby(['NEIGHBORHOOD', 'date']).count().rename(columns=
                                                                     {'NEIGHBORHOOD':'neighborhood',
                                                                      'EVENT':'num_calls'}).reset_index()

        neighborhoods = list(counts['NEIGHBORHOOD'].unique())
        num_days = int(np.timedelta64((max(counts['date']) - min(counts['date'])), 'D')/np.timedelta64(1,'D'))+1
        start = pd.to_datetime(min(counts['date']))
        neighboor_arr = np.array([(neighborhoods*num_days)])
        dates = [(start + np.timedelta64(i,'D')) for i in range(num_days)]*len(neighborhoods)

        df2 = pd.DataFrame({"dt_time": dates})
        df2['date'] = df2["dt_time"].dt.date
        df2['neighborhood'] = neighboor_arr
        df3 = pd.merge(df2, counts, how='outer', left_on=['date','neighborhood'],
                       right_on=['date','NEIGHBORHOOD']).fillna(0)
        return df3[['date', 'neighborhood', 'num_calls']]

In [509]:
calls_xfrmd = counter.transform()

In [510]:
calls_xfrmd

Unnamed: 0,date,neighborhood,num_calls
0,2010-01-01,ALASKA JUNCTION,1.0
1,2010-01-02,ALKI,0.0
2,2010-01-03,BALLARD NORTH,1.0
3,2010-01-04,BALLARD SOUTH,0.0
4,2010-01-05,BELLTOWN,0.0
5,2010-01-06,BITTERLAKE,1.0
6,2010-01-07,BRIGHTON/DUNLAP,1.0
7,2010-01-08,CAPITOL HILL,1.0
8,2010-01-09,CENTRAL AREA/SQUIRE PARK,0.0
9,2010-01-10,CHINATOWN/INTERNATIONAL DISTRICT,0.0


In [438]:
featurizer = FeaturizeCalls()

In [439]:
featurizer.fit(df)

<__main__.FeaturizeCalls at 0x14fd90198>

In [440]:
featurizer.transform()

ValueError: Length of values does not match length of index

In [151]:
weather_df = retriever.get_weather_data()

In [131]:
seahawks_schedule = retriever.get_seahawks_schedule()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [146]:
huskies_schedule = retriever.get_huskies_schedule()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [152]:
sounders_schedule = retriever.get_sounders_schedule()

In [82]:
cf = FeaturizeCalls()

In [83]:
cf.fit(df)

<__main__.FeaturizeCalls at 0x12b04c2e8>

In [84]:
calls_df = cf.transform()

In [156]:
holiday_test = SeattleHolidays.CustomHolidays()

In [157]:
holiday_test._populate()

In [158]:
event_dummies = EventDummies()

In [159]:
event_dummies.fit()

<__main__.EventDummies at 0x140f389b0>

In [160]:
event_dummies.transform()

Unnamed: 0,date,local_event_Pride Parade,local_event_Seafair,local_event_Soltice Parade,local_event_Womens March
0,2019-06-30,1,0,0,0
1,2018-06-24,1,0,0,0
2,2017-06-25,1,0,0,0
3,2016-06-26,1,0,0,0
4,2015-06-28,1,0,0,0
5,2014-06-29,1,0,0,0
6,2013-06-30,1,0,0,0
7,2012-06-24,1,0,0,0
8,2011-06-26,1,0,0,0
9,2010-06-27,1,0,0,0
