In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib.style as style

%matplotlib inline
style.use('seaborn-notebook')

In [2]:
# Load Model
import pickle
filename = '../models/finalized_model.sav'
m = pickle.load(open(filename, 'rb'))
m

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
       learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=7, missing=nan, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=2.3, seed=None,
       silent=True, subsample=1, verbosity=1)

In [3]:
# Generate new data
input_filepath = '../data/raw/'
sar_header = ['DEM', 'date', 'responders', 'hours', 'miles']
sar_data = pd.read_csv(f'{input_filepath}KCSARA.csv', header=None, 
                       parse_dates=['date'], names=sar_header)
sar_data = pd.DataFrame(sar_data)
sar_data.head()

date_range = pd.date_range(start='1/1/2019', end='4/15/2019')
clean_table = []

for d in date_range:
    if sar_data.date.isin([d]).any():
        clean_table.append([d,1])
    else:
        clean_table.append([d,0])
               
sar_clean = pd.DataFrame(clean_table)
sar_clean.columns = ['date','mission']
test = sar_clean
test.head()

Unnamed: 0,date,mission
0,2019-01-01,1
1,2019-01-02,0
2,2019-01-03,0
3,2019-01-04,0
4,2019-01-05,0


In [4]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

add_datepart(test, 'date', drop=False)
test.head()

Unnamed: 0,date,mission,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2019-01-01,1,2019,1,1,1,1,1,False,True,False,True,False,True,1546300800
1,2019-01-02,0,2019,1,1,2,2,2,False,False,False,False,False,False,1546387200
2,2019-01-03,0,2019,1,1,3,3,3,False,False,False,False,False,False,1546473600
3,2019-01-04,0,2019,1,1,4,4,4,False,False,False,False,False,False,1546560000
4,2019-01-05,0,2019,1,1,5,5,5,False,False,False,False,False,False,1546646400


In [5]:
test.mission.mean()

0.3523809523809524

In [6]:
boeing = pd.read_csv(f'{input_filepath}boeing.csv', parse_dates=[2])
gardner = pd.read_csv(f'{input_filepath}gardner.csv', parse_dates=[2])
weather = pd.merge(boeing, gardner, how='inner', on='DATE')
weather = weather.fillna(0)
test = pd.merge(test, weather, how='inner', left_on='date', right_on='DATE')

In [7]:
test.drop(['date', 'STATION_x', 'NAME_x', 'STATION_y','NAME_y', 'WDF2', 'WDF5', 'WSF2', 'WSF5', 'PGTM', 'FMTM'], axis=1, inplace=True)
interim_filepath = '../data/interim/'


In [8]:
test.to_csv(f'{interim_filepath}test_weather_df.csv', index=False)

In [9]:
import pandas.tseries.holiday as hol

us_cal = hol.USFederalHolidayCalendar()
dr = pd.date_range(start='2002-01-01', end='2019-05-01')
us_holidays = us_cal.holidays(start=dr.min(), end=dr.max())

# Add column holiday bool to data

test['holiday'] = 0
test['DATE'] =  pd.to_datetime(test['DATE'], infer_datetime_format=True,
                              format='datetime64[ns]')
test['holiday'] = test.DATE.isin(us_holidays)

# Insert the number of days before and after closest holiday

holiday = test.holiday
since = []
d = 0
for i in range(len(holiday)):
    d += 1
    if holiday[i]:
        d = 0   # if it's a holiday, reset
    since.append(d)
#data['holiday_days_since'] = since

before = []
d = 0
for i in range(len(holiday)):
    d += 1
    if holiday[len(holiday) - (i+1)]:
        d = 0   # if it's a holiday, reset
    before.append(d)
#data['holiday_days_before'] = before

test['holiday_closest'] = np.minimum(since, before)

In [10]:
raw_filepath = '../data/raw/'

trends = pd.read_csv(f'{raw_filepath}/googletrends_hiking.csv', header=None, names=['date_t', 'trend'])

trends['date_t'] =  pd.to_datetime(trends['date_t'], infer_datetime_format=True,
                              format='datetime64[ns]')

test = pd.merge(test, trends, how='outer', left_on='DATE', right_on='date_t')

test['trend'] = test.trend.interpolate(limit_direction='both')
test.drop('date_t', axis=1, inplace=True)

In [11]:
test.drop('DATE', axis=1, inplace=True)


test['holiday'] = test['holiday'].astype(bool)
test['Is_month_end'] = test['Is_month_end'].astype(bool)
test['Is_quarter_end'] = test['Is_quarter_end'].astype(bool)
test['Is_year_end'] = test['Is_year_end'].astype(bool)
test['Is_year_start'] = test['Is_year_start'].astype(bool)

to_remove =['WT08', 'Is_month_start', 'WT02', 'Is_quarter_start', 'WT05', 'TSUN']
test.drop(to_remove, axis=1, inplace=True)
test.dropna(inplace=True)

In [12]:
y_test = test['mission']
X_test = test.drop('mission', axis=1)
y_test.shape
m.predict(X_test)

array([1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       1., 1., 0.])

In [13]:
m.predict_proba(X_test)

array([[0.4993189 , 0.5006811 ],
       [0.5320902 , 0.46790984],
       [0.6381767 , 0.36182335],
       [0.62094724, 0.37905273],
       [0.39355272, 0.6064473 ],
       [0.5622722 , 0.4377278 ],
       [0.55571353, 0.44428644],
       [0.61663187, 0.3833681 ],
       [0.673285  , 0.32671502],
       [0.6653117 , 0.33468834],
       [0.5697073 , 0.4302927 ],
       [0.32358754, 0.67641246],
       [0.32983536, 0.67016464],
       [0.5185962 , 0.48140383],
       [0.51515627, 0.48484373],
       [0.54272056, 0.4572794 ],
       [0.6596545 , 0.34034547],
       [0.6707053 , 0.3292947 ],
       [0.4607234 , 0.5392766 ],
       [0.3354674 , 0.6645326 ],
       [0.6025838 , 0.39741617],
       [0.6838429 , 0.31615707],
       [0.69087195, 0.30912808],
       [0.6577412 , 0.34225884],
       [0.599999  , 0.400001  ],
       [0.30601943, 0.6939806 ],
       [0.37426412, 0.6257359 ],
       [0.5872369 , 0.41276315],
       [0.6016642 , 0.39833578],
       [0.59741485, 0.40258512],
       [0.

In [14]:
pd.set_option('display.max_rows', 1000)
y_test

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      1.0
10     0.0
11     1.0
12     1.0
13     1.0
14     0.0
15     1.0
16     0.0
17     0.0
18     1.0
19     1.0
20     0.0
21     1.0
22     0.0
23     0.0
24     0.0
25     0.0
26     1.0
27     0.0
28     1.0
29     0.0
30     0.0
31     1.0
32     1.0
33     0.0
34     0.0
35     0.0
36     1.0
37     1.0
38     1.0
39     0.0
40     0.0
41     0.0
42     1.0
43     1.0
44     0.0
45     0.0
46     0.0
47     0.0
48     1.0
49     0.0
50     0.0
51     0.0
52     0.0
53     1.0
54     0.0
55     0.0
56     0.0
57     0.0
58     0.0
59     0.0
60     1.0
61     1.0
62     1.0
63     1.0
64     0.0
65     0.0
66     0.0
67     0.0
68     0.0
69     1.0
70     0.0
71     0.0
72     0.0
73     1.0
74     0.0
75     1.0
76     0.0
77     0.0
78     0.0
79     0.0
80     0.0
81     1.0
82     1.0
83     0.0
84     0.0
85     0.0
86     1.0
87     1.0
88     1.0
89     1.0
90     0.0