In [1]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import joblib
from sklearn.pipeline import  Pipeline
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
from utils import collect_error_metrics, get_model_acc_via_r2, get_spi_from_precip_col
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import set_config
import xgboost as xgb
# Show scikit-learn estimators as diagrams when they are displayed.
set_config(display='diagram')

# Current seaborn colour palette.
color_pal = sns.color_palette()

In [2]:
# DSCI readings, indexed by their 'MapDate' column (parsed as datetimes).
dsci_df = pd.read_csv('Data/DSCI_data.csv', index_col=['MapDate'], parse_dates=['MapDate'])
# Weather / SPI feature table for Tifton.
weather_data_df = pd.read_parquet('Data/Tifton_SPI_FE.parquet')

In [3]:
# List the columns available in the weather frame.
weather_data_df.columns

Index(['prcp_accum', 'air_temp_avg', 'smp_2', 'smp_4', 'smp_8', 'smp_20',
       'smp_40', 'soil_temp_2', 'soil_temp_4', 'soil_temp_8', 'soil_temp_20',
       'soil_temp_40', 'wind_dir_avg', 'wind_speed_avg', 'PRCP', 'year',
       'month', 'day', 'SPI', 'date', 'hour', 'dayofweek', 'weekday',
       'quarter', 'dayofyear', 'dayofmonth', 'weekofyear', 'date_offset',
       'week', 'season', 'soil_temp_avg', 'smp_avg'],
      dtype='object')

In [4]:
# Preview the weather frame (one row per day).
weather_data_df

Unnamed: 0_level_0,prcp_accum,air_temp_avg,smp_2,smp_4,smp_8,smp_20,smp_40,soil_temp_2,soil_temp_4,soil_temp_8,...,weekday,quarter,dayofyear,dayofmonth,weekofyear,date_offset,week,season,soil_temp_avg,smp_avg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-11-01,4.90,12.0,8.8,8.5,11.0,12.3,24.0,13.8,14.3,15.7,...,Saturday,4,306,1,44,781,44,Fall,16.16,12.92
2008-11-02,4.90,12.8,8.7,8.3,10.4,12.2,24.1,15.3,15.5,16.1,...,Sunday,4,307,2,44,782,44,Fall,16.72,12.74
2008-11-03,4.90,15.2,8.6,8.2,10.6,12.2,23.9,15.3,15.7,16.5,...,Monday,4,308,3,45,783,45,Fall,16.84,12.70
2008-11-04,4.90,16.9,8.5,8.1,10.0,11.7,23.9,17.1,17.4,17.7,...,Tuesday,4,309,4,45,784,45,Fall,17.90,12.44
2008-11-05,4.90,16.6,8.5,8.0,10.0,11.9,23.7,16.7,17.0,17.5,...,Wednesday,4,310,5,45,785,45,Fall,17.74,12.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-27,3.18,13.0,7.5,8.1,10.0,9.2,24.5,13.5,13.2,16.1,...,Monday,4,331,27,48,807,48,Fall,14.60,11.86
2017-11-28,3.18,10.5,7.1,7.9,10.5,8.8,24.7,13.2,12.9,15.6,...,Tuesday,4,332,28,48,808,48,Fall,14.26,11.80
2017-11-29,3.18,15.6,6.7,7.9,10.0,9.2,24.6,15.2,14.7,16.6,...,Wednesday,4,333,29,48,809,48,Fall,15.22,11.68
2017-11-30,3.18,17.4,7.0,7.7,9.4,8.8,24.3,15.7,15.3,17.4,...,Thursday,4,334,30,48,810,48,Fall,15.74,11.44


In [5]:
def set_flag_col(df, col_name, find_in, series):
    """Add a boolean membership column to ``df`` and return ``df``.

    Args:
        df: DataFrame to annotate. It is modified in place.
        col_name: Name of the column that receives the boolean flags.
        find_in: Name of the existing column whose values are looked up.
        series: Collection of values to test membership against.

    Returns:
        The same ``df`` object, where ``df[col_name]`` is True on every row
        whose ``df[find_in]`` value occurs in ``series``.
    """
    is_member = df[find_in].isin(series)
    df[col_name] = is_member
    return df

In [6]:
# Turn the DSCI date index into a Series so it can be used for the membership test below.
temp = dsci_df.index.to_series()

In [7]:
# Mark each weather row whose 'date' value is also a DSCI date.
weather_data_df = set_flag_col(
    weather_data_df,
    col_name='flag',
    find_in='date',
    series=temp,
)

In [8]:
# Spot-check which daily rows coincide with a DSCI date.
weather_data_df[['flag','date']]

Unnamed: 0_level_0,flag,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-11-01,False,2008-11-01
2008-11-02,False,2008-11-02
2008-11-03,False,2008-11-03
2008-11-04,False,2008-11-04
2008-11-05,False,2008-11-05
...,...,...
2017-11-27,False,2017-11-27
2017-11-28,True,2017-11-28
2017-11-29,False,2017-11-29
2017-11-30,False,2017-11-30


In [9]:
# Keep rows from 2008-12-28 onward (label-based slice on the date index).
# Presumably chosen to start just before the first DSCI date (2008-12-30) — confirm.
weather_data_df = weather_data_df.loc['2008-12-28':]

In [10]:
# Forward-fill gaps in the weather readings, then drop the helper 'flag'
# column, which no later cell shown here uses.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE: no weekly aggregation is applied here; the rows stay daily.
weather_data_df = weather_data_df.ffill()
weather_data_df = weather_data_df.drop(columns=['flag'])

In [11]:
# Check the trimmed frame (it should now start at 2008-12-28).
weather_data_df

Unnamed: 0_level_0,prcp_accum,air_temp_avg,smp_2,smp_4,smp_8,smp_20,smp_40,soil_temp_2,soil_temp_4,soil_temp_8,...,weekday,quarter,dayofyear,dayofmonth,weekofyear,date_offset,week,season,soil_temp_avg,smp_avg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-28,11.80,19.7,8.5,7.8,9.9,12.1,24.0,15.9,16.3,16.8,...,Sunday,4,363,28,52,908,52,Winter,16.36,12.46
2008-12-29,11.80,19.3,8.1,7.7,9.5,11.9,24.1,17.3,17.5,17.2,...,Monday,4,364,29,1,909,1,Winter,17.00,12.26
2008-12-30,11.80,17.0,7.5,7.5,9.5,11.8,23.6,14.2,14.8,16.3,...,Tuesday,4,365,30,1,910,1,Winter,15.78,11.98
2008-12-31,11.80,11.7,7.7,7.5,9.0,11.5,23.9,13.1,13.6,14.9,...,Wednesday,4,366,31,1,911,1,Winter,14.88,11.92
2009-01-01,11.80,12.0,7.2,7.1,9.0,11.5,23.9,12.0,12.7,14.1,...,Thursday,1,1,1,1,1081,1,Winter,14.16,11.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-27,3.18,13.0,7.5,8.1,10.0,9.2,24.5,13.5,13.2,16.1,...,Monday,4,331,27,48,807,48,Fall,14.60,11.86
2017-11-28,3.18,10.5,7.1,7.9,10.5,8.8,24.7,13.2,12.9,15.6,...,Tuesday,4,332,28,48,808,48,Fall,14.26,11.80
2017-11-29,3.18,15.6,6.7,7.9,10.0,9.2,24.6,15.2,14.7,16.6,...,Wednesday,4,333,29,48,809,48,Fall,15.22,11.68
2017-11-30,3.18,17.4,7.0,7.7,9.4,8.8,24.3,15.7,15.3,17.4,...,Thursday,4,334,30,48,810,48,Fall,15.74,11.44


In [12]:
# Rename the DSCI index from 'MapDate' to 'date' (mutates the index in place).
dsci_df.index.names = ['date']

In [13]:
# Preview the DSCI records with the renamed index.
dsci_df

Unnamed: 0_level_0,State,County,FIPS,DSCI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-12-30,GA,Tift County,13277,0
2009-01-06,GA,Tift County,13277,0
2009-01-13,GA,Tift County,13277,0
2009-01-20,GA,Tift County,13277,0
2009-01-27,GA,Tift County,13277,0
...,...,...,...,...
2018-11-27,GA,Tift County,13277,0
2018-12-04,GA,Tift County,13277,0
2018-12-11,GA,Tift County,13277,0
2018-12-18,GA,Tift County,13277,0


In [14]:
# Left-join on the row index so every weather day is kept; the DSCI columns
# are populated only where a DSCI date matches and are NaN elsewhere.
merge = weather_data_df.merge(
    dsci_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [15]:
# Inspect the join: DSCI columns are filled only on rows matching a DSCI date.
merge

Unnamed: 0_level_0,prcp_accum,air_temp_avg,smp_2,smp_4,smp_8,smp_20,smp_40,soil_temp_2,soil_temp_4,soil_temp_8,...,weekofyear,date_offset,week,season,soil_temp_avg,smp_avg,State,County,FIPS,DSCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-28,11.80,19.7,8.5,7.8,9.9,12.1,24.0,15.9,16.3,16.8,...,52,908,52,Winter,16.36,12.46,,,,
2008-12-29,11.80,19.3,8.1,7.7,9.5,11.9,24.1,17.3,17.5,17.2,...,1,909,1,Winter,17.00,12.26,,,,
2008-12-30,11.80,17.0,7.5,7.5,9.5,11.8,23.6,14.2,14.8,16.3,...,1,910,1,Winter,15.78,11.98,GA,Tift County,13277.0,0.0
2008-12-31,11.80,11.7,7.7,7.5,9.0,11.5,23.9,13.1,13.6,14.9,...,1,911,1,Winter,14.88,11.92,,,,
2009-01-01,11.80,12.0,7.2,7.1,9.0,11.5,23.9,12.0,12.7,14.1,...,1,1081,1,Winter,14.16,11.74,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-27,3.18,13.0,7.5,8.1,10.0,9.2,24.5,13.5,13.2,16.1,...,48,807,48,Fall,14.60,11.86,,,,
2017-11-28,3.18,10.5,7.1,7.9,10.5,8.8,24.7,13.2,12.9,15.6,...,48,808,48,Fall,14.26,11.80,GA,Tift County,13277.0,100.0
2017-11-29,3.18,15.6,6.7,7.9,10.0,9.2,24.6,15.2,14.7,16.6,...,48,809,48,Fall,15.22,11.68,,,,
2017-11-30,3.18,17.4,7.0,7.7,9.4,8.8,24.3,15.7,15.3,17.4,...,48,810,48,Fall,15.74,11.44,,,,


In [16]:
# Carry each DSCI reading forward onto the following days (at most 7 rows),
# so the days between two DSCI dates inherit the latest value. Rows before
# the first DSCI date have nothing to inherit and stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
merge = merge.ffill(limit=7)

In [17]:
# Verify that the DSCI columns were carried forward onto the following days.
merge

Unnamed: 0_level_0,prcp_accum,air_temp_avg,smp_2,smp_4,smp_8,smp_20,smp_40,soil_temp_2,soil_temp_4,soil_temp_8,...,weekofyear,date_offset,week,season,soil_temp_avg,smp_avg,State,County,FIPS,DSCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-28,11.80,19.7,8.5,7.8,9.9,12.1,24.0,15.9,16.3,16.8,...,52,908,52,Winter,16.36,12.46,,,,
2008-12-29,11.80,19.3,8.1,7.7,9.5,11.9,24.1,17.3,17.5,17.2,...,1,909,1,Winter,17.00,12.26,,,,
2008-12-30,11.80,17.0,7.5,7.5,9.5,11.8,23.6,14.2,14.8,16.3,...,1,910,1,Winter,15.78,11.98,GA,Tift County,13277.0,0.0
2008-12-31,11.80,11.7,7.7,7.5,9.0,11.5,23.9,13.1,13.6,14.9,...,1,911,1,Winter,14.88,11.92,GA,Tift County,13277.0,0.0
2009-01-01,11.80,12.0,7.2,7.1,9.0,11.5,23.9,12.0,12.7,14.1,...,1,1081,1,Winter,14.16,11.74,GA,Tift County,13277.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-27,3.18,13.0,7.5,8.1,10.0,9.2,24.5,13.5,13.2,16.1,...,48,807,48,Fall,14.60,11.86,GA,Tift County,13277.0,100.0
2017-11-28,3.18,10.5,7.1,7.9,10.5,8.8,24.7,13.2,12.9,15.6,...,48,808,48,Fall,14.26,11.80,GA,Tift County,13277.0,100.0
2017-11-29,3.18,15.6,6.7,7.9,10.0,9.2,24.6,15.2,14.7,16.6,...,48,809,48,Fall,15.22,11.68,GA,Tift County,13277.0,100.0
2017-11-30,3.18,17.4,7.0,7.7,9.4,8.8,24.3,15.7,15.3,17.4,...,48,810,48,Fall,15.74,11.44,GA,Tift County,13277.0,100.0


In [18]:
# Write the merged frame to parquet.
# NOTE(review): the rows previewed above are still daily although the filename says 'weekly' — confirm this is intended.
merge.to_parquet('Data/Tifton_weekly_with_DSCI.parquet')