In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

ValueError: Mismatched version between the Python package and the native shared object.  Python package version: 1.7.2. Shared object version: 1.5.0. Shared object is loaded from: /Users/charlesmeyer/opt/anaconda3/envs/Drought-Detection-Modeling/lib/libxgboost.dylib.
Likely cause:
  * XGBoost is first installed with anaconda then upgraded with pip. To fix it please remove one of the installations.

In [None]:
# Load the Tifton SPI parquet file from a path relative to the working directory.
# NOTE(review): later cells compare df.index against date strings, so the index
# is presumably a DatetimeIndex — confirm against the file.
df = pd.read_parquet('./Data/Tifton_SPI_FE.parquet')

In [None]:
# Seaborn's current colour palette; individual entries are picked by index in the plots below.
color_pal = sns.color_palette()

In [None]:
# Quick schema check: column names, dtypes and non-null counts.
df.info()

In [None]:
# Distribution of SPI per value of the 'year' column, one box each.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x='year', y='SPI', data=df, ax=ax)
ax.set(title='SPI by year')
plt.show()

In [None]:
# Full SPI series drawn as individual points.
df['SPI'].plot(title="SPI", style='.', color=color_pal[0], figsize=(15, 5))
plt.show()

In [None]:
# Hold-out split at the 01-01-2015 cutoff: rows before it train, rows from it onward test.
train = df[df.index < '01-01-2015']
test = df[df.index >= '01-01-2015']

# Overlay both segments on one axis and mark the cutoff with a dashed line.
fig, ax = plt.subplots(figsize=(15, 5))
train['SPI'].plot(ax=ax, title='Data Train/Test Split', label='Training Set')
test['SPI'].plot(ax=ax, label='Test Set')
ax.axvline('01-01-2015', ls='--', color='black')
ax.legend(['Training Set', 'Test Set'])
plt.show()

In [None]:
# Zoom in on SPI for index values strictly between 01-01-2012 and 01-31-2012.
df.loc[
    (df.index > '01-01-2012') & (df.index < '01-31-2012'),
    ['SPI'],
].plot(title='Month Of Data', figsize=(15, 5))
plt.show()

In [None]:
# Remove the 'hour' column. errors='ignore' keeps the cell re-runnable: on a
# second execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['hour'], errors='ignore')

## Outlier Analysis

In [None]:
# Histogram of SPI values in 50 bins, used to eyeball the tails.
df['SPI'].plot.hist(bins=50)

In [None]:
# Plot only the observations with SPI above 1, as points.
df.loc[df['SPI'] > 1, 'SPI'].plot(
    title='Outliers',
    style='.',
    color=color_pal[5],
    figsize=(15, 5),
)

## Time series cross validation

In [None]:
# Time-ordered splitter: 5 folds, leaving a 24-row gap between the end of each
# training window and the start of its validation window.
tss = TimeSeriesSplit(n_splits=5, gap=24)
# The splitter works on row positions, so the frame must be in index order first.
df = df.sort_index()
# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# One panel per CV fold. The panel count comes from the splitter so the grid
# cannot drift out of sync with n_splits.
n_folds = tss.get_n_splits()
fig, axs = plt.subplots(n_folds, 1, figsize=(15, 15), sharex=True)
axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when n_folds == 1

for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    # Fold-local names: do not overwrite the global train/test date split.
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    fold_train['SPI'].plot(ax=axs[fold],
                           label='Training Set',
                           title=f'Data Train/Test Split Fold {fold}')
    fold_val['SPI'].plot(ax=axs[fold],
                         label='Test Set')
    # Dashed line marks where this fold's validation window begins.
    axs[fold].axvline(fold_val.index.min(), color='black', ls='--')
plt.show()

In [None]:
def add_lags(df, target='SPI', lag_days=(364, 728, 1092)):
    """Add lagged copies of a target column, looked up by timestamp.

    For each offset in ``lag_days`` a column ``lag1``, ``lag2``, ... is written
    holding the value ``target`` had exactly that many days before the row's
    index value. Rows whose shifted timestamp is not present in the index get
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports subtracting a ``pd.Timedelta`` (e.g. a
        DatetimeIndex). It is modified in place.
    target : str, default 'SPI'
        Column whose past values are looked up.
    lag_days : iterable of int, default (364, 728, 1092)
        Offsets in days, one output column per entry. The defaults are
        multiples of 364 = 52 * 7, presumably so lagged rows fall on the same
        weekday.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added.
    """
    # timestamp -> target value; a lookup by timestamp (unlike shift()) still
    # works when the index has gaps.
    target_map = df[target].to_dict()
    for n, days in enumerate(lag_days, start=1):
        df[f'lag{n}'] = (df.index - pd.Timedelta(days=days)).map(target_map)
    return df

In [None]:
# Attach the lag1/lag2/lag3 columns (SPI looked up 364/728/1092 days earlier).
df = add_lags(df)

In [None]:
# Show the first row vertically so every column name and value is visible.
df.head(1).T

In [None]:
# Model input columns and the prediction target.
# Order matters: the preprocessing cell takes FEATURES[:-15], i.e. every entry
# before 'PRCP', so the last 15 entries must stay at the end of the list.
FEATURES = ['Precipitation Accumulation  (In)',
            'Air Temperature Observed  (Degc)',
            'Air Temperature Maximum  (Degc)',
            'Air Temperature Minimum  (Degc)',
            'Air Temperature Average  (Degc)',
            'Soil Moisture Percent -2" (Pct)',
            'Soil Moisture Percent -4" (Pct)',
            'Soil Moisture Percent -8" (Pct)',
            'Soil Moisture Percent -20" (Pct)',
            'Soil Moisture Percent -40" (Pct)',
            'Soil Temperature Observed -2" (Degc)',
            'Soil Temperature Observed -4" (Degc)',
            'Soil Temperature Observed -8" (Degc)',
            'Soil Temperature Observed -20" (Degc)',
            'Soil Temperature Observed -40" (Degc)',
            'Salinity -2" (Gram)',
            'Salinity -4" (Gram)',
            'Salinity -8" (Gram)',
            'Salinity -20" (Gram)',
            'Salinity -40" (Gram)',
            'Wind Direction Average  (Degr)',
            'Wind Speed Average  (Mph)',
            # --- last 15 entries: excluded by the FEATURES[:-15] slice ---
            'PRCP',
            'year',
            'month',
            'day',
            'dayofweek',
            'weekday',
            'quarter',
            'dayofyear',
            'dayofmonth',
            'weekofyear',
            'date_offset',
            # 'lag1'..'lag3' are the columns created by add_lags()
            'season','lag1','lag2','lag3']
# Column the model is trained to predict.
TARGET = 'SPI'

In [None]:
# Rebuild the 5-fold / gap-24 splitter and reset the cross-validation bookkeeping.
tss = TimeSeriesSplit(gap=24, n_splits=5)
df = df.sort_index()
fold, preds, scores = 0, [], []

In [None]:
# Integer-encode the label columns. Each column gets its own encoder so its
# class mapping stays available afterwards (e.g. for inverse_transform);
# refitting one shared encoder would keep only the last column's mapping.
label_encoders = {}
for col in ('season', 'weekday', 'day'):
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])
# Keep `le` bound to the last-fitted encoder ('day'), as before.
le = label_encoders['day']

# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['date', 'PRCP_ECDF'], errors='ignore')
df.info()

In [None]:
# Re-create the hold-out split now that df carries lag and encoded columns:
# rows before 01-01-2015 train, rows from that date onward test.
train, test = (
    df[df.index < '01-01-2015'],
    df[df.index >= '01-01-2015'],
)

In [None]:
# Separate model inputs (FEATURES) from the target (TARGET) for each split.
X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]


In [None]:
# Set up pipeline
numeric_features = FEATURES[:-15]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
    ("num_transform", numeric_transformer, numeric_features),

])

In [None]:
# Full model: preprocessing -> 10-component PCA -> XGBoost regressor with default settings.
pca = PCA(n_components=10, svd_solver='full', random_state=42)
xgb_reg = xgb.XGBRegressor()
xgb_pipeline = Pipeline(
    steps=[
        ("preprocesser", preprocessor),
        ('pca', pca),
        ("clf", xgb_reg),
    ]
)
xgb_pipeline

In [None]:
# Fit the preprocessing, PCA and XGBoost steps in sequence on the training split.
xgb_pipeline.fit(X_train, y_train)

In [None]:
# The fitted regressor is the final "clf" step of xgb_pipeline; no standalone
# `reg` object exists in this notebook.
fitted_reg = xgb_pipeline.named_steps["clf"]
importances = fitted_reg.feature_importances_

# The regressor is fed the PCA output, so each importance belongs to a
# principal component (PC1, PC2, ...) rather than to an original column.
fi = pd.DataFrame(
    data=importances,
    index=[f'PC{i + 1}' for i in range(len(importances))],
    columns=['importance'],
)
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()