In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import mean_squared_error
# Default seaborn palette; individual colours are picked from it for the plots below.
color_pal = sns.color_palette()
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures that follow.
plt.style.use('fivethirtyeight')

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Read the parking CSV, index it by its 'Date' column, and parse that index
# into datetimes so the time-based slicing and features below work.
df = pd.read_csv('/kaggle/input/data-fixed/Parking_Data_Fixed.csv').set_index('Date')
df.index = pd.to_datetime(df.index)

In [None]:
# First look at the raw data: every column drawn as points against the time index.
df.plot(
    title='Availability',
    style='.',
    color=color_pal[0],
    figsize=(10, 5),
)
plt.show()

In [None]:
# plt.bar("Date", "Available", xlabel="Players", ylabel="Goal Scored" ,data = df, color = "blue")
# plt.show()

# Train / Test Split

In [None]:
# Chronological hold-out split: rows strictly before the cutoff train the model,
# rows on or after it are held out for testing.
# NOTE(review): the original literal '01-06-2020' is parsed month-first by pandas
# (6 January 2020), while a later split cell in this notebook uses '06-01-2020'
# (1 June 2020). The parsed value is kept here; confirm which date is intended.
split_date = pd.Timestamp('2020-01-06')

train = df.loc[df.index < split_date]
test = df.loc[df.index >= split_date]

fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
# Dashed vertical line marks the boundary between the two sets.
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()

In [None]:
# Zoom in on a single week of data (1-8 January 2018, both bounds exclusive).
# The previous upper bound '08-01-2018' is parsed month-first by pandas as
# 1 August 2018, which selected seven months instead of the week named in the title.
week_start = pd.Timestamp('2018-01-01')
week_end = pd.Timestamp('2018-01-08')
df.loc[(df.index > week_start) & (df.index < week_end)] \
    .plot(figsize=(15, 5), title='Week Of Data')
plt.show()

# Feature Creation

In [None]:
def create_features(df):
    """
    Return a copy of ``df`` with calendar columns derived from its index.

    The index must expose datetime accessors (e.g. a ``DatetimeIndex``).
    Added columns: hour, dayofweek, quarter, month, year, dayofyear,
    dayofmonth and weekofyear (ISO week number). The input frame is left
    untouched.
    """
    featured = df.copy()
    stamps = featured.index

    # Time-of-day and position within the week.
    featured['hour'] = stamps.hour
    featured['dayofweek'] = stamps.dayofweek

    # Coarser calendar position.
    featured['quarter'] = stamps.quarter
    featured['month'] = stamps.month
    featured['year'] = stamps.year

    # Day counters and ISO week number.
    featured['dayofyear'] = stamps.dayofyear
    featured['dayofmonth'] = stamps.day
    featured['weekofyear'] = stamps.isocalendar().week
    return featured

# Add the calendar feature columns to the full dataset (used by the plots below).
df = create_features(df)

# Visualize our Feature / Target Relationship

In [None]:
# Spread of the 'Occupied' target for each value of the 'Day' column.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Day', y='Occupied', data=df, ax=ax)
ax.set_title('Time vs Day')
plt.show()

In [None]:
# Spread of the 'Occupied' target for each calendar month.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='month', y='Occupied', palette='Blues', ax=ax)
# The plotted quantity is 'Occupied'; the former 'MW' label did not match this data.
ax.set_title('Occupied by Month')
plt.show()

# Create our Model

In [None]:
# Columns fed to the model and the column it predicts.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'Occupied'

# The split was taken before features were added to `df`, so add them to each part.
train = create_features(train)
test = create_features(test)

X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

In [None]:
# Gradient-boosted regression trees on the calendar features.
# 'reg:squarederror' is the current name of the squared-error objective; the
# former alias 'reg:linear' is deprecated by XGBoost.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1000,
                       early_stopping_rounds=50,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
# Early stopping watches the last eval_set entry, which is the test split here.
# NOTE(review): the test data therefore chooses the number of trees, so the test
# RMSE reported later is optimistic; consider a separate validation split.
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)

# Feature Importance

In [None]:
# One row per input feature, holding the importance score stored on the fitted model.
fi = pd.DataFrame({'importance': reg.feature_importances_},
                  index=reg.feature_names_in_)
# Horizontal bars, least important feature at the bottom.
fi.sort_values('importance').plot(title='Feature Importance', kind='barh')
plt.show()

# Forecast on Test

In [None]:
# Predict the hold-out rows and attach the predictions to the full frame.
test['prediction'] = reg.predict(X_test)
# Drop any 'prediction' column left by an earlier run of this cell; otherwise the
# merge would emit 'prediction_x'/'prediction_y' and the lookup below would fail.
df = df.drop(columns=['prediction'], errors='ignore') \
    .merge(test[['prediction']], how='left', left_index=True, right_index=True)
# Truth as a line, predictions as points (NaN outside the test period).
ax = df[['Occupied']].plot(figsize=(15, 5))
df['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
ax.set_title('Raw Data and Prediction')
plt.show()

In [None]:
# Compare truth and prediction over one week.
# Predictions exist only for the rows of `test` (the hold-out period), so the
# window is anchored at the first test timestamp. The previous fixed April 2018
# window lies before the 2020 split and therefore contained no predictions.
week_start = test.index.min()
week_end = week_start + pd.Timedelta(days=7)
in_week = (df.index >= week_start) & (df.index < week_end)

# Single .loc[rows, column] lookups instead of chained indexing; both series are
# drawn on the same axes explicitly.
ax = df.loc[in_week, 'Occupied'].plot(figsize=(15, 5), title='Week Of Data')
df.loc[in_week, 'prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data','Prediction'])
plt.show()

# Score (RMSE)

In [None]:
# Root-mean-squared error between the held-out truth and the model's predictions.
score = np.sqrt(
    mean_squared_error(y_true=test['Occupied'], y_pred=test['prediction'])
)
print(f'RMSE Score on Test set: {score:0.2f}')

# Calculate Error
#Look at the worst and best predicted days

In [None]:
# Absolute error per row, then the ten calendar days with the largest mean error.
test['error'] = (test[TARGET] - test['prediction']).abs()
test['date'] = test.index.date
test.groupby(['date'])['error'].mean().sort_values(ascending=False).head(10)

# Next Steps
#More robust cross validation
#Add more features (weather forecast, holidays)

In [None]:
# Plot only the target series. By this point `df` also carries the calendar
# feature columns and the merged 'prediction' column, so plotting the whole frame
# would draw all of them under the 'Occupancy' title.
df['Occupied'].plot(style='.',
                    figsize=(15, 5),
                    color=color_pal[0],
                    title='Occupancy')
plt.show()

In [None]:
# Distribution of the target values, in 500 bins.
df['Occupied'].plot.hist(bins=500)

In [None]:
# Chronological hold-out split at 1 June 2020 ('06-01-2020' is parsed month-first
# by pandas). A single unambiguous Timestamp replaces the literal repeated three times.
split_date = pd.Timestamp('2020-06-01')

train = df.loc[df.index < split_date]
test = df.loc[df.index >= split_date]

# `df` carries feature columns by now; plot only the target so that the two
# legend entries correspond to the two lines actually drawn.
fig, ax = plt.subplots(figsize=(15, 5))
train[['Occupied']].plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test[['Occupied']].plot(ax=ax, label='Test Set')
ax.axvline(split_date, color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Cross-validation splitter: 3 folds, each validating on 8*365 rows, with 8 rows
# left out between the end of a training window and the start of its validation window.
# NOTE(review): 8*365 presumably means 8 samples per day for one year - confirm.
tss = TimeSeriesSplit(gap=8, n_splits=3, test_size=8*365*1)
# The splitter cuts by row position, so put the rows in chronological order first.
df = df.sort_index()

In [None]:
# One subplot row per cross-validation fold. The row count follows the splitter
# instead of a hard-coded 5, which left empty axes for a 3-fold split.
n_folds = tss.get_n_splits()
fig, axs = plt.subplots(n_folds, 1, figsize=(15, 15), sharex=True)

for fold, (train_idx, val_idx) in enumerate(tss.split(df)):
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    train['Available'].plot(ax=axs[fold],
                          label='Training Set',
                          title=f'Data Train/Test Split Fold {fold}')
    test['Available'].plot(ax=axs[fold],
                         label='Test Set')
    # Dashed line marks where this fold's validation window begins.
    axs[fold].axvline(test.index.min(), color='black', ls='--')
plt.show()

# The End

In [None]:
def create_features(df):
    """
    Build calendar features from the frame's datetime index.

    Works on a copy, so the caller's frame is unchanged. Adds the columns
    hour, dayofweek, quarter, month, year, dayofyear, dayofmonth and
    weekofyear (ISO week number), in that order.

    NOTE(review): this repeats, and shadows, the identically named function
    defined earlier in the notebook.
    """
    out = df.copy()
    index = out.index
    # (column name, index attribute) pairs, in the order the columns are added.
    direct_fields = (
        ('hour', 'hour'),
        ('dayofweek', 'dayofweek'),
        ('quarter', 'quarter'),
        ('month', 'month'),
        ('year', 'year'),
        ('dayofyear', 'dayofyear'),
        ('dayofmonth', 'day'),
    )
    for column, attribute in direct_fields:
        out[column] = getattr(index, attribute)
    # The ISO week number comes from the isocalendar() table, not a plain attribute.
    out['weekofyear'] = index.isocalendar().week
    return out

# Recompute the calendar feature columns on the (now sorted) full dataset.
df = create_features(df)

In [None]:
def add_lags(df):
    """
    Return a copy of ``df`` with two lagged-target columns.

    ``lag1`` and ``lag2`` hold the ``'Occupied'`` value found at exactly 364 and
    728 days before each row's timestamp; rows with no matching earlier
    timestamp get NaN. Both offsets are multiples of 7 days, so a lagged value
    falls on the same weekday as the row it is attached to.

    The input frame is not modified, matching ``create_features``; callers
    must use the returned frame.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()
    # timestamp -> target value lookup used to resolve the shifted timestamps.
    target_map = df['Occupied'].to_dict()
    df['lag1'] = (df.index - pd.Timedelta('364 days')).map(target_map)
    df['lag2'] = (df.index - pd.Timedelta('728 days')).map(target_map)
    return df

In [None]:
# Attach the lag1/lag2 lagged-target columns to the full dataset.
df = add_lags(df)

In [None]:
# Time-series cross-validation of the lag-feature model.
tss = TimeSeriesSplit(n_splits=3, test_size=8*365*1, gap=8)
df = df.sort_index()

# Loop-invariant configuration, defined once instead of on every fold.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year',
            'lag1','lag2']
TARGET = 'Occupied'

preds = []   # per-fold prediction arrays
scores = []  # per-fold RMSE
for train_idx, val_idx in tss.split(df):
    train = create_features(df.iloc[train_idx])
    test = create_features(df.iloc[val_idx])

    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    # 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    # NOTE(review): early stopping watches the last eval_set entry, i.e. the same
    # validation fold that is scored below, so the fold RMSE is slightly optimistic.
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)

    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [None]:
# Mean RMSE over the folds, followed by the individual fold values.
print('Score across folds {:0.4f}'.format(np.mean(scores)))
print('Fold scores:{}'.format(scores))

In [None]:
# Retrain on all data
df = create_features(df)

FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
            'lag1','lag2']
TARGET = 'Occupied'

X_all = df[FEATURES]
y_all = df[TARGET]

# 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
# NOTE(review): n_estimators=1500000 with no early stopping builds 1.5 million
# trees (the cross-validated models above were capped at 1000 rounds). This looks
# unintended - confirm the value, e.g. against the best iteration seen in CV.
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=1500000,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01)
# eval_set holds the training data only, so the logged metric is training error.
reg.fit(X_all, y_all,
        eval_set=[(X_all, y_all)],
        verbose=100)

In [None]:
# Latest timestamp present in the data.
# NOTE(review): the hard-coded start of the future range below presumably follows this value - confirm.
df.index.max()

In [None]:
# Quarter-hourly timestamps from 2021-06-03 through 2022-06-03.
future  = pd.date_range(start='2021-06-03 00:00:00', end='2022-06-03 23:59:59', freq='15min')

# Keep only the slots from 07:15 to 09:00 (both ends inclusive) on each day.
time_range = future[future.indexer_between_time('07:15:00', '09:00:00')]

In [None]:
# Display the filtered future timestamps.
time_range

In [None]:
# Create future dataframe

# Flag the historical rows, append one empty row per future timestamp, then
# compute calendar features and lags over the combined index so that future rows
# can look up earlier 'Occupied' values.
df['isFuture'] = False
future_df = pd.DataFrame({'isFuture': True}, index=time_range)
df_and_future = add_lags(create_features(pd.concat([df, future_df])))

In [None]:
# Display the combined historical + future frame.
df_and_future

In [None]:
# Keep only the rows flagged isFuture; .copy() gives an independent frame that
# the 'pred' column is added to later.
future_w_features = df_and_future.query('isFuture').copy()

In [None]:
# Display the future rows with their feature and lag columns.
future_w_features

In [None]:
# Forecast the future rows with the current `reg` model, using the FEATURES columns.
future_w_features['pred'] = reg.predict(future_w_features[FEATURES])

In [None]:
# Line plot of the forecast over the future timestamps.
future_w_features['pred'].plot(
    title='Future Predictions',
    figsize=(10, 5),
    color=color_pal[4],
    lw=1,
    ms=1,
)
plt.show()

In [None]:
# Write the future rows (features, lags and 'pred') to a CSV in the working directory.
future_w_features.to_csv('Future_Prediction_New.csv')

In [None]:
# Persist the fitted model to 'model.json' in the working directory.
reg.save_model('model.json')

In [None]:
# List the working directory with human-readable sizes (shell command).
!ls -lh

In [None]:
# Round-trip check: load the saved model into a fresh regressor and redo the
# future forecast with it.
reg_new = xgb.XGBRegressor()
reg_new.load_model('model.json')

future_inputs = future_w_features[FEATURES]
future_w_features['pred'] = reg_new.predict(future_inputs)
future_w_features['pred'].plot(
    title='Future Predictions',
    figsize=(10, 5),
    color=color_pal[4],
    lw=1, ms=1,
)