In [41]:
import time
import os
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [42]:
# Load the raw throughput metrics CSV (path is relative to the working directory).
tp_data = pd.read_csv(filepath_or_buffer="data/throughput_metrics.csv")

In [43]:
# Preview the first five rows of the freshly loaded frame.
tp_data.head(n=5)

Unnamed: 0,Time,SiteA,SiteB,SiteC,SiteD,SiteE,SiteF
0,1/1/18 8:00,14110930000.0,1109243000.0,82898310.0,56650005.21,11178680.0,827420.9772
1,1/1/18 9:00,13453620000.0,1242256000.0,43757.39,49307351.26,7070847.0,913017.6007
2,1/1/18 10:00,12168880000.0,2006322000.0,43157.25,56843652.59,51328260.0,874471.3644
3,1/1/18 11:00,11231200000.0,1073181000.0,82771540.0,46645240.59,5217827.0,837246.7783
4,1/1/18 12:00,10780850000.0,864506900.0,33435.48,30966947.37,7495363.0,827143.7497


In [44]:
# (rows, columns) of the loaded frame.
tp_data.shape

(8760, 7)

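### Assumption: using all other columns to predict the throughput metric of SiteF

Note: the feature matrix built later in this notebook selects only the time-derived columns (hour, day, month, year); the other sites' columns are not used as inputs.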

In [45]:
# Summarise column dtypes and non-null counts to check for missing values.
tp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    8760 non-null   object 
 1   SiteA   8760 non-null   float64
 2   SiteB   8760 non-null   float64
 3   SiteC   8760 non-null   float64
 4   SiteD   8760 non-null   float64
 5   SiteE   8760 non-null   float64
 6   SiteF   8760 non-null   float64
dtypes: float64(6), object(1)
memory usage: 479.2+ KB


In [46]:
# "Time" is loaded as text (object dtype); parse it into datetimes with an
# explicit month/day/2-digit-year hour:minute format so parsing is unambiguous.
tp_data["Time"] = pd.to_datetime(
    arg=tp_data.Time,
    format="%m/%d/%y %H:%M",
)

In [47]:
# Check that "Time" now displays as full timestamps.
tp_data.head(n=5)

Unnamed: 0,Time,SiteA,SiteB,SiteC,SiteD,SiteE,SiteF
0,2018-01-01 08:00:00,14110930000.0,1109243000.0,82898310.0,56650005.21,11178680.0,827420.9772
1,2018-01-01 09:00:00,13453620000.0,1242256000.0,43757.39,49307351.26,7070847.0,913017.6007
2,2018-01-01 10:00:00,12168880000.0,2006322000.0,43157.25,56843652.59,51328260.0,874471.3644
3,2018-01-01 11:00:00,11231200000.0,1073181000.0,82771540.0,46645240.59,5217827.0,837246.7783
4,2018-01-01 12:00:00,10780850000.0,864506900.0,33435.48,30966947.37,7495363.0,827143.7497


In [48]:
# Derive the calendar features from the parsed timestamp: one new column per
# datetime component, added in the order hour, day, month, year.
for component in ("hour", "day", "month", "year"):
    tp_data[component] = getattr(tp_data.Time.dt, component)

In [52]:
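# Dropping "Time" is disabled: the column is not selected as a model feature
# below, and it is still written to the modified CSV.
#tp_data = tp_data.drop(["Time"], axis=1)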

In [53]:
# Inspect the frame with the four derived calendar columns appended.
tp_data.head(n=5)

Unnamed: 0,Time,SiteA,SiteB,SiteC,SiteD,SiteE,SiteF,hour,day,month,year
0,2018-01-01 08:00:00,14110930000.0,1109243000.0,82898310.0,56650005.21,11178680.0,827420.9772,8,1,1,2018
1,2018-01-01 09:00:00,13453620000.0,1242256000.0,43757.39,49307351.26,7070847.0,913017.6007,9,1,1,2018
2,2018-01-01 10:00:00,12168880000.0,2006322000.0,43157.25,56843652.59,51328260.0,874471.3644,10,1,1,2018
3,2018-01-01 11:00:00,11231200000.0,1073181000.0,82771540.0,46645240.59,5217827.0,837246.7783,11,1,1,2018
4,2018-01-01 12:00:00,10780850000.0,864506900.0,33435.48,30966947.37,7495363.0,827143.7497,12,1,1,2018


In [54]:
# Inputs are only the calendar parts of the timestamp; the target is the
# SiteF column.
X = tp_data.loc[:, ["hour", "day", "month", "year"]]
y = tp_data.loc[:, "SiteF"]

In [55]:
# Persist the frame including the derived calendar columns. The row index is
# written as the first column (pandas' default, spelled out here).
tp_data.to_csv(path_or_buf="data/throughput_metrics_modified.csv", index=True)

In [56]:
# Ordered hold-out: with shuffling disabled, the final 10% of rows become the
# test set and the preceding 90% the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    shuffle=False,
)

In [40]:
# XGBoost regressor created with no arguments, i.e. the library's default hyperparameters.
xgb_reg = xgb.XGBRegressor()


In [17]:
# Train on the first 90% of rows; the fitted estimator's repr is the cell output.
xgb_reg.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
# Serialise the fitted regressor next to the notebook; joblib returns the list
# of file names it wrote.
joblib.dump(value=xgb_reg, filename="xgb_model.joblib")

['xgb_model.joblib']

In [20]:
# Score the model on the held-out split. The predictions are computed in this
# cell so it does not depend on a `y_pred` left over from an earlier kernel
# session (no other cell in the notebook defines it).
y_pred = xgb_reg.predict(X_test)

# RMSE via an explicit square root of the MSE: the `squared=False` flag is
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every version. float() keeps the displayed value a plain Python float.
float(np.sqrt(mean_squared_error(y_test, y_pred)))

746419.8041826863

In [22]:
# Build a single-sample input from the second row of the test set (position 1).
# reshape(1, -1) yields a (1, n_features) array without a DataFrame round-trip
# and without hard-coding the number of feature columns.
a = X_test.values[1].reshape(1, -1)

In [23]:
# Confirm the sample is 2-D: one row, one column per feature.
a.shape

(1, 4)

In [24]:
# Predict the target for the single sample built above; the result is a
# one-element array.
xgb_reg.predict(a)

array([636685.], dtype=float32)