In [55]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [3]:
# Read the three prepared CSV files into DataFrames; each one is later
# passed unchanged to the model-evaluation functions below.
# NOTE(review): the file names suggest three different treatments of the same
# raw data ("discard", "avg", "pred") — confirm against the preparation step.
discard_df, average_df, predict_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../csv/datasets/complete_discard.csv",
        "../csv/datasets/complete_avg.csv",
        "../csv/datasets/complete_pred.csv",
    )
)

# Random Forest Regression

In [57]:
def rfr(df, n_splits):
    """Cross-validate a random forest on ``df`` and print the mean fold score.

    Folds come from ``TimeSeriesSplit``, which trains each fold on rows that
    are positioned before the rows it is scored on.  Rows are used in the
    order they appear in ``df``; nothing here sorts them.
    NOTE(review): presumably the input is already in chronological order —
    confirm, otherwise the time-series split is not meaningful.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse and a
        ``Sales`` column, which is the regression target.  Every other column
        is used as a feature as-is.  The frame is NOT modified.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    None
        The mean of the per-fold ``model.score`` values (R^2 for scikit-learn
        regressors) is printed as ``Avg Score: <value>``.
    """
    tscv = TimeSeriesSplit(n_splits)
    model = RandomForestRegressor(random_state=42)

    # ``drop`` returns a new frame, so the numeric date encoding below is
    # written to that new frame and never leaks back into the caller's
    # DataFrame (the same global frames are reused by other cells).
    features = df.drop(columns=['Sales'])
    features['Date'] = pd.to_datetime(features['Date']).values.astype(float)
    target = df['Sales']

    fold_scores = []
    for train_index, test_index in tscv.split(features):
        model.fit(features.iloc[train_index], target.iloc[train_index])
        # ``score`` predicts on the held-out rows itself; a separate
        # ``predict`` call would only repeat that work.
        fold_scores.append(
            model.score(features.iloc[test_index], target.iloc[test_index])
        )

    print('Avg Score:', sum(fold_scores) / len(fold_scores))


In [65]:
# Evaluate the random forest on each dataset variant with the same fold count.
splits = 12

for dataset in (discard_df, average_df, predict_df):
    rfr(dataset, splits)

Avg Score: 0.799667461563672
Avg Score: 0.8011692623053106
Avg Score: 0.7933930177913372


# Linear Regression

In [59]:
def lin_reg(df, n_splits):
    """Cross-validate a linear regression on ``df`` and print the mean fold score.

    Folds come from ``TimeSeriesSplit``, which trains each fold on rows that
    are positioned before the rows it is scored on.  Rows are used in the
    order they appear in ``df``; nothing here sorts them.
    NOTE(review): presumably the input is already in chronological order —
    confirm, otherwise the time-series split is not meaningful.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse and a
        ``Sales`` column, which is the regression target.  Every other column
        is used as a feature as-is.  The frame is NOT modified.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    None
        The mean of the per-fold ``model.score`` values (R^2 for scikit-learn
        regressors) is printed as ``Avg Score: <value>``.
    """
    tscv = TimeSeriesSplit(n_splits)
    model = LinearRegression()

    # ``drop`` returns a new frame, so the numeric date encoding below is
    # written to that new frame and never leaks back into the caller's
    # DataFrame (the same global frames are reused by other cells).
    features = df.drop(columns=['Sales'])
    features['Date'] = pd.to_datetime(features['Date']).values.astype(float)
    target = df['Sales']

    fold_scores = []
    for train_index, test_index in tscv.split(features):
        model.fit(features.iloc[train_index], target.iloc[train_index])
        # ``score`` predicts on the held-out rows itself; a separate
        # ``predict`` call would only repeat that work.
        fold_scores.append(
            model.score(features.iloc[test_index], target.iloc[test_index])
        )

    print('Avg Score:', sum(fold_scores) / len(fold_scores))

In [60]:
# Evaluate the linear regression on each dataset variant.
# NOTE(review): this cell uses 10 folds, while the random-forest and XGBoost
# cells use 12, so the averaged scores are not computed over the same folds.
splits = 10

for dataset in (discard_df, average_df, predict_df):
    lin_reg(dataset, splits)

Avg Score: 0.7452236748362815
Avg Score: 0.7479015232868174
Avg Score: 0.7427895778682464


# XGBoost Regressor

In [61]:
def xgb_reg(df, n_splits):
    """Cross-validate an XGBoost regressor on ``df`` and print the mean fold score.

    Folds come from ``TimeSeriesSplit``, which trains each fold on rows that
    are positioned before the rows it is scored on.  Rows are used in the
    order they appear in ``df``; nothing here sorts them.
    NOTE(review): presumably the input is already in chronological order —
    confirm, otherwise the time-series split is not meaningful.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse and a
        ``Sales`` column, which is the regression target.  Every other column
        is used as a feature as-is.  The frame is NOT modified.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    None
        The mean of the per-fold ``model.score`` values is printed as
        ``Avg Score: <value>``.  ``XGBRegressor`` exposes the scikit-learn
        regressor interface, so the score is expected to be R^2 — confirm
        against the installed xgboost version.
    """
    tscv = TimeSeriesSplit(n_splits)
    model = XGBRegressor(random_state=42)

    # ``drop`` returns a new frame, so the numeric date encoding below is
    # written to that new frame and never leaks back into the caller's
    # DataFrame (the same global frames are reused by other cells).
    features = df.drop(columns=['Sales'])
    features['Date'] = pd.to_datetime(features['Date']).values.astype(float)
    target = df['Sales']

    fold_scores = []
    for train_index, test_index in tscv.split(features):
        model.fit(features.iloc[train_index], target.iloc[train_index])
        # ``score`` predicts on the held-out rows itself; a separate
        # ``predict`` call would only repeat that work.
        fold_scores.append(
            model.score(features.iloc[test_index], target.iloc[test_index])
        )

    print('Avg Score:', sum(fold_scores) / len(fold_scores))

In [62]:
# Evaluate the XGBoost regressor on each dataset variant with the same fold count.
splits = 12

for dataset in (discard_df, average_df, predict_df):
    xgb_reg(dataset, splits)

Avg Score: 0.8043508460254354
Avg Score: 0.7885673679237818
Avg Score: 0.7761244731651772
