In [13]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [14]:
# Load the three variants of the complete dataset in one pass.
# NOTE(review): the file names suggest different missing-value strategies
# (rows discarded / averaged / predicted) -- confirm against the data prep step.
discard_df, average_df, predict_df = map(
    pd.read_csv,
    (
        "../csv/datasets/complete_discard.csv",
        "../csv/datasets/complete_avg.csv",
        "../csv/datasets/complete_pred.csv",
    ),
)

# Random Forest Regression

In [15]:
def rfr(df, n_splits):
    """Cross-validate a random forest on ``df`` and return its mean fold score.

    The ``Date`` column is parsed with ``pd.to_datetime`` and converted to a
    float so it can serve as a numeric feature; ``Sales`` is the target and
    every other column is handed to the model as-is. The conversion happens on
    a derived feature frame, so the caller's ``df`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``Sales`` columns.
        NOTE(review): ``TimeSeriesSplit`` splits by row position, so the rows
        are presumably already in chronological order -- confirm.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    float
        Arithmetic mean of ``model.score`` over the folds (R^2 for
        scikit-learn regressors).
    """
    tscv = TimeSeriesSplit(n_splits)

    model = RandomForestRegressor(random_state=42)

    # Build the feature matrix without writing back into ``df``: ``assign``
    # returns a new frame and keeps ``Date`` in its original column position.
    date_as_float = pd.to_datetime(df['Date']).values.astype(float)
    X = df.drop(labels=['Sales'], axis=1).assign(Date=date_as_float)
    y = df['Sales']

    fold_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        # ``score`` predicts internally, so no separate ``predict`` call is needed.
        fold_scores.append(model.score(X_test, y_test))

    return sum(fold_scores) / len(fold_scores)


In [16]:
# Try every fold count from 2 to 20 on the "discard" dataset and record the
# random forest's mean score for each, keyed by the fold count as a string.
rfr_split_test = {}
for n_folds in range(2, 21):
    print(n_folds)  # printed before the fit so progress is visible
    mean_score = rfr(discard_df, n_folds)
    print(mean_score)
    rfr_split_test[str(n_folds)] = mean_score


2
0.578126471920388
3
0.6484014248413773
4
0.7283821173882454
5
0.6826728882329715
6
0.719407212494452
7
0.7601387435340914
8
0.6900027002039644
9
0.7873922676124545
10
0.7700774039778545
11
0.7425826319705464
12
0.799667461563672
13
0.775234263071609
14
0.7575244059228997
15
0.6520810521934656
16
0.8072842710889779
17
0.7956217210466384
18
0.8119940189644941
19
0.7975710936940923
20
0.7498346738739861
{'2': 0.578126471920388, '3': 0.6484014248413773, '4': 0.7283821173882454, '5': 0.6826728882329715, '6': 0.719407212494452, '7': 0.7601387435340914, '8': 0.6900027002039644, '9': 0.7873922676124545, '10': 0.7700774039778545, '11': 0.7425826319705464, '12': 0.799667461563672, '13': 0.775234263071609, '14': 0.7575244059228997, '15': 0.6520810521934656, '16': 0.8072842710889779, '17': 0.7956217210466384, '18': 0.8119940189644941, '19': 0.7975710936940923, '20': 0.7498346738739861}


In [21]:
# Display the mean score collected for each fold count in rfr_split_test.
print(rfr_split_test)

{'2': 0.578126471920388, '3': 0.6484014248413773, '4': 0.7283821173882454, '5': 0.6826728882329715, '6': 0.719407212494452, '7': 0.7601387435340914, '8': 0.6900027002039644, '9': 0.7873922676124545, '10': 0.7700774039778545, '11': 0.7425826319705464, '12': 0.799667461563672, '13': 0.775234263071609, '14': 0.7575244059228997, '15': 0.6520810521934656, '16': 0.8072842710889779, '17': 0.7956217210466384, '18': 0.8119940189644941, '19': 0.7975710936940923, '20': 0.7498346738739861}


In [20]:
# Score the random forest on each dataset variant with the same fold count.
splits = 12

for dataset in (discard_df, average_df, predict_df):
    print(rfr(dataset, splits))

0.799667461563672
0.8011692623053106
0.7933930177913372


# Linear Regression

In [18]:
def lin_reg(df, n_splits):
    """Cross-validate a linear regression on ``df``; print and return the mean score.

    The ``Date`` column is parsed with ``pd.to_datetime`` and converted to a
    float so it can serve as a numeric feature; ``Sales`` is the target and
    every other column is handed to the model as-is. The conversion happens on
    a derived feature frame, so the caller's ``df`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``Sales`` columns.
        NOTE(review): ``TimeSeriesSplit`` splits by row position, so the rows
        are presumably already in chronological order -- confirm.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    float
        Arithmetic mean of ``model.score`` over the folds (R^2 for
        scikit-learn regressors). The same value is printed as
        ``Avg Score: <value>``.
    """
    tscv = TimeSeriesSplit(n_splits)

    model = LinearRegression()

    # Build the feature matrix without writing back into ``df``: ``assign``
    # returns a new frame and keeps ``Date`` in its original column position.
    date_as_float = pd.to_datetime(df['Date']).values.astype(float)
    X = df.drop(labels=['Sales'], axis=1).assign(Date=date_as_float)
    y = df['Sales']

    fold_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        # ``score`` predicts internally, so no separate ``predict`` call is needed.
        fold_scores.append(model.score(X_test, y_test))

    avg = sum(fold_scores) / len(fold_scores)
    print('Avg Score:', avg)
    # Returned as well as printed so callers can collect results, like ``rfr``.
    return avg

In [19]:
# Score the linear regression on each dataset variant with the same fold count.
splits = 10

for dataset in (discard_df, average_df, predict_df):
    lin_reg(dataset, splits)

Avg Score: 0.7452236748362817
Avg Score: 0.7479015232868157
Avg Score: 0.7427895778682461


# XGBoost Regressor

In [61]:
def xgb_reg(df, n_splits):
    """Cross-validate an XGBoost regressor on ``df``; print and return the mean score.

    The ``Date`` column is parsed with ``pd.to_datetime`` and converted to a
    float so it can serve as a numeric feature; ``Sales`` is the target and
    every other column is handed to the model as-is. The conversion happens on
    a derived feature frame, so the caller's ``df`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``Sales`` columns.
        NOTE(review): ``TimeSeriesSplit`` splits by row position, so the rows
        are presumably already in chronological order -- confirm.
    n_splits : int
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    float
        Arithmetic mean of ``model.score`` over the folds. The same value is
        printed as ``Avg Score: <value>``.
    """
    tscv = TimeSeriesSplit(n_splits)

    model = XGBRegressor(random_state=42)

    # Build the feature matrix without writing back into ``df``: ``assign``
    # returns a new frame and keeps ``Date`` in its original column position.
    date_as_float = pd.to_datetime(df['Date']).values.astype(float)
    X = df.drop(labels=['Sales'], axis=1).assign(Date=date_as_float)
    y = df['Sales']

    fold_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        # ``score`` is called directly; the unused ``predict`` call was dropped.
        fold_scores.append(model.score(X_test, y_test))

    avg = sum(fold_scores) / len(fold_scores)
    print('Avg Score:', avg)
    # Returned as well as printed so callers can collect results, like ``rfr``.
    return avg

In [62]:
# Score the XGBoost regressor on each dataset variant with the same fold count.
splits = 12

for dataset in (discard_df, average_df, predict_df):
    xgb_reg(dataset, splits)

Avg Score: 0.8043508460254354
Avg Score: 0.7885673679237818
Avg Score: 0.7761244731651772
