In [None]:
# NOTE(review): without call parentheses this line only looks up the `clear`
# method and never invokes it, so it has no effect on the namespace.
# To start from a clean state, restart the kernel instead.
globals().clear
import time
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
from datetime import datetime

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import xgboost as xgb

In [None]:
# Colab environment setup: nothing is installed here; this cell only mounts Drive and changes directory.

# Mount Google Drive at /content/colabIntern.
from google.colab import drive
drive.mount('/content/colabIntern')

# Switch to the project folder, then print how many entries it contains.
%cd "/content/colabIntern/MyDrive/Capstone/Capstone Project/Collab"
!ls | wc -l

Drive already mounted at /content/colabIntern; to attempt to forcibly remount, call drive.mount("/content/colabIntern", force_remount=True).
/content/colabIntern/.shortcut-targets-by-id/1_cBH5e_TBWyCYfxXIK1yDOdiYjVxi7ae/Capstone Project/Collab
14


In [None]:
# Read the raw dataset from 'df.csv', resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='df.csv')

In [None]:
# Ticker symbols of the stocks in the dataset, kept in alphabetical order
# because the dataframe is later arranged alphabetically by stock.
tickers = sorted([
    "AAPL", "XOM", "IBM", "KO", "CVX", "BA", "PFE", "MSFT", "T", "WMT",
    "F", "NFLX", "JPM", "MCD", "GE", "NVDA", "JNJ", "BAC", "C", "AMZN",
    "INTC", "CSCO", "TSLA", "GOOGL", "AMD", "BABA", "VZ", "DIS",
    "META",
])

In [None]:
# PLTR's data is incomplete, so its rows are excluded from this trial.
keep_mask = df["stock_ID"] != "PLTR"
df = df.loc[keep_mask]

In [None]:
# Parse the DATETIME strings (month/day/year hour:minute) into real timestamps.
df = df.assign(
    DATETIME=pd.to_datetime(df['DATETIME'], format='%m/%d/%Y %H:%M')
)

In [None]:
# Index the frame by timestamp and, for now, replace every missing value with 0.
# No sorting happens in this cell; the time ordering is applied by later cells.
df = df.set_index('DATETIME').fillna(0)

In [None]:
# Keep seven months of data: the first three months form the initial training
# window, and the window is then shifted forward across the following four months.
df = df.sort_index()
df = df.loc['2022-06-01':'2022-12-30']

In [None]:
# Timestamps of every row as a NumPy array; the training loop slices this to
# label each prediction (`fecha`).
DATETIME = df.index.to_numpy()

In [None]:
# Order rows chronologically and, within each timestamp, alphabetically by ticker
# (so each time step runs from AAPL through XOM).
df = df.sort_values(by=["DATETIME", "stock_ID"])

In [None]:
# Walk-forward evaluation: train on a fixed-length window, predict the next
# `step_rows` rows, slide the window forward by `step_rows`, and repeat.
file_name = 'XGB.60d.1d'
num_companies = 29  # 29 companies in our dataset.
step_rows  = 24 * num_companies  # 24 time periods per day per stock
# NOTE(review): the stepping assumes every day contributes exactly 24 rows per
# stock; verify against the data, otherwise windows drift off day boundaries.

# Label-based date slicing goes through .loc (row slicing via df[...] is ambiguous).
total_rows = len(df.loc['2022-06-01':'2022-12-30'])  # Define total length to predict on
train_rows = len(df.loc['2022-06-01':'2022-08-31'])  # Define length of training window

scale_X = MinMaxScaler()
model = XGBRegressor(n_estimators=300, max_depth=100, learning_rate=0.1, objective='reg:squarederror'
                     , alpha=10)

result_columns = ['DATETIME', 'ACTUAL', 'PREDICTED', 'DIFFERENCE', 'TRAIN_DURATION']
# Per-window frames are collected here and concatenated once after the loop;
# concatenating onto a growing frame each iteration is quadratic and starts
# from an empty object-dtype frame.
window_frames = []

for i in range(0, total_rows - train_rows, step_rows):
    st = time.time()
    # 1. Obtain X and y: the first column is skipped, the next one is the
    #    target, and the remaining columns are the features.
    train  = df.iloc[i:train_rows + i, 1:]
    test   = df.iloc[train_rows + i:train_rows + i + step_rows, 1:]
    X_train, y_train = train.iloc[:, 1:], train.iloc[:, 0]
    X_test, y_test = test.iloc[:, 1:], test.iloc[:, 0]

    # 2. Scale X: the scaler is fitted on the training window only.
    X_train = scale_X.fit_transform(X_train)
    X_test = scale_X.transform(X_test)

    # 3. Fit and Predict
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    et = time.time()  # the measured span covers scaling, fitting and predicting

    # 4. Save data with prediction
    # Timestamps are read from the test window itself so they always match the
    # predicted rows, independent of any array captured in an earlier cell.
    fecha = test.index.to_numpy()
    actual = y_test.to_numpy()            # Series.ravel() is deprecated
    predicted = np.asarray(y_hat).ravel()
    datos = {
        'DATETIME': fecha,
        'ACTUAL': actual,
        'PREDICTED': predicted,
        'DIFFERENCE': np.abs(predicted - actual),
        'TRAIN_DURATION': np.full(predicted.shape[0], et - st)
    }
    window_frames.append(pd.DataFrame(data=datos, columns=result_columns))

    print("Count Down:", int((total_rows - train_rows - i) / step_rows))

# Build the final result once; fall back to an empty, correctly-labelled frame
# when the loop produced no windows (pd.concat rejects an empty list).
if window_frames:
    result = pd.concat(window_frames, ignore_index=True)
else:
    result = pd.DataFrame(columns=result_columns)

Count Down: 81
Count Down: 80
Count Down: 79
Count Down: 78
Count Down: 77
Count Down: 76
Count Down: 75
Count Down: 74
Count Down: 73
Count Down: 72
Count Down: 71
Count Down: 70
Count Down: 69
Count Down: 68
Count Down: 67
Count Down: 66
Count Down: 65
Count Down: 64
Count Down: 63
Count Down: 62
Count Down: 61
Count Down: 60
Count Down: 59
Count Down: 58
Count Down: 57
Count Down: 56
Count Down: 55
Count Down: 54
Count Down: 53
Count Down: 52
Count Down: 51
Count Down: 50
Count Down: 49
Count Down: 48
Count Down: 47
Count Down: 46
Count Down: 45
Count Down: 44
Count Down: 43
Count Down: 42
Count Down: 41
Count Down: 40
Count Down: 39
Count Down: 38
Count Down: 37
Count Down: 36
Count Down: 35
Count Down: 34
Count Down: 33
Count Down: 32
Count Down: 31
Count Down: 30
Count Down: 29
Count Down: 28
Count Down: 27
Count Down: 26
Count Down: 25
Count Down: 24
Count Down: 23
Count Down: 22
Count Down: 21
Count Down: 20
Count Down: 19
Count Down: 18
Count Down: 17
Count Down: 16
Count Down

In [None]:
# Empty table that defines the column layout of the per-stock metrics.
metric_names = ['stock_ID', "RMSE", "MAPE", "MPE", "MTT"]
metrics_df = pd.DataFrame(columns=metric_names)

In [None]:
# Every 29th row starting at position 25, i.e. one stock's predictions over time.
# NOTE(review): position 25 in the sorted ticker list is TSLA, assuming the
# rows cycle through the 29 tickers alphabetically -- confirm.
result.iloc[25::29]

Unnamed: 0,DATETIME,ACTUAL,PREDICTED,DIFFERENCE,TRAIN_DURATION
25,2022-09-01 09:30:00,274.66,262.165100,12.494900,20.251358
54,2022-09-01 09:45:00,273.62,260.465546,13.154454,20.251358
83,2022-09-01 10:00:00,271.36,262.252533,9.107467,20.251358
112,2022-09-01 10:15:00,273.14,260.388153,12.751847,20.251358
141,2022-09-01 10:30:00,269.54,260.034454,9.505546,20.251358
...,...,...,...,...,...
56256,2022-12-30 14:15:00,121.46,111.815285,9.644715,21.693337
56285,2022-12-30 14:30:00,121.30,111.882156,9.417844,21.693337
56314,2022-12-30 14:45:00,121.51,112.683121,8.826879,21.693337
56343,2022-12-30 15:00:00,122.42,112.963661,9.456339,21.693337


In [None]:
def mean_positive_error(y_true, y_pred):
    """Return the mean over-prediction.

    Each element contributes ``y_pred - y_true`` when the prediction exceeds
    the actual value and 0 otherwise; the mean is taken over all elements.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    numpy floating scalar
        Mean of the positive part of ``y_pred - y_true``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    residual = predicted - actual
    # Under-predictions (negative residuals) are floored to zero.
    positive_part = np.maximum(residual, 0)
    return np.mean(positive_part)

In [None]:
# Per-ticker error metrics computed from the walk-forward predictions.
# `result` has no ticker column, so each stock is recovered by taking every
# n-th row. The stride is derived from `tickers` instead of a hard-coded 29,
# so the slicing stays in step with the ticker list.
# NOTE(review): assumes the rows of `result` cycle through the tickers in the
# same alphabetical order as `tickers` -- confirm against the sorted dataframe.
metric_columns = ['stock_ID', "RMSE", "MAPE", "MPE", "MTT"]
n_tickers = len(tickers)

new_rows = []
for i, ticker in enumerate(tickers):
    stock_result = result.iloc[i::n_tickers]
    actual = stock_result['ACTUAL']
    predicted = stock_result['PREDICTED']
    # Plain dicts (rather than mixed-type Series) keep the metric columns
    # numeric instead of collapsing the whole table to object dtype.
    new_rows.append({
        'stock_ID': ticker,
        # Root mean squared error
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        # Mean absolute percentage error, in percent
        'MAPE': mean_absolute_percentage_error(actual, predicted) * 100,
        # Mean positive error (average over-prediction)
        'MPE': mean_positive_error(actual, predicted),
        # Mean recorded duration per window, in seconds
        'MTT': np.mean(stock_result['TRAIN_DURATION']),
    })
metrics_df = pd.DataFrame(new_rows, columns=metric_columns)

In [None]:
# Display the per-ticker metrics table.
# NOTE(review): the saved output under this cell lists an MPEP column that the
# visible cells never create; re-run the notebook to refresh the output.
metrics_df

Unnamed: 0,stock_ID,RMSE,MAPE,MPE,MPEP,MTT
0,AAPL,2.989345,1.531154,1.517863,1.517863,23.561106
1,AMD,2.666043,3.026459,1.678621,1.678621,23.561106
2,AMZN,3.916131,2.459338,1.883763,1.883763,23.561106
3,BA,5.130852,2.333083,0.97931,0.97931,23.561106
4,BABA,3.915415,3.553206,1.575027,1.575027,23.561106
5,BAC,0.947147,1.999981,0.474655,0.474655,23.561106
6,C,0.865176,1.475404,0.500131,0.500131,23.561106
7,CSCO,0.781105,1.165675,0.239698,0.239698,23.561106
8,CVX,4.361342,2.004471,1.04522,1.04522,23.561106
9,DIS,4.307316,2.584733,1.976647,1.976647,23.561106


In [None]:
# Average each metric across all tickers and show it as a one-row table.
# stock_ID holds ticker strings, so it is dropped before averaging: on
# pandas >= 2.0 a plain `metrics_df.mean()` raises TypeError on that column.
# pd.to_numeric also covers the case where the metric columns are object dtype.
numeric_metrics = metrics_df.drop(columns=['stock_ID'], errors='ignore').apply(pd.to_numeric)
avg_values = numeric_metrics.mean()
avg_df = pd.DataFrame(avg_values).T
avg_df