In [None]:
!nvidia-smi

In [1]:
import sys, os
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [2]:
import torch
print(torch.__version__)
torch.cuda.set_device(0)

1.13.0+cu117


In [3]:
torch.cuda.is_available()

True

In [4]:
torch.cuda.current_device()

0

In [5]:
torch.version.cuda

'11.7'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
import my_utils

In [4]:
ts_data = np.loadtxt(open("ts_data.csv","rb"), delimiter=",")

In [5]:
print(ts_data.shape)

(106251, 36)


# 0. Preprocess data that can be fed into models

### Define input and output window length

In [6]:
input_dim = 30
output_dim = 5
T_span = input_dim + output_dim
n_id = 20

# 3. RF Prediction

## 3.0 RF Data Preprocessing

In [7]:
# Differentiate X and y
X = ts_data[:,:(input_dim+1)]
y_price = ts_data[:,input_dim:]
y_return = np.diff(y_price, axis=1)/y_price[:,:-1]

#Standardization
mu_X, std_X, X = my_utils.standardization(X)
mu_y, std_y, y = my_utils.standardization(y_return)


# Train Test Split With All Companies Included
X_i = X[np.where(ts_data[:,0]==0)]
y_i = y[np.where(ts_data[:,0]==0)]
X_train, X_test, y_train, y_test = train_test_split(X_i, y_i, test_size = 0.2, random_state = 42)
    
for i in range(1, n_id):
    X_i = X[np.where(ts_data[:,0]==i)]
    y_i = y[np.where(ts_data[:,0]==i)]
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_i, y_i, test_size = 0.2, random_state = 42)

    X_train = np.concatenate((X_train, X_train_i), axis=0)
    X_test = np.concatenate((X_test, X_test_i), axis=0)
    y_train = np.concatenate((y_train, y_train_i), axis=0)
    y_test = np.concatenate((y_test, y_test_i), axis=0)


# Reshuffle Entire Datasets
idx = np.arange(X_train.shape[0])
np.random.shuffle(idx)
X_train, y_train = X_train[idx], y_train[idx]

idx = np.arange(X_test.shape[0])
np.random.shuffle(idx)
X_test, y_test = X_test[idx], y_test[idx]

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(84990, 31)
(21261, 31)
(84990, 5)
(21261, 5)


## 3.1 RF Training

In [9]:
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators=100, random_state=42, max_features=3)
RF.fit(X_train, y_train[:,0])

## 3.2 RF Performance Evaluation

In [25]:
y_train_pred_RF = RF.predict(X_train)
y_test_pred_RF = RF.predict(X_test)

In [22]:
"""
# for reformation if standardization was used

print(y_train.shape)
print(y_train_pred_RF.shape)
print(y_test.shape)
print(y_test_pred_RF.shape)
print(std_y)
print(mu_y)

y_train_reform = y_train*std_y + mu_y
y_train_pred_RF_reform = y_train_pred_RF*std_y + mu_y

y_test_reform = y_test*std_y + mu_y
y_test_pred_RF_reform = y_test_pred_RF*std_y + mu_y
"""

(84990, 5)
(84990, 5)
(21261, 5)
(21261, 5)
[0.0264163  0.02641753 0.02640583 0.02640844 0.02640861]
[0.00094278 0.00093985 0.00093564 0.00094066 0.00093839]


In [26]:
# for reformation if standardization was used

print(y_train[:,0].shape)
print(y_train_pred_RF.shape)
print(y_test[:,0].shape)
print(y_test_pred_RF.shape)
print(std_y)
print(mu_y)

y_train_reform = y_train[:,0]*std_y[0] + mu_y[0]
y_train_pred_RF_reform = y_train_pred_RF*std_y[0] + mu_y[0]

y_test_reform = y_test[:,0]*std_y[0] + mu_y[0]
y_test_pred_RF_reform = y_test_pred_RF*std_y[0] + mu_y[0]

(84990,)
(84990,)
(21261,)
(21261,)
[0.0264163  0.02641753 0.02640583 0.02640844 0.02640861]
[0.00094278 0.00093985 0.00093564 0.00094066 0.00093839]


In [27]:
# Calculate R2 value
from sklearn.metrics import r2_score
R2_train = r2_score(y_train_reform, y_train_pred_RF_reform)
R2_test = r2_score(y_test_reform, y_test_pred_RF_reform)

print('R2_train: {val}'.format(val = R2_train))
print('R2_test: {val}'.format(val = R2_test))

R2_train: 0.8553883367897561
R2_test: -0.02794744091940471


# 3.3 Get Data Needed for Optimization

### 3.3.0 Save and Load Trained Model

In [None]:
from sklearn.externals import joblib
# Save to file in the current working directory
joblib_file = "RF_pred1d_nEstimators100_maxFeat3.pkl"
joblib.dump(RF, joblib_file)

In [None]:
"""
# Load from file
RF = joblib.load(joblib_file)
# Calculate the accuracy and predictions
score = RF.score(Xtest, Ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = RF.predict(Xtest)
"""

### 3.3.1 Get Newest Inputs: Comany_id + 30-d Prices

In [32]:
raw_df = pd.read_csv('raw.csv', sep=',')

In [33]:
raw = raw_df.values.T[1:]

In [34]:
companies = np.unique(ts_data[:,0]).reshape(20,1)
X_final = raw[:,(raw.shape[1]-30):]
X_final = np.concatenate((companies, X_final), axis=1)
print(X_final.shape)

(20, 31)


In [None]:
print(X_final)

### 3.3.2 Get Prediction Mean for 20 Companies

In [35]:
Return_mean = RF.predict(X_final)

In [37]:
np.savetxt("Return_mean.csv", Return_mean, delimiter=',')

### 3.3.3 Get Distribution Variance of Predictions from All Regressors

In [36]:
numberTrees = 100

In [37]:
pred_all = []
for tree in range(numberTrees):
    pred_all.append(RF.estimators_[tree].predict(X_final)*std_y[0]+mu_y[0])
pred_all = np.array(pred_all)
print(pred_all.shape)

(100, 20)


In [51]:
Return_std = np.std(pred_all, axis=0)
print(Return_std)

[1.19944256 1.1975316  1.1975316  1.1975316  1.1975316  0.93112443
 0.81367212 1.1975316  1.1975316  1.1975316  1.1975316  1.1975316
 0.87391744 1.1975316  1.16076412 1.1975316  1.1975316  1.1975316
 1.1975316  1.1975316 ]


In [38]:
np.savetxt("Return_std.csv", Return_std, delimiter=',')

# 3.4 Benchmark Prediction

In [10]:
prices_final = np.loadtxt(open("prices_final.csv","rb"), delimiter=",")
return_final = np.loadtxt(open("return_final.csv","rb"), delimiter=",")

In [None]:
test = np.concatenate((prices_final[:,0].reshape(20,1),prices_final[:, (31+i):(31+i+30)]), axis=1)
print((test-mu_X)/std_X)

In [21]:
y_BMpred = np.zeros((20,30))
for i in range(30):
    X_BM = np.concatenate((prices_final[:,0].reshape(20,1),prices_final[:, (31+i):(31+i+30)]), axis=1)
    X_BM = (X_BM - mu_X) / (std_X)
    y_BMpred[:,i] = RF.predict(X_BM)
    
print(y_BMpred.shape)

(20, 30)


In [23]:
y_BMpred = y_BMpred*std_y[0] + mu_y[0]
np.savetxt("Return_mean_RF.csv", y_BMpred, delimiter=',')

### Prediction Variance

In [25]:
numberTrees = 100

In [29]:
Return_std = []
for i in range(30):
    X_BM = np.concatenate((prices_final[:,0].reshape(20,1),prices_final[:, (31+i):(31+i+30)]), axis=1)
    X_BM = (X_BM - mu_X) / (std_X)
    
    pred_all = []
    for tree in range(numberTrees):
        tree_pred = RF.estimators_[tree].predict(X_BM)
        tree_pred = tree_pred*std_y[0] + mu_y[0]
        pred_all.append(tree_pred)
    pred_all = np.array(pred_all)
    #print(pred_all.shape)
    std = np.std(pred_all, axis=0)
    Return_std.append(std)
Return_std = np.array(Return_std).T
print(Return_std.shape)
#print(pred_all.shape)

(20, 30)


In [41]:
Return_UB = y_BMpred + Return_std*0.5
Return_LB = y_BMpred - Return_std*0.5
np.savetxt("Return_UB_RF.csv", Return_UB, delimiter=',')
np.savetxt("Return_LB_RF.csv", Return_LB, delimiter=',')

In [None]:
np.savetxt("Return_std.csv", Return_std, delimiter=',')