# Machine Learning Project (CS-UY 4563) S22
## Tokyo Stock Exchange Price Prediction
### Submitted by Anagh Kanungo and Tommy Qiu

## Contents of Notebook:
- Initial Setup
- ML Models
- Running ML Models

## Note:
The dataset is too large to upload to GitHub. Download it from <https://www.kaggle.com/competitions/jpx-tokyo-stock-exchange-prediction/data> and extract it into a `dataset/` folder next to this notebook, so that the price file is available at `dataset/train_files/stock_prices.csv`.

# Initial Setup

### Required Imports

In [None]:
from cv2 import split
import numpy as np
import pandas as pd
import mplfinance
import matplotlib.pyplot as plt
import math

# Simple Linear Regression
from sklearn.linear_model import LinearRegression

# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

# SVR
from sklearn.svm import SVR
from sklearn.datasets import load_iris
from matplotlib.colors import Normalize


# LSTM
import tensorflow
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

# Multiple Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

### File Management

In [None]:
# Relative location of the stock-prices CSV (resolved against the working directory)
path = "dataset/train_files/stock_prices.csv"
# Load the complete price table; per-company subsets are taken from it later
df = pd.read_csv(path)

### Defining the List of Companies

In [None]:
# Companies of interest, written as (name, securities code, industry) rows and
# expanded into one dict per company.
list_of_companies = [
    {'Name': name, 'SecuritiesCode': code, 'Industry': industry}
    for name, code, industry in (
        ('Astellas Pharma Inc.', 4503, 'Pharmaceutical'),
        ('Canon Electronics Inc.', 7739, 'Electric Appliances'),
        ('Honda Motor Co. Ltd. ', 7267, 'Transportation Equipment'),
        ('Hitachi Ltd.', 6501, 'Electric Appliances'),
        ('SoftBank Corp.', 9434, 'Information and Communication'),
        ('Mitsubishi Motors Corporation', 7211, 'Transportation Equipment'),
        ('Nissan Motors Corp', 7201, 'Transportation Equipment'),
        ('Toyota Motor Corp', 7203, 'Transportation Equipment'),
        ('Sony Group Corporation', 6758, 'Electric Appliances'),
        ('Panasonic Corporation', 6752, 'Electric Appliances'),
    )
]

### Initializing Helper Functions

In [1]:
# Function to fetch company data for one company from the whole dataset
def fetchCompanyData(code, df):
    """Return the rows of ``df`` that belong to one security and have a 'High' value.

    Args:
        code: Value to match against the 'SecuritiesCode' column.
        df: Price table containing 'SecuritiesCode' and 'High' columns.

    Returns:
        A DataFrame with the matching rows, in their original order and with
        their original index labels. ``df`` itself is not modified.
    """
    is_company = df['SecuritiesCode'] == code
    has_high = df['High'].notna()  # rows without a 'High' value are treated as missing
    return df.loc[is_company & has_high]

In [None]:
# Helper function for Regression which splits the dataset into 80 - 20 manually (not random)
def splitData(dates, prices):
    """Split two aligned sequences chronologically into train and test parts.

    The first ``floor(0.8 * len(dates))`` entries become the training part and
    the remainder the test part; nothing is shuffled.

    Args:
        dates: Sliceable sequence (list or array) of inputs.
        prices: Sliceable sequence of targets, aligned with ``dates``.

    Returns:
        ``(xtrain, ytrain, xtest, ytest)`` as NumPy arrays. Note that this
        order differs from sklearn's ``train_test_split``.
    """
    training_data_len = math.floor(len(dates) * 0.8)
    print("training data len: ", training_data_len)

    # Plain slicing works for lists and for arrays of any rank, so the inputs
    # do not have to be 2-D.
    xtrain = np.array(dates[:training_data_len])
    ytrain = np.array(prices[:training_data_len])
    xtest = np.array(dates[training_data_len:])
    ytest = np.array(prices[training_data_len:])

    return xtrain, ytrain, xtest, ytest



# ML Model Functions

## Linear Regression: Fitting a Straight Line using train_test_split

In [None]:
# Linear Regression using train_test_split
def simpleLinearRegression(df):
    """Fit closing price against row position on random 80/20 splits; keep the best fit.

    The row position (0, 1, 2, ...) stands in for the date. One hundred random
    splits are tried and the model with the highest R^2 on its own test split
    is kept. That model is then scored on ten further random ~20% subsets, and
    finally on the whole series.

    NOTE(review): the model is chosen by its test-split score and the later
    subsets overlap its training rows, so the printed scores are optimistic.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.

    Returns:
        R^2 of the selected model over all rows.
    """
    close = df['Close']
    dates = np.arange(len(df["Date"])).reshape(-1, 1)
    print(len(dates), close.shape)
    prices = np.asanyarray(close).reshape(-1, 1)
    print(dates.shape, prices.shape)

    # Find the best linear regression fit.
    # R^2 can be negative, so start below every possible score and always
    # accept the first trial; otherwise bestReg could stay unassigned.
    best = -math.inf
    bestReg = None
    for _ in range(100):
        xtrain, xtest, ytrain, ytest = train_test_split(
            dates, prices, test_size=0.2)
        reg = LinearRegression().fit(xtrain, ytrain)
        acc = reg.score(xtest, ytest)
        if bestReg is None or acc > best:
            best = acc
            bestReg = reg

    # Average the score over ten random subsets of roughly 20% of the rows
    mean = 0
    for _ in range(10):
        msk = np.random.rand(len(df)) < 0.8
        xtest = dates[~msk]
        ytest = prices[~msk]
        mean += bestReg.score(xtest, ytest)

    print("Average Accuracy: ", mean/10)

    overall_score = bestReg.score(dates, prices)
    print("R^2 Score: ", overall_score)

    # Plot Predicted VS Actual Data (for the last random subset drawn above)
    plt.plot(xtest, ytest, color='green', linewidth=1,
             label='Actual Price')
    plt.plot(xtest, bestReg.predict(xtest), color='blue', linewidth=3,
             label='Predicted Price')
    plt.title('Linear Regression | Time vs. Price ')
    plt.legend()
    plt.xlabel('Date Integer')
    plt.show()

    return overall_score

## Linear Regression for Price Forecasting

In [None]:
# Linear regression using manual 80 - 20 non-random split for price forecasting
def simpleLinearRegression2(df):
    """Fit a line to the first 80% of closing prices and extrapolate over the rest.

    Row position (0, 1, 2, ...) is used as the single input feature.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.

    Returns:
        R^2 of the fitted line over the whole series. This is not the
        test-only R^2, which is printed instead.
    """
    n_rows = len(df["Date"])
    dates = np.arange(n_rows).reshape(n_rows, 1)
    close = np.asanyarray(df['Close'])
    prices = close.reshape(len(close), 1)

    xtrain, ytrain, xtest, ytest = splitData(dates, prices)
    print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

    reg = LinearRegression()
    reg.fit(xtrain, ytrain)

    print("R^2 Score: ", reg.score(xtest, ytest))

    # Plot Predicted VS Actual Data over the held-out tail
    forecast = reg.predict(xtest)
    plt.plot(xtest, ytest, color='green', linewidth=1, label='Actual Price')
    plt.plot(xtest, forecast, color='blue', linewidth=3, label='Predicted Price')
    plt.title('Linear Regression | Time vs. Price ')
    plt.legend()
    plt.xlabel('Date Integer')
    plt.show()

    return reg.score(dates, prices)

## Polynomial Regression for Price Forecasting

In [None]:
# Polynomial Regression
def polynomialRegression(df):
    """Fit polynomials of degree 2 through 20 to the first 80% of closing prices.

    Row position (0, 1, 2, ...) is the input feature. For every degree the
    fitted curve is plotted over the training range and extrapolated over the
    held-out last 20%.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.

    Returns:
        1-D NumPy array with the training R^2 for each degree, in degree order.
    """
    dates = np.arange(len(df["Date"])).reshape(-1, 1)
    prices = np.asanyarray(df['Close']).reshape(-1, 1)

    # splitData keeps chronological order and returns 2-D arrays, so no
    # further conversion, reshaping or sorting is needed here.
    x_train, y_train, x_test, y_test = splitData(dates, prices)

    print(x_train[0:5])
    print(x_test[0:5])

    scores = []
    for degree in range(2, 21):
        # Transform polynomial features
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        x_poly = poly.fit_transform(x_train)

        # Train model
        poly_reg_model = LinearRegression().fit(x_poly, y_train)

        # Reuse the transformer fitted on the training data; never re-fit on test data
        x_test_poly = poly.transform(x_test)

        # Predict
        y_predicted = poly_reg_model.predict(x_poly)

        # Training error
        score = poly_reg_model.score(x_poly, y_train)
        print("Training R^2 Score: ", score)
        scores.append(score)

        # Plot data: actual train (green), actual test (red), model (orange)
        plt.plot(x_train, y_train, c='green')
        plt.plot(x_test, y_test, c='red')
        plt.plot(x_train, y_predicted, c='orange')
        plt.plot(x_test, poly_reg_model.predict(x_test_poly), c="orange")
        plt.show()

    return np.array(scores)

## SVR: All kernels (Linear, Polynomial, Radial)

In [None]:
def calculateSVR2(df):
    """Fit linear, polynomial and RBF SVRs to the closing prices and plot all three.

    The last row of ``df`` is left out. Row position is the single input
    feature. The R^2 of the RBF model on the fitted data is printed.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.
    """
    trimmed = df.head(len(df) - 1)

    close = [float(price) for price in trimmed.loc[:, 'Close']]
    days = np.arange(len(trimmed.loc[:, 'Date'])).reshape(-1, 1)

    # One model per kernel
    lin_svr = SVR(kernel='linear', C=1000.0).fit(days, close)
    poly_svr = SVR(kernel='poly', C=1000.0, degree=2).fit(days, close)
    rbf_svr = SVR(kernel='rbf', C=10.0, gamma=0.15).fit(days, close)

    print(rbf_svr.score(days, close))

    # Plot the data and each model's fit on one figure
    plt.figure(figsize=(16, 8))
    plt.plot(days, close, color='red', label='Data')
    fitted_curves = (
        (rbf_svr, 'green', 'RBF Model'),
        (poly_svr, 'blue', 'Polynomial Model'),
        (lin_svr, 'orange', 'Linear Model'),
    )
    for model, colour, caption in fitted_curves:
        plt.plot(days, model.predict(days), color=colour, label=caption)
    plt.xlabel('Date Integer')
    plt.ylabel('Prices')
    plt.title('Different Kernel SVRs')
    plt.legend()
    plt.show()

## Radial Basis SVR with Varying C parameter

In [None]:
def calculateRadialBasisSVR(df):
    """Fit an RBF SVR for C = 10, 15, ..., 195 and record the R^2 of each fit.

    The last row of ``df`` is left out. Row position is the single input
    feature and gamma is fixed at 0.15. Each R^2 is measured on the same data
    the model was fitted on.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.

    Returns:
        List of ``[C, R^2]`` pairs in increasing order of C.
    """
    df = df.head(len(df) - 1)

    close = [float(close_price) for close_price in df.loc[:, 'Close']]
    days = np.arange(len(close)).reshape(-1, 1)

    results = []
    for C_parameter in range(10, 200, 5):
        # Radial kernel
        rbf_svr = SVR(kernel='rbf', C=float(C_parameter), gamma=0.15)
        rbf_svr.fit(days, close)

        # Score once and reuse it: scoring predicts over the whole series
        R_score = rbf_svr.score(days, close)
        print("R^2 Score when C is " +
              str(C_parameter) + ": " + str(R_score))

        results.append([C_parameter, R_score])

    return results

## Heatmap for Radial Basis SVR with Varying C and Gamma

In [None]:
# Colour normalisation that pins a chosen midpoint to the middle of the colour
# scale, so that variation in the R^2 score is easier to see.
# Credit: Sklearn documentation
class MidpointNormalize(Normalize):
    """Piecewise-linear normalisation through (vmin, 0), (midpoint, 0.5), (vmax, 1)."""

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # The ``clip`` argument is accepted for signature compatibility but not used here.
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, anchors, targets))

# Function to generate heatmap
def generateRBFHeatmap(df):
    """Draw a heatmap of RBF SVR R^2 scores over a grid of C and gamma values.

    The last row of ``df`` is left out. Row position is the single input
    feature. C runs over 10, 15, ..., 100 (rows) and gamma over 0.1 to 1.0 in
    steps of 0.05 (columns). Each score is measured on the same data the model
    was fitted on.

    Args:
        df: DataFrame with 'Date' and 'Close' columns, already in time order.
    """
    df = df.head(len(df) - 1)

    close = [float(close_price) for close_price in df.loc[:, 'Close']]
    days = np.arange(len(close)).reshape(-1, 1)

    C_range = np.arange(10, 105, 5)
    gamma_range = np.arange(0.1, 1.05, 0.05)

    # Size the score table from the ranges themselves, so editing either range
    # cannot break a hard-coded shape.
    table = np.empty((len(C_range), len(gamma_range)))
    for row, C_parameter in enumerate(C_range):
        for col, gamma in enumerate(gamma_range):
            rbf_svr = SVR(kernel='rbf', C=float(C_parameter), gamma=gamma)
            rbf_svr.fit(days, close)
            R_score = rbf_svr.score(days, close)
            print("C: ", C_parameter)
            print("Gamma: ", gamma)
            print("R_score: ", R_score)
            table[row, col] = R_score

    print(table)
    plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(
        table,
        interpolation="nearest",
        cmap=plt.cm.hot,
        norm=MidpointNormalize(vmin=0.2, midpoint=0.65),
    )
    gamma_range = gamma_range.round(4)  # tidy tick labels
    plt.xlabel("gamma")
    plt.ylabel("C")
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    # NOTE(review): the values shown are R^2 scores on the fitted data, not a
    # held-out validation accuracy; the title is kept as originally written.
    plt.title("Validation accuracy")
    plt.show()


# The heatmap is produced in the "SVR: Plotting heatmap as hyperparameters vary"
# cell further below, after df_6752 has been created. Calling it here would
# raise NameError when the notebook is run from top to bottom.

## LSTM

In [2]:
def runLSTM(df):
    """Train a two-layer LSTM on closing prices and plot its predictions.

    The first 80% of the rows (rounded up) are used for training and the rest
    for evaluation. Each sample is the previous 60 scaled closes, and the
    target is the next close. RMSE and R^2 on the evaluation rows are printed,
    and the training data, actual values and predictions are plotted.

    Args:
        df: DataFrame with a 'Close' column, already in time order.

    Raises:
        ValueError: If the training part has 60 rows or fewer, so no training
            window can be built.
    """
    window = 60  # number of previous closes fed to the network per prediction

    data = df.filter(['Close'])
    # Convert to numpy array
    dataset = data.values

    # Get # of rows used for training
    training_data_len = math.ceil(len(dataset) * 0.8)
    if training_data_len <= window:
        raise ValueError(
            "runLSTM needs more than %d training rows, got %d"
            % (window, training_data_len))

    # Scale the data. The scaler is fitted on the training rows only, so the
    # price range of the evaluation period does not leak into training.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(dataset[:training_data_len])
    scaled_data = scaler.transform(dataset)

    # Create training data set: sliding windows and the value that follows each
    train_data = scaled_data[:training_data_len, :]
    xtrain = np.array([train_data[i - window:i, 0]
                       for i in range(window, len(train_data))])
    ytrain = np.array([train_data[i, 0]
                       for i in range(window, len(train_data))])

    # Reshape to (samples, timesteps, features) as expected by the LSTM layer
    xtrain = np.reshape(xtrain, (xtrain.shape[0], xtrain.shape[1], 1))

    # Build LSTM Model
    model = Sequential()
    model.add(LSTM(50, input_shape=(xtrain.shape[1], 1), return_sequences=True))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(xtrain, ytrain, batch_size=1, epochs=1)

    # Create testing data set; it starts `window` rows early so the first
    # evaluation row has a full history.
    print(training_data_len)
    test_data = scaled_data[training_data_len - window:, :]

    # Create data sets xtest, ytest (ytest stays in original price units)
    ytest = dataset[training_data_len:, :]
    print(len(test_data))
    xtest = np.array([test_data[i - window:i, 0]
                      for i in range(window, len(test_data))])

    # Reshape data
    xtest = np.reshape(xtest, (xtest.shape[0], xtest.shape[1], 1))

    # Get predicted price values, mapped back to price units
    predictions = model.predict(xtest)
    predictions = scaler.inverse_transform(predictions)

    # Get root mean squared error (RMSE)
    rmse = np.sqrt(np.mean((predictions - ytest) ** 2))
    print("RMSE:", rmse)

    print('R2 Score: ', r2_score(ytest, predictions))

    # Plot
    train = data[:training_data_len]
    # Copy before adding a column so the assignment does not target a slice of `data`
    valid = data[training_data_len:].copy()
    valid['Predictions'] = predictions

    # Visualize
    plt.figure(figsize=(16, 8))
    plt.title('Model')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.plot(train['Close'])
    plt.plot(valid[['Close', 'Predictions']])
    plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
    plt.show()




## Running the Functions

### Fetch data for Panasonic (Code 6752)

In [None]:
# Rows for securities code 6752 that have a 'High' value; every run below uses this subset
df_6752 = fetchCompanyData(6752, df)

### Linear Regression 1: Random train-test split

In [None]:
# Best fit out of 100 random 80/20 splits; the returned value is its R^2 over the whole series
r2score = simpleLinearRegression(df_6752)
print(r2score)

### Linear Regression 2: Attempt at Price Forecasting

In [None]:
# Chronological 80/20 fit; the returned R^2 covers the whole series, not only the held-out tail
r2score = simpleLinearRegression2(df_6752)
print(r2score)

### Polynomial Regression

In [None]:
# Fits and plots polynomial degrees 2 through 20; the return value holds the training R^2 per degree
polynomialRegression(df_6752)

### SVR: Linear, Polynomial and Radial Basis Kernels

In [None]:
# Fits linear, polynomial and RBF SVRs and plots all three against the closing prices
calculateSVR2(df_6752)

### SVR: Plotting the Radial Basis SVR R^2 score as hyperparameter C changes

In [None]:
# Sweep C for the RBF SVR; each result entry is a [C, R^2] pair
radial_basis_accuracy = calculateRadialBasisSVR(df_6752)
print(radial_basis_accuracy[0:5])

# Echo every pair, then separate the pairs into the two plot series
for x in radial_basis_accuracy:
    print(x)
x_val = [pair[0] for pair in radial_basis_accuracy]
y_val = [pair[1] for pair in radial_basis_accuracy]

print(x_val[0:5])
plt.plot(x_val, y_val)
plt.xlabel('C Parameter')
plt.ylabel('R^2 Score')
plt.title('Relation between R^2 Score and C Parameter')
plt.show()

### SVR: Plotting heatmap as hyperparameters vary

In [None]:
# Fits an RBF SVR for every (C, gamma) pair in the grid and shows the R^2 scores as a heatmap
generateRBFHeatmap(df_6752)

### Running LSTM on Dataset

In [None]:
# Trains the LSTM on the first 80% of closing prices and plots its predictions for the remainder
runLSTM(df_6752)