In [None]:
import matplotlib
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, median_absolute_error
from sklearn.model_selection import cross_val_score

In [None]:
# Notebook-wide settings.
src_path = "indexProcessed.csv"   # CSV file loaded into the main DataFrame
test_rate = 0.2                   # proportion of rows used as the testing dataset

In [None]:
# Read the comma-separated source file into the working DataFrame, then show it.
df = pd.read_csv(src_path, sep=',')
# Disabled step, kept for reference: df = df.drop(df.columns[[0]], axis = 1)
display(df)

In [None]:
# Normalise the column headings first (lower case, spaces -> underscores).
# Doing this before the date conversion keeps the cell re-runnable: after the
# first run there is no longer a column called 'Date', only 'date'.
df.columns = [str(col).lower().replace(' ', '_') for col in df.columns]

# Parse the YYYY-MM-DD strings into datetime values. Assign the whole column
# instead of using `df.loc[:, col] = ...`: recent pandas versions perform that
# form of assignment in place, which can leave the column with object dtype.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

df.head(10)

In [None]:
# Find the row positions at which the value of the 'index' column changes.
# Every such position is the first row of a new block of rows; position 0 is
# always recorded because the first row is compared against the None sentinel.
if len(df) > 0:
    # Positional lookup, so this also works when the frame's row labels
    # are not 0..n-1.
    print(df['index'].iloc[0])
print(type(df))

change_index = []
previous_label = None   # label seen on the previous row; None = no row seen yet

## find points index to split stocks
for position, label in enumerate(df['index']):
    if label != previous_label:
        print(label, position)
        change_index.append(position)
        previous_label = label
print(change_index, len(change_index))

In [None]:
# Display the first 8495 rows of the frame.
# NOTE(review): 8495 presumably corresponds to a boundary reported in
# change_index - confirm, and consider deriving it instead of hard-coding.
df.iloc[:8495]

In [None]:
# ax = matplotlib.pyplot.subplots(4, 4)
# Default size (width, height) applied to figures created after this cell.
rcParams['figure.figsize'] = (6, 3)

# Reset the scratch variables used by the (currently disabled) plotting loop.
times = 0
index = 0

# while times < len(change_index):
#         if times == 0:
#                 ax = df.iloc[: change_index[times + 1]].plot(x = 'date', y = 'adj_close', style = 'b-', grid = True)
#                 index = change_index[times]
#                 ax.set_xlabel(df['index'][change_index[times]])
#                 times += 1
#                 continue

#         elif times == len(change_index) - 1:
#                 ax = df.iloc[change_index[times]:].plot(x = 'date', y = 'adj_close', style = 'b-', grid = True)
#                 ax.set_xlabel(df['index'][change_index[times]])
#                 break

#         else:
#                 ax = df.iloc[index: change_index[times + 1]].plot(x = 'date', y = 'adj_close', style = 'b-', grid = True)
#                 index = change_index[times]
#                 ax.set_xlabel(df['index'][change_index[times]])
#                 times += 1


## Data processing function

In [None]:
def wash_data(X, test_rate):
    """Split one price series into train/test sets for next-row prediction.

    The last ``int(test_rate * len(X))`` rows form the test split and the rows
    before them form the training split. Within each split, the feature values
    of one row are paired (by position) with the ``adj_close`` of the following
    row, so each split loses one row.

    Missing values in both splits are filled with the column means of the
    *training* split before shifting. Previously the test split was not
    filled, so a NaN in the test features dropped rows from ``X_test`` that
    were still present in ``Y_test`` and ``date_val``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``date``, ``high``, ``low``, ``close``,
        ``volume`` and ``adj_close``.
        NOTE(review): rows are presumably in chronological order - confirm.
    test_rate : float
        Proportion of rows placed in the test split.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, date_val)``. The X/Y pairs line
        up by position, not by index label (their labels differ by one).
        ``date_val`` holds the ``date`` of each test feature row and is what
        the caller uses as the x axis when plotting.
    """
    feature_cols = ['high', 'low', 'close', 'volume']

    # Calculate the number of data entries in the training and test sets
    num_test = int(test_rate * len(X))
    num_train = len(X) - num_test

    train_part, test_part = X[:num_train], X[num_train:]

    X_train, Y_train = train_part[feature_cols], train_part['adj_close']
    X_test, Y_test = test_part[feature_cols], test_part['adj_close']

    # Imputation statistics come from the training split only, so no
    # information from the test split leaks into the fill values.
    feature_means = X_train.mean()
    target_mean = Y_train.mean()

    # Features are shifted down by one row and targets up by one row; after
    # dropping the NaN row each shift creates, position k of the features
    # holds row k's values and position k of the target holds row k+1's.
    X_train = X_train.fillna(feature_means).shift(1).dropna()
    Y_train = Y_train.fillna(target_mean).shift(-1).dropna()
    X_test = X_test.fillna(feature_means).shift(1).dropna()
    Y_test = Y_test.fillna(target_mean).shift(-1).dropna()

    # Dates of the test feature rows (x values for plotting)
    date_val = test_part['date'].shift(1).dropna()

    return X_train, Y_train, X_test, Y_test, date_val

## Regression switch function (linear, polynomial, and Lasso)

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

def regression_model(X, test_rate, mode, name, degree = 2, alpha = 5):
    """Fit one regression model on a price series, score it and plot it.

    The series is split and shifted by ``wash_data``; the chosen model is
    fitted on the training split and evaluated on the test split. A figure is
    drawn with the test targets as blue dots and the predictions as a red line.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows of a single series, in the form expected by ``wash_data``.
    test_rate : float
        Proportion of rows used for testing (passed to ``wash_data``).
    mode : str
        ``'linear'`` or ``'polynomial'``; any other value selects Lasso.
    name : str
        Text used as the x-axis label of the figure.
    degree : int
        Polynomial degree, used only when ``mode == 'polynomial'``.
    alpha : float
        Lasso regularisation strength, used only for the Lasso branch.

    Returns
    -------
    list
        ``[mean squared error, R^2, explained variance, median absolute error]``
        computed on the test split. The same list is printed.
    """
    # Silence scikit-learn convergence warnings. Note that this changes the
    # process-wide warning filter, not just this call.
    simplefilter("ignore", category=ConvergenceWarning)

    X_train, Y_train, X_test, Y_test, date_val = wash_data(X, test_rate)

    if mode == 'linear':
        print("linear regression")
        regression = linear_model.LinearRegression()
        regression.fit(X_train, Y_train)
        Y_pred = regression.predict(X_test)

    elif mode == 'polynomial':
        print("polynomial regression")
        poly = PolynomialFeatures(degree = degree)
        regression = linear_model.LinearRegression()
        regression.fit(poly.fit_transform(X_train), Y_train)
        # Only transform the test features: the expansion was already fitted
        # on the training data and must not be re-fitted on the test split.
        Y_pred = regression.predict(poly.transform(X_test))

    else:
        print("Lasso")
        regression = linear_model.Lasso(alpha = alpha)
        regression.fit(X_train, Y_train)
        Y_pred = regression.predict(X_test)

    # Note: the last entry is the *median* absolute error.
    res = [
        mean_squared_error(Y_test, Y_pred),
        r2_score(Y_test, Y_pred),
        explained_variance_score(Y_test, Y_pred),
        median_absolute_error(Y_test, Y_pred),
    ]

    # The figure size is set through rcParams, so it also applies to figures
    # created after this call.
    rcParams['figure.figsize'] = 10,4
    fig, ax = plt.subplots(1, 1)
    ax.scatter(date_val, Y_test, color="blue", marker = '.')
    ax.plot(date_val, Y_pred, color="red")
    ax.set_xlabel(name)
    ax.set_ylabel('USD')

    print(res)

    return res

    # ax = plt.scatter(date_val[1: ], Y_test[: len(Y_test) - 1], color="blue", marker = '.')
    # bx = plt.plot(date_val[1: ], Y_pred[1: ], color="red")

## Main function

In [None]:
model = 'linear' # linear, polynomial and lasso

# One entry per stock: [stock name, mse, r2, explained variance, median abs. error]
result = []

print('mean squared error', '| Coefficient of determination', '| explained_variance_score', '| mae')

# change_index holds the starting row position of each stock's block of rows.
# Position 0 is added defensively so the first block is never skipped, and the
# empty case (no rows / no boundaries) simply runs no regressions.
segment_starts = sorted({0, *change_index}) if len(change_index) > 0 else []
# Each block ends where the next one starts; the last block runs to the end.
segment_stops = segment_starts[1:] + [len(df)]

for start, stop in zip(segment_starts, segment_stops):
    current_stock = df['index'].iloc[start]
    res = [current_stock]
    print(res)

    # Fit on exactly this stock's rows. The previous loop sliced middle blocks
    # from the *previous* boundary, which mixed in the preceding stock's rows,
    # and indexed past the end of change_index when only one stock was present.
    metrics = regression_model(df.iloc[start:stop], test_rate, model, current_stock)

    result.append(res + list(metrics))