In [None]:
import matplotlib
import numpy as np
import pandas as pd

from datetime import date, datetime, time, timedelta
from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# https://zhuanlan.zhihu.com/p/56507515
# https://zhuanlan.zhihu.com/p/410655154
# https://zhuanlan.zhihu.com/p/21275919

In [None]:
# --- Input data ---
src_path = 'indexProcessed.csv'

# --- Dataset split proportions ---
test_size = 0.1  # fraction of the dataset held out as the test set
cv_size = 0.2    # fraction of the dataset used as the cross-validation set

# --- Lag features ---
# For the feature at day t, the lags t-1, t-2, ..., t-N are used as features;
# Nmax is the largest N that will be tried.
Nmax = 2

# --- Plot appearance ---
fontsize = 14
ticklabelsize = 14

In [None]:
# Read the comma-separated source file and discard its 'Index' column,
# then render the resulting frame as the cell output.
df = pd.read_csv(src_path, sep=",").drop(columns=['Index'])
display(df)

In [None]:
# Normalise column headings first (lower case, spaces -> underscores) so the
# rest of the cell only refers to the normalised names. This also makes the
# cell safe to re-run: the headings are already normalised the second time.
df.columns = [str(col).lower().replace(' ', '_') for col in df.columns]

# Parse the date column into datetime64. Plain column assignment is used on
# purpose: `df.loc[:, col] = ...` writes into the existing object-dtype column
# in place on pandas >= 2.0, which leaves the dtype as object and makes the
# `.dt` accessor below raise AttributeError.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Calendar month of each sample
df['month'] = df['date'].dt.month

# Order the rows by date, oldest first
df = df.sort_values(by='date', ascending=True)

df.head(10)

In [None]:
# Figure size for subsequent plots: width 10, height 8
rcParams['figure.figsize'] = (10, 8)

# Adjusted close against date, drawn as a red line with the grid enabled
ax = df.plot.line(x='date', y='adj_close', style='r-', grid=True)
ax.set_xlabel("date")

In [None]:
# Row counts per partition: cv and test are fixed fractions of the data,
# and training takes whatever rows are left over.
total_rows = len(df)
num_cv = int(cv_size * total_rows)
num_test = int(test_size * total_rows)
num_train = total_rows - num_cv - num_test

print("total_num = " + str(total_rows))
print("num_train = " + str(num_train))
print("num_cv = " + str(num_cv))
print("num_test = " + str(num_test))

# Consecutive row ranges of df: train first, then cv, then test.
# train_cv covers the train and cv ranges together.
cv_end = num_train + num_cv
train = df[:num_train]
cv = df[num_train:cv_end]
train_cv = df[:cv_end]
test = df[cv_end:]

print("train.shape = " + str(train.shape))
print("cv.shape = " + str(cv.shape))
print("train_cv.shape = " + str(train_cv.shape))
print("test.shape = " + str(test.shape))

In [None]:
# Adjusted close over time, one line colour per partition
rcParams['figure.figsize'] = (10, 8)  # width 10, height 8

ax = None  # the first plot call creates the axes; later calls draw onto them
for partition, line_style in ((train, 'b-'), (cv, 'y-'), (test, 'g-')):
    ax = partition.plot(x='date', y='adj_close', style=line_style, grid=True, ax=ax)

ax.legend(['train', 'dev', 'test'])
ax.set_xlabel("date")
ax.set_ylabel("USD")

In [None]:
# Build the feature matrices and target vectors for the regression.
# The first num_train rows are used for fitting; every remaining row
# (df[num_train:]) is held out for evaluation.
non_feature_cols = ['adj_close', 'date', 'closeusd']  # target, timestamp, and the 'closeusd' column

train_rows = df[:num_train]
heldout_rows = df[num_train:]

# .drop() returns new frames rather than deleting columns from slices of df
# in place, so df and its slices are left untouched and the cell can be re-run.
X_train, Y_train = train_rows.drop(columns=non_feature_cols), train_rows.adj_close
X_test, Y_test = heldout_rows.drop(columns=non_feature_cols), heldout_rows.adj_close

# Dates of the held-out rows, kept so predictions can be plotted against time
date_val = heldout_rows['date']

# Fill missing training targets with the training-target mean
mean = Y_train.mean()
print(mean)
Y_train = Y_train.fillna(mean)

# Fill missing feature values with the per-column training means. The same
# training means are applied to X_test so that missing held-out features are
# imputed before predict() is called, without using statistics from held-out rows.
mean = X_train.mean()
X_train = X_train.fillna(mean)
X_test = X_test.fillna(mean)

# Y_test is deliberately left as-is so evaluation uses the recorded targets.
print(X_train, Y_train)

In [None]:
from sklearn import linear_model

# Least-squares linear regression fitted on the training features and targets.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(X_train, Y_train)

# Cell output: the fitted intercept
regr.intercept_

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions for the held-out feature rows
Y_pred = regr.predict(X_test)
print(Y_pred, len(Y_pred))

# Fitted coefficients
print("Coefficients: \n", regr.coef_)

# Mean squared error of the predictions against the held-out targets
mse = mean_squared_error(Y_test, Y_pred)
print("Mean squared error: %.2f" % mse)

# Coefficient of determination (1 is perfect prediction)
r2 = r2_score(Y_test, Y_pred)
print("Coefficient of determination: %.2f" % r2)

In [None]:
print(len(Y_pred), len(Y_test))

# Held-out targets as blue points, predictions as a thick red line,
# both drawn on the current axes against the held-out dates.
axes = plt.gca()
axes.scatter(date_val, Y_test, color="blue")
axes.plot(date_val, Y_pred, color="red", linewidth=3)
plt.show()

In [None]:
from sklearn.metrics import explained_variance_score

print(explained_variance_score(Y_test, Y_pred))

# Count the held-out samples whose actual and predicted values differ once
# both are truncated toward zero (the same comparison as int(a) != int(b)),
# done in one vectorised pass instead of a manual counter loop.
actual_whole = np.trunc(np.asarray(Y_test, dtype=float))
predicted_whole = np.trunc(np.asarray(Y_pred, dtype=float))
diff = int(np.count_nonzero(actual_whole != predicted_whole))

# Report the count and the mismatch rate as a percentage with two decimals.
# The rate has to be passed through str.format: handing a template string and
# the number to print() as separate arguments prints the template verbatim.
print(diff, "{:.2%}".format(diff / len(Y_pred)))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores of the regressor on the held-out rows
scores = cross_val_score(estimator=regr, X=X_test, y=Y_test, cv=5)
print(scores)