# Continuous Assessment - Stock Price Prediction
### Table of contents
- Import libraries
- Read in data
- Data Preprocessing
- Exploratory analysis
- Preprocessing/Splitting function
- Ridge Regression
- Linear Regression
- Exploratory Analysis Continued
- Multivariate Linear Regression

### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Reading Data

We have three data sets: `indexData`, `indexInfo`, and `indexProcessed`. The data and processed datasets have one difference: an extra column in the processed dataset that represents the local currency translated into USD.

In [None]:
# Raw index prices in local currency
data_df = pd.read_csv("../data/archive/indexData.csv")
# Per-index metadata; joined to the prices on the 'Index' column further down
info_df = pd.read_csv("../data/archive/indexInfo.csv")
# Same prices as indexData plus one extra column with the close converted to USD
processed_df = pd.read_csv("../data/archive/indexProcessed.csv")

# Preprocessing Data
From our three files, the important one is `indexProcessed`, as it contains the most information relevant to a regression. I also join the processed and info dataframes so that all of the information is available in one place.

In [None]:
processed_df.head()

In [None]:
processed_df.columns.values

In [None]:
data_df.columns.values

In [None]:
set(processed_df.columns.values) - set(data_df.columns.values)

In [None]:
info_df

In [None]:
# Attach the metadata in info_df to every price row, matching rows on the 'Index' column
processed_info_df = processed_df.merge(info_df, on='Index')
processed_info_df.head()

In [None]:
# Convert the Date column to pandas datetimes; train_test_split later does
# arithmetic on the (date) index, which needs real datetime values.
processed_info_df['Date'] = pd.to_datetime(processed_info_df['Date'])
processed_info_df.head()

# Exploratory Analysis
I create a heatmap and a pairplot of each of the features in `indexProcessed`. This shows that some fields are very highly correlated and some are not correlated whatsoever.

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly so the
# text/date columns brought in by the merge cannot make corr() raise
# (pandas >= 2.0 no longer drops non-numeric columns silently).
numeric_columns_df = processed_info_df.select_dtypes(include='number')
sns.heatmap(numeric_columns_df.corr())

In [None]:
sns.pairplot(processed_info_df)

I then calculate which index has the highest return over the period recorded in the data set. The index with the highest return is by far `IXIC`: its closing price grew roughly 140-fold from start to finish (on the order of 14,000%). If you had invested £1,000 in 1971, that money would now be worth about £140,000.


In [None]:
# Distinct index tickers present in the merged data (np.unique returns them sorted)
all_stocks = np.unique(processed_info_df['Index'])

In [None]:
sns.lineplot(data=processed_info_df, x='Date', y='CloseUSD', hue='Index', lw=1)

In [None]:
# One date-indexed frame of USD closing prices per index ticker
stock_dfs = {}
for stock in all_stocks:
    belongs_to_stock = processed_info_df['Index'] == stock
    closes = processed_info_df.loc[belongs_to_stock, ['Date', 'CloseUSD']]
    stock_dfs[stock] = closes.set_index('Date')
stock_dfs['IXIC']

In [None]:
# Percentage return of each index between its first and last recorded date
stock_returns = {}

for name, df in stock_dfs.items():
    close = df['CloseUSD']
    # Look the prices up by date rather than by position, so the result does
    # not depend on the frame being sorted.
    start_price = close[df.index == df.index.min()].iloc[0]
    final_price = close[df.index == df.index.max()].iloc[0]
    # A return is the relative *change* in price: subtract the initial stake so
    # that an unchanged price reports 0% instead of 100%.
    stock_returns[name] = (final_price / start_price - 1) * 100

In [None]:
# One row per index ticker, with a single column holding its percentage figure
stock_returns_df = pd.DataFrame.from_dict(
    stock_returns,
    orient='index',
    columns=['Return %'],
)

ax = sns.barplot(data=stock_returns_df, x=stock_returns_df.index, y='Return %')
ax.set_xlabel('Index')
ax.set_ylabel('Return %')
# Rotate the ticker labels on the x axis by 70 degrees
plt.xticks(rotation=70)
plt.show()

# Preprocessing/Splitting Functions
These functions are the base for: 
 - Picking the feature columns;
 - Scaling the data;
 - Splitting data into train-test splits;
 - Splitting data into dependent-independent splits.

In [None]:
def pick_feature_columns(df):
    """Return a new frame holding only the columns used as model features.

    Args:
        df: DataFrame that contains at least a 'CloseUSD' column.

    Returns:
        An independent copy of ``df`` restricted to the feature columns.
        The scaling helpers assign into the frame they are given, so handing
        back an explicit copy guarantees those writes never reach (or warn
        about) the caller's original data.
    """
    feature_columns = ['CloseUSD']
    return df[feature_columns].copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scale_data(df):
    """Min-max scale the 'CloseUSD' column.

    The scaler is fitted on every row of ``df`` that is passed in.

    Args:
        df: DataFrame with a 'CloseUSD' column. It is not modified.

    Returns:
        Tuple ``(scaler, scaled_df)``: the fitted ``MinMaxScaler`` (needed to
        invert the scaling later) and a copy of ``df`` whose 'CloseUSD' column
        holds the scaled values.
    """
    scaler = MinMaxScaler()
    # Write into a copy so the caller's frame keeps its original prices
    scaled_df = df.copy()
    scaled_df[['CloseUSD']] = scaler.fit_transform(df[['CloseUSD']])
    return scaler, scaled_df

In [None]:
from sklearn.preprocessing import StandardScaler

def standard_scale_data(df):
    """Standard-scale (z-score) the 'CloseUSD' column.

    The scaler is fitted on every row of ``df`` that is passed in.

    Args:
        df: DataFrame with a 'CloseUSD' column. It is not modified.

    Returns:
        Tuple ``(scaler, scaled_df)``: the fitted ``StandardScaler`` (needed to
        invert the scaling later) and a copy of ``df`` whose 'CloseUSD' column
        holds the scaled values.
    """
    scaler = StandardScaler()
    # Write into a copy so the caller's frame keeps its original prices
    scaled_df = df.copy()
    scaled_df[['CloseUSD']] = scaler.fit_transform(df[['CloseUSD']])
    return scaler, scaled_df

In [None]:
def train_test_split(df, training_split=0.8):
    """Split a frame into an earlier training part and a later test part.

    The cutoff is placed ``training_split`` of the way between the smallest
    and largest index value, so the split is by position in time rather than
    by row count.

    Args:
        df: Frame whose index supports subtraction and scaling by a float
            (e.g. a DatetimeIndex).
        training_split: Fraction of the index span assigned to training.

    Returns:
        Tuple ``(train_data, test_data)``; rows exactly at the cutoff go to
        the test part.
    """
    earliest = df.index.min()
    latest = df.index.max()
    cutoff_date = earliest + (latest - earliest) * training_split

    in_train = df.index < cutoff_date
    in_test = df.index >= cutoff_date
    return df[in_train], df[in_test]

In [None]:
def dependent_independent_split(df, n_data_points=1):
    """Turn a single-column series into sliding-window regression samples.

    Row ``i`` of ``x`` holds ``n_data_points`` consecutive values and row ``i``
    of ``y`` holds the value that immediately follows that window.

    Args:
        df: One-column DataFrame (a Series or 1-D array also works), ordered
            the way the windows should run.
        n_data_points: Window length, i.e. how many preceding values are used
            as features for each sample. Must be at least 1.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays shaped
        ``(len(df) - n_data_points, n_data_points)`` and
        ``(len(df) - n_data_points, 1)``. Both have zero rows when the input
        is not longer than the window.

    Raises:
        ValueError: If ``n_data_points`` is below 1, or ``df`` has more than
            one column.
    """
    if n_data_points < 1:
        raise ValueError(f"n_data_points must be >= 1, got {n_data_points}")

    values = np.asarray(df)
    if values.ndim > 1:
        if values.ndim != 2 or values.shape[1] != 1:
            raise ValueError(
                f"expected a single column of values, got shape {values.shape}"
            )
        values = values[:, 0]

    n_samples = max(len(values) - n_data_points, 0)
    # Broadcasting builds an index matrix whose row i is
    # [i, i + 1, ..., i + n_data_points - 1]; a single indexing pass then
    # replaces a Python loop of per-row DataFrame slices.
    window_index = np.arange(n_samples)[:, None] + np.arange(n_data_points)
    x = values[window_index]
    y = values[n_data_points:].reshape(-1, 1)
    return x, y

# Batch Data Processing/Training Functions
These functions are responsible for using the functions above to handle a batch of preprocessing/training at once.

In [None]:
# Number of preceding closing prices used as features for each sample
NUMBER_DATA_POINTS = 90

def process_df(df):
    """Prepare one index's frame for modelling, with z-score scaling.

    Keeps the feature column, standard-scales it, splits it by date and
    windows each part with NUMBER_DATA_POINTS lagged values. The scaler is
    fitted before the split, i.e. on training and test rows together.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler)``.
    """
    features = pick_feature_columns(df)
    scaler, features = standard_scale_data(features)
    train_part, test_part = train_test_split(features)
    X_train, y_train = dependent_independent_split(train_part, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_part, NUMBER_DATA_POINTS)
    return X_train, X_test, y_train, y_test, scaler

In [None]:
from sklearn.linear_model import LinearRegression

def train_model(X_train, y_train):
    """Fit a LinearRegression on the given training windows and return it."""
    return LinearRegression(n_jobs=1).fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

def score_model(scaler, actual, predicted):
    """Compute RMSE, R2 and MAPE for a set of predictions.

    Args:
        scaler: Fitted scaler used on the data, or None if the data was not
            scaled. When given, both arrays are inverse-transformed first so
            the metrics are computed on the original values.
        actual: True target values.
        predicted: Model predictions, same shape as ``actual``.

    Returns:
        Dict with the keys "RMSE", "R2" and "MAPE".
    """
    if scaler is not None:
        actual = scaler.inverse_transform(actual)
        predicted = scaler.inverse_transform(predicted)
    # The `squared=False` argument was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the root ourselves works on every version.
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return {
        "RMSE": rmse,
        "R2": r2_score(actual, predicted),
        "MAPE": mean_absolute_percentage_error(actual, predicted),
    }

# Ridge Regression
For this first regression, we pick the index `IXIC`, as it has the highest return in the data set. I then use `GridSearchCV` to find the optimal values of `alpha` and `tol` on the training split. We then score the accuracy and plot the predicted values against the true values. Finally, we re-create the regression line.

In [None]:
# Each sample uses the previous 120 closing prices as features
NUMBER_DATA_POINTS = 120

stock_data = pick_feature_columns(stock_dfs['IXIC'])

train_data, test_data = train_test_split(stock_data)
X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

# Training portion in blue, test portion in red
fig, ax = plt.subplots()
ax.plot(train_data, 'b')
ax.plot(test_data, 'r')
ax.set_xlabel("Date")
ax.set_ylabel("CloseUSD")
plt.show()

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Base Ridge estimator whose hyper-parameters are tuned by the grid search
ridge_model = Ridge()
parameters = {
    'alpha': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
}

# Try every alpha/tol combination on the training windows and keep the best one
optimised_model = GridSearchCV(ridge_model, parameters)
optimised_model.fit(X_train, y_train)

print(optimised_model.best_estimator_)

In [None]:
# Predict the test windows and force a column vector: one row per test sample
predictions = optimised_model.predict(X_test).reshape(-1, 1)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root of the mean squared error is taken explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, predictions)))
print("R-squared: ", r2_score(y_test, predictions))

In [None]:
# Predictions (red) against the true test values (blue), in test-sample order
fig, ax = plt.subplots()
ax.plot(predictions, 'r')
ax.plot(y_test, 'b')
ax.grid(True)
ax.set_xlabel('Timestep')
ax.set_ylabel('CloseUSD')
plt.show()

In [None]:
# Collapse every input window to its mean so each sample has a single x value
X_test_mean = X_test.mean(axis=1)
y_pred = optimised_model.predict(X_test)

# True targets as black dots, model output as the red line
fig, ax = plt.subplots()
ax.scatter(X_test_mean, y_test, color='k')
ax.plot(X_test_mean, y_pred, color='r')
ax.set_xlabel('CloseUSD')
ax.set_ylabel('CloseUSD')
plt.show()

### Ridge Regression on All Stocks
This is running a ridge regression on all of the stocks in the data set, and finding the mean of all of the metrics that we have selected for our comparisons. We redefine `process_df` and `train_model` to include no scaling, and Ridge regression model respectively.

In [None]:
# Number of preceding closing prices used as features for each sample
NUMBER_DATA_POINTS = 90

def process_df(df):
    """Prepare one index's frame for modelling, without any scaling.

    Keeps the feature column, splits it by date and windows each part with
    NUMBER_DATA_POINTS lagged values.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, None)``; the last slot is
        where the scaled variant of this function returns its scaler.
    """
    features = pick_feature_columns(df)
    train_part, test_part = train_test_split(features)
    X_train, y_train = dependent_independent_split(train_part, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_part, NUMBER_DATA_POINTS)
    return X_train, X_test, y_train, y_test, None

In [None]:
from sklearn.linear_model import Ridge

def train_model(X_train, y_train):
    """Fit a Ridge regression (alpha=0.01, tol=0.1) on the training windows.

    This definition replaces the earlier LinearRegression-based
    ``train_model`` for every cell executed after it.
    """
    return Ridge(alpha=0.01, tol=0.1).fit(X_train, y_train)

In [None]:
# Fit and score one model per index; results are keyed by ticker
performances = {}
for name, df in stock_dfs.items():
    X_train, X_test, y_train, y_test, scaler = process_df(df)
    model = train_model(X_train, y_train)
    performances[name] = score_model(scaler, y_test, model.predict(X_test))

# One row per index, one column per metric
performances_df = pd.DataFrame.from_dict(performances, orient='index')
performances_df

In [None]:
performances_df.mean()

# Linear Regression
For the first linear regression, I am repeating the same steps that I have previously done with Ridge regression, but with only one independent variable.

In [None]:
# A single lagged closing price is the only feature of each sample
NUMBER_DATA_POINTS = 1

stock_data = pick_feature_columns(stock_dfs['IXIC'])

train_data, test_data = train_test_split(stock_data)
X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

# Training portion in blue, test portion in red
fig, ax = plt.subplots()
ax.plot(train_data, 'b')
ax.plot(test_data, 'r')
ax.set_xlabel("Date")
ax.set_ylabel("CloseUSD")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training windows; fit() hands back the estimator itself
lr_model = LinearRegression(n_jobs=1).fit(X_train, y_train)
lr_model

In [None]:
predictions = lr_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root of the mean squared error is taken explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, predictions)))
print("R-squared: ", r2_score(y_test, predictions))

In [None]:
# Predictions (red) against the true test values (blue), in test-sample order
fig, ax = plt.subplots()
ax.plot(predictions, 'r')
ax.plot(y_test, 'b')
ax.grid(True)
ax.set_xlabel('Timestep')
ax.set_ylabel('CloseUSD')
plt.show()

The graph below clearly shows a linear regression, as it creates a straight line (red line) to predict the data found in the test set (black dots).

In [None]:
# Collapse every input window to its mean so each sample has a single x value
X_test_mean = X_test.mean(axis=1)
y_pred = lr_model.predict(X_test)

# True targets as black dots, model output as the red line
fig, ax = plt.subplots()
ax.scatter(X_test_mean, y_test, color='k')
ax.plot(X_test_mean, y_pred, color='r')
ax.set_xlabel('CloseUSD')
ax.set_ylabel('CloseUSD')
plt.show()

### Multivariate Linear Regression
This attempt is similar to the others, but with 150 independent variables in a multiple linear regression.

In [None]:
# Each sample uses the previous 150 closing prices as features
NUMBER_DATA_POINTS = 150

stock_data = pick_feature_columns(stock_dfs['IXIC'])

train_data, test_data = train_test_split(stock_data)
X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

# Training portion in blue, test portion in red
fig, ax = plt.subplots()
ax.plot(train_data, 'b')
ax.plot(test_data, 'r')
ax.set_xlabel("Date")
ax.set_ylabel("CloseUSD")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training windows; fit() hands back the estimator itself
mlr_model = LinearRegression(n_jobs=1).fit(X_train, y_train)
mlr_model

In [None]:
predictions = mlr_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root of the mean squared error is taken explicitly.
print("RMSE: ", np.sqrt(mean_squared_error(y_test, predictions)))
print("R-squared: ", r2_score(y_test, predictions))

In [None]:
# Predictions (red) against the true test values (blue), in test-sample order
fig, ax = plt.subplots()
ax.plot(predictions, 'r')
ax.plot(y_test, 'b')
ax.grid(True)
ax.set_xlabel("Timestep")
ax.set_ylabel("CloseUSD")
plt.show()

In [None]:
# Collapse every input window to its mean so each sample has a single x value
X_test_mean = X_test.mean(axis=1)
y_pred = mlr_model.predict(X_test)

# True targets as black dots, model output as the red line
fig, ax = plt.subplots()
ax.scatter(X_test_mean, y_test, color='k')
ax.plot(X_test_mean, y_pred, color='r')
ax.set_xlabel('CloseUSD')
ax.set_ylabel('CloseUSD')
plt.show()

### Linear Regression with Various Scalers
In this section, I try to determine whether any of the scaling techniques has a noticeable impact on the model's accuracy metrics. I try MinMaxScaler, StandardScaler, and no scaling. The result is that scaling does not have any effect on the accuracy metrics of the model.

In [None]:
# No scaling
NUMBER_DATA_POINTS = 1

performances = {}
for name, df in stock_dfs.items():
    df = pick_feature_columns(df)

    train_data, test_data = train_test_split(df)
    X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

    # Build the estimator explicitly: by this point the notebook has redefined
    # `train_model` to fit a Ridge model, which is not what this section measures.
    model = LinearRegression(n_jobs=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores = score_model(None, y_test, predictions)
    performances[name] = scores

# One row per index, one column per metric
performances_df = pd.DataFrame.from_dict(data=performances, orient='index')
performances_df

In [None]:
# Z-score scaling
NUMBER_DATA_POINTS = 1

performances = {}
for name, df in stock_dfs.items():
    df = pick_feature_columns(df)
    scaler, df = standard_scale_data(df)

    train_data, test_data = train_test_split(df)
    X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

    # Build the estimator explicitly: by this point the notebook has redefined
    # `train_model` to fit a Ridge model, which is not what this section measures.
    model = LinearRegression(n_jobs=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # score_model undoes the scaling before computing the metrics
    scores = score_model(scaler, y_test, predictions)
    performances[name] = scores

# One row per index, one column per metric
performances_df = pd.DataFrame.from_dict(data=performances, orient='index')
performances_df

In [None]:
# MinMax scaling
NUMBER_DATA_POINTS = 1

performances = {}
for name, df in stock_dfs.items():
    df = pick_feature_columns(df)
    scaler, df = min_max_scale_data(df)

    train_data, test_data = train_test_split(df)
    X_train, y_train = dependent_independent_split(train_data, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_data, NUMBER_DATA_POINTS)

    # Build the estimator explicitly: by this point the notebook has redefined
    # `train_model` to fit a Ridge model, which is not what this section measures.
    model = LinearRegression(n_jobs=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # score_model undoes the scaling before computing the metrics
    scores = score_model(scaler, y_test, predictions)
    performances[name] = scores

# One row per index, one column per metric
performances_df = pd.DataFrame.from_dict(data=performances, orient='index')
performances_df

### Multiple Linear Regression with Varying Number of Independent Variables
To determine how much the number of independent variables affects the accuracy of the model, I fit simple and multiple linear regressions while varying the number of previous days of data used as features. The outcome is that it has little to no impact on the results.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

# Window lengths (number of lagged closing prices) to compare
number_of_points = [1, 10, 100]

# The series, its scaling and the train/test split do not depend on the window
# length, so they are prepared once instead of once per value of n.
stock_data = pick_feature_columns(stock_dfs['HSI'])
scaler, stock_data = standard_scale_data(stock_data)
train_data, test_data = train_test_split(stock_data)

for n in number_of_points:
    X_train, y_train = dependent_independent_split(train_data, n)
    X_test, y_test = dependent_independent_split(test_data, n)

    mlr_model = LinearRegression(n_jobs=-1)
    mlr_model.fit(X_train, y_train)

    # Undo the scaling on both arrays so the errors are measured on the
    # original CloseUSD values.
    predictions = scaler.inverse_transform(mlr_model.predict(X_test))
    y_test_actual = scaler.inverse_transform(y_test)

    print("n", n)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6
    print("\tRMSE: ", np.sqrt(mean_squared_error(y_test_actual, predictions)))
    print("\tR-squared: ", r2_score(y_test_actual, predictions))
    print("\tMAPE", mean_absolute_percentage_error(y_test_actual, predictions))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

# RMSE per index for every window length from 1 to 49, keyed by ticker
all_scores = {}

for name, df in stock_dfs.items():
    # Feature selection, scaling and the split are independent of the window
    # length, so they are done once per index rather than once per n.
    stock_data = pick_feature_columns(df)
    scaler, stock_data = standard_scale_data(stock_data)
    train_data, test_data = train_test_split(stock_data)

    scores = []
    for n in range(1, 50):
        X_train, y_train = dependent_independent_split(train_data, n)
        X_test, y_test = dependent_independent_split(test_data, n)

        mlr_model = LinearRegression(n_jobs=-1)
        mlr_model.fit(X_train, y_train)

        # Undo the scaling so the error is measured on the original CloseUSD values
        predictions = scaler.inverse_transform(mlr_model.predict(X_test))
        y_test_actual = scaler.inverse_transform(y_test)

        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6
        scores.append(np.sqrt(mean_squared_error(y_test_actual, predictions)))

    all_scores[name] = scores

all_scores

In [None]:
from sklearn.preprocessing import minmax_scale

fig = plt.figure()
ax = fig.add_subplot()

for score in all_scores.values():
    # Rescale each index's RMSE curve to [0, 1] so the curves share one y axis
    score = minmax_scale(score)
    # The first score belongs to a window of one value, so the x axis must
    # start at 1 (plotting the list alone would start it at 0).
    plt.plot(range(1, len(score) + 1), score)

ax.set_xlabel("Number of independent variables")
ax.set_ylabel("RMSE score")
plt.show()

### Multiple Linear Regression on All Stocks
This collects the mean of all the accuracy metrics for a multi-variable linear regression.

In [None]:
# Number of preceding closing prices used as features for each sample
NUMBER_DATA_POINTS = 90

def process_df(df):
    """Prepare one index's frame for modelling, without any scaling.

    Keeps the feature column, splits it by date and windows each part with
    NUMBER_DATA_POINTS lagged values.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, None)``; the trailing None
        stands in for a scaler, since none is fitted here.
    """
    features = pick_feature_columns(df)
    train_part, test_part = train_test_split(features)
    X_train, y_train = dependent_independent_split(train_part, NUMBER_DATA_POINTS)
    X_test, y_test = dependent_independent_split(test_part, NUMBER_DATA_POINTS)
    return X_train, X_test, y_train, y_test, None

In [None]:
# Fit and score one multiple linear regression per index, keyed by ticker
performances = {}
for name, df in stock_dfs.items():
    X_train, X_test, y_train, y_test, scaler = process_df(df)
    # Build the estimator explicitly: by this point the notebook has redefined
    # `train_model` to fit a Ridge model, which is not what this section measures.
    model = LinearRegression(n_jobs=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Pass on whatever scaler process_df returned (None for the unscaled
    # variant) instead of a hard-coded None, so scoring stays correct if
    # process_df is switched to a scaling version.
    scores = score_model(scaler, y_test, predictions)
    performances[name] = scores

# One row per index, one column per metric
performances_df = pd.DataFrame.from_dict(data=performances, orient='index')
performances_df

In [None]:
performances_df.mean()