In [2]:
# Linear Regression
# Humberto Barrantes
# 09-2020

# Imports

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime
from sklearn.model_selection import train_test_split

import plotly.graph_objects as go

# Load the datasets

In [5]:
# Read the three index histories (Dow Jones, Nasdaq, S&P) from the local
# data/ directory, in that order, into one DataFrame each.
dowjones, nasdaq, sandp = (
    pd.read_csv(csv_path)
    for csv_path in ("data/DowJones.csv", "data/Nasdaq.csv", "data/S&P.csv")
)

In [6]:
# Preview the first rows of the Dow Jones frame to check the loaded columns.
dowjones.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,10430.69043,10604.969727,10430.69043,10583.959961,10583.959961,179780000
1,2010-01-05,10584.55957,10584.55957,10522.519531,10572.019531,10572.019531,188540000
2,2010-01-06,10564.719727,10594.990234,10546.549805,10573.679688,10573.679688,186040000
3,2010-01-07,10571.110352,10612.370117,10505.209961,10606.860352,10606.860352,217390000
4,2010-01-08,10606.400391,10619.400391,10554.330078,10618.19043,10618.19043,172710000


# Convert the dates to numeric

In [7]:
# Give every index frame a parsed 'Date' column and an integer 'Gregorian'
# column (the proleptic Gregorian ordinal of each date) usable as a numeric
# regression feature. The frames are modified in place.
for frame in (dowjones, nasdaq, sandp):
    parsed_dates = pd.to_datetime(frame['Date'])

    frame['Date'] = parsed_dates
    frame['Gregorian'] = parsed_dates.map(datetime.datetime.toordinal)

In [8]:
# Summary statistics of the Dow Jones columns, now including 'Gregorian'.
dowjones.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Gregorian
count,2690.0,2690.0,2690.0,2690.0,2690.0,2690.0,2690.0
mean,18147.259709,18241.705598,18047.207977,18151.39504,18151.39504,206010500.0,735725.342751
std,5430.726541,5454.521915,5403.064637,5428.63082,5428.63082,134057300.0,1127.298153
min,9686.480469,9770.870117,9614.320313,9686.480469,9686.480469,8410000.0,733776.0
25%,13115.889893,13190.109619,13073.642578,13119.287354,13119.287354,105050000.0,734747.0
50%,17452.915039,17561.044922,17344.225586,17462.735352,17462.735352,162395000.0,735725.5
75%,23525.508301,23601.384278,23353.902832,23512.532227,23512.532227,281537500.0,736701.25
max,29440.470703,29568.570313,29406.75,29551.419922,29551.419922,2190810000.0,737677.0


# Correlation?

In [9]:
# Pairwise correlation between the numeric S&P columns.
# The numeric columns are selected explicitly because 'Date' is a datetime
# column at this point and recent pandas versions no longer drop non-numeric
# columns from DataFrame.corr() silently.
corr = sandp.select_dtypes(include='number').corr()

# Styler.set_precision() was deprecated and later removed from pandas;
# Styler.format() with a two-decimal format string renders the same table
# on both old and current pandas versions.
corr.style.background_gradient(cmap='plasma').format("{:.2f}")

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Gregorian
Open,1.0,1.0,1.0,1.0,1.0,-0.08,0.98
High,1.0,1.0,1.0,1.0,1.0,-0.07,0.98
Low,1.0,1.0,1.0,1.0,1.0,-0.09,0.98
Close,1.0,1.0,1.0,1.0,1.0,-0.08,0.98
Adj Close,1.0,1.0,1.0,1.0,1.0,-0.08,0.98
Volume,-0.08,-0.07,-0.09,-0.08,-0.08,1.0,-0.04
Gregorian,0.98,0.98,0.98,0.98,0.98,-0.04,1.0


# Build the combined dataset

In [38]:

# Start the combined frame from the Dow Jones calendar columns.
data = pd.DataFrame({column: dowjones[column] for column in ('Date', 'Gregorian')})

# Polynomial terms of the date ordinal (squared and cubed).
for exponent in (2, 3):
    data[f'Gregorian_{exponent}'] = np.power(data['Gregorian'], exponent)

# Per-row mean of the three indices for each price/volume column.
# NOTE(review): the addition aligns rows by index label, not by 'Date'; this
# assumes the three files list the same trading days in the same order - confirm.
for column in ('Open', 'High', 'Low', 'Close', 'Volume'):
    data[column] = (dowjones[column] + nasdaq[column] + sandp[column]) / 3





In [39]:
fig = go.Figure()

# One line trace per price column, both plotted against the shared 'Date' column.
for price_column in ('Open', 'Close'):
    fig.add_trace(
        go.Scatter(x=data['Date'], y=data[price_column], mode='lines', name=price_column)
    )

# Logarithmic y axis.
fig.update_yaxes(type="log")

fig.update_layout(title="Daily Average Open and Close market prices", xaxis_title="Date", yaxis_title="Dollars")

fig.show()

In [42]:
# Rows whose 'Date' falls strictly before the '2020/02/01' cutoff.
is_before_cutoff = data['Date'] < '2020/02/01'
before_covid = data.loc[is_before_cutoff]

# Train Model

In [43]:
# Features: the date ordinal and its square and cube. Target: the averaged close.
feature_columns = ['Gregorian', 'Gregorian_2', 'Gregorian_3']
X = before_covid.loc[:, feature_columns]
y = before_covid.loc[:, 'Close']

In [44]:
# Fit a linear regression of the averaged close on the polynomial date terms.
gregorian_model = LinearRegression().fit(X, y)

# Score is computed on the same X, y the model was fitted on (no held-out split).
final_score = gregorian_model.score(X, y)
print(f"Final Score: {final_score}")

Final Score: 0.9709920333716154


In [45]:
# Predict on the training features; X is built from before_covid, so the result
# has one value per row of before_covid (not per row of data).
gregorian_predictions = gregorian_model.predict(X)

In [54]:
fig = go.Figure()

# Observed averaged close over the full date range.
fig.add_trace(go.Scatter(x=data['Date'], y=data['Close'], mode='lines', name="Real data"))

# gregorian_predictions holds one value per row of before_covid (the model
# predicted on X, which is taken from before_covid). Pair it with
# before_covid's own dates so x and y always have the same length and stay
# aligned, instead of relying on the longer data['Date'] column being truncated.
fig.add_trace(go.Scatter(x=before_covid['Date'], y=gregorian_predictions, mode='lines', name="Predictions"))

# Styling shared by the shaded date bands: full plot height (paper-relative
# y from 0 to 1), drawn below the traces, semi-transparent, no outline.
band_style = dict(
    type="rect",
    xref="x",
    yref="paper",
    y0=0,
    y1=1,
    opacity=0.5,
    layer="below",
    line_width=0,
)

# NOTE(review): the notebook does not say what the two highlighted periods
# represent - consider documenting them next to the figure.
shaded_bands = [
    dict(
        band_style,
        x0=datetime.date(2009, 1, 20),
        x1=datetime.date(2017, 1, 20),
        fillcolor="LightSeaGreen",
    ),
    dict(
        band_style,
        x0=datetime.date(2020, 1, 20),
        x1=datetime.date(2022, 1, 20),
        fillcolor="LightSalmon",
    ),
]

fig.update_layout(
    title="Daily Market Close 2010 to 2020",
    xaxis_title="Date",
    yaxis_title="Close",
    shapes=shaded_bands,
)

fig.show()