# Stock Prediction Algorithm
Applying machine learning to S&P 500 index data obtained from Yahoo Finance.

In [1]:
# Load the raw S&P 500 price history from the CSV export
import pandas as pd

# Read the file and, in the same expression, replace the textual Date
# column with pandas datetime values so it can be compared and sorted.
data = pd.read_csv('YAHOO-INDEX_GSPC.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
print(data.head())

        Date         Open         High          Low        Close  \
0 2017-03-03  2380.919922  2383.889893  2375.389893  2383.120117   
1 2017-03-02  2394.750000  2394.750000  2380.169922  2381.919922   
2 2017-03-01  2380.129883  2400.979980  2380.129883  2395.959961   
3 2017-02-28  2366.080078  2367.790039  2358.959961  2363.639893   
4 2017-02-27  2365.229980  2371.540039  2361.870117  2369.729980   

         Volume  Adjusted Close  
0  3.555260e+09     2383.120117  
1  3.821320e+09     2381.919922  
2  4.345180e+09     2395.959961  
3  4.210140e+09     2363.639893  
4  3.582610e+09     2369.729980  


In [2]:
# Datetime comparison: build a boolean mask that is True for every row
# dated strictly after 1 April 2015.
from datetime import datetime

dates_later_2015 = data['Date'].gt(datetime(2015, 4, 1))
print(dates_later_2015)

0         True
1         True
2         True
3         True
4         True
5         True
6         True
7         True
8         True
9         True
10        True
11        True
12        True
13        True
14        True
15        True
16        True
17        True
18        True
19        True
20        True
21        True
22        True
23        True
24        True
25        True
26        True
27        True
28        True
29        True
         ...  
16871    False
16872    False
16873    False
16874    False
16875    False
16876    False
16877    False
16878    False
16879    False
16880    False
16881    False
16882    False
16883    False
16884    False
16885    False
16886    False
16887    False
16888    False
16889    False
16890    False
16891    False
16892    False
16893    False
16894    False
16895    False
16896    False
16897    False
16898    False
16899    False
16900    False
Name: Date, dtype: bool


In [3]:
# Sort the dataframe on the Date column in ascending order (oldest first).
# DataFrame.sort() was deprecated and later removed from pandas;
# sort_values() is the supported replacement with the same semantics here.
data = data.sort_values(by='Date', ascending=True)
# Renumber rows 0..n-1 so positional and label indexing agree downstream.
# The previous row labels are kept in a new 'index' column.
data = data.reset_index()
print(data.head())

   index       Date   Open   High    Low  Close     Volume  Adjusted Close
0  16900 1950-01-03  16.66  16.66  16.66  16.66  1260000.0           16.66
1  16899 1950-01-04  16.85  16.85  16.85  16.85  1890000.0           16.85
2  16898 1950-01-05  16.93  16.93  16.93  16.93  2550000.0           16.93
3  16897 1950-01-06  16.98  16.98  16.98  16.98  2010000.0           16.98
4  16896 1950-01-09  17.08  17.08  17.08  17.08  2520000.0           17.08


  from ipykernel import kernelapp as app


In [None]:
# Computing additional values
import numpy as np

# For every row from the sixth onward, compute the mean and the population
# standard deviation (numpy default, ddof=0) of the closing price over the
# five rows that precede it. The current row is excluded from its own
# window. The first five rows have no full window and are skipped, so the
# resulting arrays hold len(data) - 5 values.
window_size = 5
close = data['Close']  # hoisted: the column does not change inside the loop

mean_price = []
std_prices = []
for i in range(window_size, len(close)):
    window = close.iloc[i - window_size:i]
    mean_price.append(window.mean())
    std_prices.append(np.std(window))

mean_prices = np.asarray(mean_price)
# Convert the accumulated list, not the last scalar value: the original
# passed `std_price` here and discarded every deviation but the final one.
std_prices = np.asarray(std_prices)

In [None]:
# Inspect the last five rows of the frame (tail() defaults to n=5).
print(data.tail(n=5))

In [None]:
# Divide the data
# Keep only rows dated after 1951-01-02 and drop any row with a missing
# value. dropna() is chained rather than applied in place: calling it with
# inplace=True on a filtered slice triggers SettingWithCopyWarning.
data = data[data['Date'] > datetime(year=1951, month=1, day=2)].dropna(axis=0)

# Generate 2 dataframes split chronologically at 2013-01-01.
# The test side uses >= so a row dated exactly on the split date is not
# silently dropped from both sets.
split_date = datetime(year=2013, month=1, day=1)
train = data[data['Date'] < split_date]
test = data[data['Date'] >= split_date]

In [None]:
# Applying the Machine Learning algorithm using the mean absolute error metric
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# Columns used as model inputs. Selecting several columns requires a list:
# train['Volume', 'High', 'Low'] is a tuple key and raises KeyError.
# NOTE(review): Volume, High and Low come from the same row (same day) as
# the Close being predicted, so this looks like look-ahead leakage; confirm
# whether lagged features were intended instead.
feature_columns = ['Volume', 'High', 'Low']

# Create the linear model and fit it on the training period
model = LinearRegression()
model.fit(train[feature_columns], train['Close'])

# Evaluate on the held-out period; arguments follow the (y_true, y_pred) order
predictions = model.predict(test[feature_columns])
error = mean_absolute_error(test['Close'], predictions)
print(error)