In [1]:
# Author: Abdulmenaf Altintas

# Machine learning models: Linear regression model with scikit-learn

In [2]:
import numpy as np
import math as mt
import pandas as pd
import quandl

import warnings
# NOTE(review): with no category or module argument this filter silences every
# warning for the rest of the notebook, including any raised by pandas or
# scikit-learn in the cells below.
warnings.filterwarnings('ignore')

In [3]:
# We will build a model that predicts Tesla (TSLA) stock prices.
# Fetch the "WIKI/TSLA" dataset through the quandl client; the result is kept
# in `df` and used by the following cells.
df = quandl.get("WIKI/TSLA")

# Preview the first rows of the downloaded data.
print(df.head())

             Open     High    Low  Close      Volume  Ex-Dividend  \
Date                                                                
2010-06-29  19.00  25.0000  17.54  23.89  18766300.0          0.0   
2010-06-30  25.79  30.4192  23.30  23.83  17187100.0          0.0   
2010-07-01  25.00  25.9200  20.27  21.96   8218800.0          0.0   
2010-07-02  23.00  23.1000  18.71  19.20   5139800.0          0.0   
2010-07-06  20.00  20.0000  15.83  16.11   6866900.0          0.0   

            Split Ratio  Adj. Open  Adj. High  Adj. Low  Adj. Close  \
Date                                                                  
2010-06-29          1.0      19.00    25.0000     17.54       23.89   
2010-06-30          1.0      25.79    30.4192     23.30       23.83   
2010-07-01          1.0      25.00    25.9200     20.27       21.96   
2010-07-02          1.0      23.00    23.1000     18.71       19.20   
2010-07-06          1.0      20.00    20.0000     15.83       16.11   

            Adj. V

In [4]:
# Keep only the split/dividend-adjusted price and volume columns as the
# starting features.
#
# .copy() gives `df` its own data instead of leaving it as a selection of the
# downloaded frame. The next cell assigns new columns into `df`; doing that on
# a bare selection is what pandas reports as SettingWithCopyWarning.
df = df[['Adj. Open',  'Adj. Close', 'Adj. Low', 'Adj. High', 'Adj. Volume']].copy()
print(df.head())

            Adj. Open  Adj. Close  Adj. Low  Adj. High  Adj. Volume
Date                                                               
2010-06-29      19.00       23.89     17.54    25.0000   18766300.0
2010-06-30      25.79       23.83     23.30    30.4192   17187100.0
2010-07-01      25.00       21.96     20.27    25.9200    8218800.0
2010-07-02      23.00       19.20     18.71    23.1000    5139800.0
2010-07-06      20.00       16.11     15.83    20.0000    6866900.0


In [5]:
# Derive two relative features that may help the model more than raw prices:
#   open_close_percent - change from open to close, as a percentage of the open
#   high_low_percent   - high-to-low range, as a percentage of the high
#
# assign() returns a new frame rather than writing columns into `df` in place,
# so nothing is assigned into a selection of another frame and re-running this
# cell produces the same result.
df = df.assign(
    open_close_percent=(df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0,
    high_low_percent=(df['Adj. High'] - df['Adj. Low']) / df['Adj. High'] * 100.0,
)

# 'high_low_percent' stands in for 'Adj. High' and 'Adj. Low', and 'Adj. Open'
# is left out. .copy() makes df_new an independent frame, because the next
# cell fills, extends and filters it.
df_new = df[['open_close_percent', 'high_low_percent', 'Adj. Close', 'Adj. Volume']].copy()

print(df_new.head())

            open_close_percent  high_low_percent  Adj. Close  Adj. Volume
Date                                                                     
2010-06-29           25.736842         29.840000       23.89   18766300.0
2010-06-30           -7.599845         23.403640       23.83   17187100.0
2010-07-01          -12.160000         21.797840       21.96    8218800.0
2010-07-02          -16.521739         19.004329       19.20    5139800.0
2010-07-06          -19.450000         20.850000       16.11    6866900.0


In [6]:
# Build the regression target ("label"): the adjusted close `prediction` rows
# after the current row.

# Sentinel written into missing feature values before the label is created.
MISSING_VALUE = -10**4
# The forecast horizon is this fraction of the number of rows, rounded up.
FORECAST_FRACTION = 0.01
prediction = int(mt.ceil(FORECAST_FRACTION * len(df)))

# Each step rebinds df_new to a new frame instead of using inplace=True, so no
# frame obtained by column selection is mutated in place.
df_new = df_new.fillna(value=MISSING_VALUE)

# define the label: shifting by -prediction pulls the future close onto the
# current row; the last `prediction` rows have no future value and become NaN.
df_new = df_new.assign(label=df_new["Adj. Close"].shift(-prediction))

# Drop the rows without a label.
# NOTE: re-running this cell without re-running the previous one trims another
# `prediction` rows, because df_new has already been shortened.
df_new = df_new.dropna()
print(df_new.head())



            open_close_percent  high_low_percent  Adj. Close  Adj. Volume  \
Date                                                                        
2010-06-29           25.736842         29.840000       23.89   18766300.0   
2010-06-30           -7.599845         23.403640       23.83   17187100.0   
2010-07-01          -12.160000         21.797840       21.96    8218800.0   
2010-07-02          -16.521739         19.004329       19.20    5139800.0   
2010-07-06          -19.450000         20.850000       16.11    6866900.0   

            label  
Date               
2010-06-29  20.72  
2010-06-30  20.35  
2010-07-01  19.94  
2010-07-02  20.92  
2010-07-06  21.95  


In [7]:
# import scikit-learn modules

from sklearn.model_selection import train_test_split
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [8]:
# Feature matrix X: the four predictor columns of df_new, one row per date.
feature_columns = ['open_close_percent', 'high_low_percent', 'Adj. Close', 'Adj. Volume']
X = np.array(df_new.loc[:, feature_columns])

# Target vector y: the 'label' column of df_new.
y = np.array(df_new.loc[:, 'label'])

# Show the first two samples of each as a quick sanity check.
print(X[:2])
print(y[:2])

[[ 2.57368421e+01  2.98400000e+01  2.38900000e+01  1.87663000e+07]
 [-7.59984490e+00  2.34036398e+01  2.38300000e+01  1.71871000e+07]]
[20.72 20.35]


In [9]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore the reported score, reproducible across runs.
# NOTE(review): the rows are shuffled before splitting, so training and test
# rows are interleaved in time - confirm that this is acceptable for a
# price-forecasting evaluation.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Standardize every feature to zero mean and unit variance. This is the same
# transformation preprocessing.scale applies; it is not a rescale to [-1, +1].
# The scaler is fitted on the training rows only, so no statistics of the
# held-out rows leak into training.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep X itself in scaled form for any later use, now with the training-set
# statistics instead of statistics computed over all rows.
X = scaler.transform(X)

In [10]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
# (The variable keeps the name `classifier`, although the model is a regressor.)
classifier = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split; for a regressor, score() reports R^2.
score = classifier.score(X_test, y_test)
print(score)

0.9631461078032267
