<a href="https://colab.research.google.com/github/akhilsrinath/ml-poc/blob/master/Stock_model_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection

First we want to get the data that we are going to work with

In [1]:
!pip install yfinance

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/a7/ee/315752b9ef281ba83c62aa7ec2e2074f85223da6e7e74efb4d3e11c0f510/yfinance-0.1.59.tar.gz
Collecting lxml>=4.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/cf/4d/6537313bf58fe22b508f08cf3eb86b29b6f9edf68e00454224539421073b/lxml-4.6.3-cp37-cp37m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 11.8MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.59-py2.py3-none-any.whl size=23442 sha256=86541229b7a6dda69bfc0fcdf78a1863a31e45d2ed19132f8c19b060c9aa4023
  Stored in directory: /root/.cache/pip/wheels/f8/2a/0f/4b5a86e1d52e451757eb6bc17fd899629f0925c777741b6d04
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfull

In [2]:
# Import section
import yfinance as yf
import pandas as pd

In [3]:
def get_original_data(symbol):
    # Get the data of the corresponding symbol (Company info, current price, etc)
    stock_info = yf.Ticker(symbol)
    print(stock_info)

    # We want to get the historical stock information as a DataFrame
    # -Period: you can use start-end or how far back in time do you want to get the data
    # -Interval: time between observations
    stock_df = stock_info.history(period='7d', interval='1m')

    # We don't want to use the Dividends and Stock Splits, so we delete them
    del stock_df['Dividends']
    del stock_df['Stock Splits']
    return stock_df


In [4]:
# Creation of the labels we want to classify the problem in
# if the Label has a value of 1, means that the next price is going to go up, so we want to buy the stock
# if the Label has a value of 0, means that the next price is going to go down, so we want to sell the stock
def get_labels(stock_df):
    shifted_data = stock_df['Close'].shift(-1)
    labels = []
    for close, shifted in zip(stock_df['Close'], shifted_data):
        if shifted >= close:
            labels.append(1)
        else:
            labels.append(0)
    stock_df['Labels'] = labels
    return(stock_df)


In [5]:
if __name__ == '__main__':

    # We need to define the symbol we want to get, lets start working with S&P 500 
    symbol = 'SPY'
    stock_df = get_original_data(symbol)
    stock_df = get_labels(stock_df)

    # More info about the data
    print('We have a total of',stock_df.shape[0], 'observations')
    print('with',stock_df.shape[1], 'features each wich are', stock_df.keys())

    df = stock_df

yfinance.Ticker object <SPY>
We have a total of 2726 observations
with 6 features each wich are Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Labels'], dtype='object')


In [6]:
stock_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Labels
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-23 09:30:00-04:00,391.910004,391.950012,391.730011,391.755005,1541884,1
2021-03-23 09:31:00-04:00,391.76001,391.940002,391.559998,391.850006,334659,0
2021-03-23 09:32:00-04:00,391.850006,391.880005,391.630005,391.640015,306343,1
2021-03-23 09:33:00-04:00,391.640015,392.0,391.630005,391.918915,237583,0
2021-03-23 09:34:00-04:00,391.915009,391.970001,391.809998,391.880005,155247,1


In [7]:
import pandas as pd 
import numpy as np 
from sklearn import preprocessing 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Labels
count,2726.0,2726.0,2726.0,2726.0,2726.0,2726.0
mean,392.630086,392.730145,392.530787,392.631211,225207.4,0.517975
std,3.117306,3.095557,3.136853,3.118438,316570.7,0.499768
min,384.069885,384.200012,383.899994,384.065002,22357.0,0.0
25%,391.049988,391.179993,390.959991,391.056244,97063.0,0.0
50%,392.670959,392.75,392.595001,392.679443,148407.5,1.0
75%,394.727531,394.820007,394.637512,394.725006,245584.0,1.0
max,397.980011,398.0,397.940002,397.970001,7277114.0,1.0


In [9]:
df.isnull().any()

Open      False
High      False
Low       False
Close     False
Volume    False
Labels    False
dtype: bool

In [10]:
X = df.loc[:, df.columns!='Labels']
X

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-03-23 09:30:00-04:00,391.910004,391.950012,391.730011,391.755005,1541884
2021-03-23 09:31:00-04:00,391.760010,391.940002,391.559998,391.850006,334659
2021-03-23 09:32:00-04:00,391.850006,391.880005,391.630005,391.640015,306343
2021-03-23 09:33:00-04:00,391.640015,392.000000,391.630005,391.918915,237583
2021-03-23 09:34:00-04:00,391.915009,391.970001,391.809998,391.880005,155247
...,...,...,...,...,...
2021-03-31 15:55:00-04:00,396.690002,396.690002,396.244995,396.369995,2994314
2021-03-31 15:56:00-04:00,396.369995,396.709991,396.329987,396.559998,2319348
2021-03-31 15:57:00-04:00,396.549988,396.679993,396.434998,396.540009,2693961
2021-03-31 15:58:00-04:00,396.535004,396.750000,396.369995,396.730011,3341823


In [11]:
X = np.array(X)

In [12]:
y = np.array(df['Labels'])
y

array([1, 0, 1, ..., 1, 0, 0])

## **Logistic regression model**

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
model = LogisticRegression( random_state=0)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
model.score(X, y)

0.5179750550256786

In [20]:
model.score(X_test, y_test)

0.5146520146520146

## **Random Froest model**

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth = 3, random_state=0)
clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [24]:
clf.score(X_train, y_train)

0.5637614678899082

In [23]:
clf.score(X_test, y_test)

0.5164835164835165

## **K-Nearest Neighbors Classifier**

In [25]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [26]:
knn.score(X,y)

0.7002934702861335

In [27]:
knn.score(X_test, y_test)

0.5073260073260073