# Time Series Analysis for Stock Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import requests
from datetime import datetime, timedelta # To use today's day for prediction 

## Loading real-time stock data using API 


Using API provided by AlphaVantage (opensource free api)

Reference: https://www.alphavantage.co/documentation/

Pulling realtime time series stock data from Api

Here company considered is Microsoft for analysis. You can change the company symbol in Api to get data for other companies to look at.
Note that the Company Array consists some symbols of companies to try.

In [2]:
company_symbols = ["MSFT","AAPL","IBM","F"]
API_KEY = "3C2JRD12NE8OF2W8"
url = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=MSFT&apikey="+ API_KEY +"&outputsize=full"
JSONContent = requests.get(url)

In [3]:
JSONContent.json()

{'Meta Data': {'1. Information': 'Daily Prices (open, high, low, close) and Volumes',
  '2. Symbol': 'MSFT',
  '3. Last Refreshed': '2020-04-03',
  '4. Output Size': 'Full size',
  '5. Time Zone': 'US/Eastern'},
 'Time Series (Daily)': {'2020-04-03': {'1. open': '155.1000',
   '2. high': '157.3800',
   '3. low': '152.1900',
   '4. close': '153.8300',
   '5. volume': '40873497'},
  '2020-04-02': {'1. open': '151.8600',
   '2. high': '155.4800',
   '3. low': '150.3600',
   '4. close': '155.2600',
   '5. volume': '49630735'},
  '2020-04-01': {'1. open': '153.0000',
   '2. high': '157.7500',
   '3. low': '150.8200',
   '4. close': '152.1100',
   '5. volume': '57969926'},
  '2020-03-31': {'1. open': '159.4000',
   '2. high': '164.7800',
   '3. low': '156.5600',
   '4. close': '157.7100',
   '5. volume': '77927186'},
  '2020-03-30': {'1. open': '152.4400',
   '2. high': '160.6000',
   '3. low': '150.0100',
   '4. close': '160.2300',
   '5. volume': '63420326'},
  '2020-03-27': {'1. open': '1

## Data Exploration in order to build Dataframe for analysis

In [4]:
# Extracting useful information from the response received using the API
stock_df = pd.DataFrame(JSONContent.json()['Time Series (Daily)'])

In [5]:
stock_df = stock_df.transpose()
stock_df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2020-04-03,155.1,157.38,152.19,153.83,40873497
2020-04-02,151.86,155.48,150.36,155.26,49630735
2020-04-01,153.0,157.75,150.82,152.11,57969926
2020-03-31,159.4,164.78,156.56,157.71,77927186
2020-03-30,152.44,160.6,150.01,160.23,63420326


The dataframe above consists of stock data from 20 years ago till today.

From that we will take today row aside to check how good we can predict today's stock value?

In [6]:
# Discard today's date as we will use this for final validation to check if our model is performing well in predicting future.... 

stock_on_today = stock_df.iloc[0]
print(type(stock_on_today))

today_date = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')
print(today_date)

stock_df = stock_df.drop(today_date)
stock_df.head()

<class 'pandas.core.series.Series'>
2020-04-02


Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2020-04-03,155.1,157.38,152.19,153.83,40873497
2020-04-01,153.0,157.75,150.82,152.11,57969926
2020-03-31,159.4,164.78,156.56,157.71,77927186
2020-03-30,152.44,160.6,150.01,160.23,63420326
2020-03-27,151.75,154.89,149.2,149.7,57042291


## Prepare data for prediction

In [7]:
# stock closing price is which is going to predict
Y = stock_df["4. close"]
X = stock_df[["1. open","2. high","3. low","5. volume"]]

In [8]:
import sklearn
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(X.values, Y.values, test_size = 0.25,
                                                                           random_state = 42)

In [9]:
# Training set
train_features

array([['61.0500', '61.1400', '60.2200', '20670700'],
       ['52.5300', '54.3200', '52.5000', '64633324'],
       ['30.2200', '30.5300', '30.2100', '33650000'],
       ...,
       ['34.6900', '35.1400', '34.3800', '58469100'],
       ['25.0700', '25.2500', '24.9100', '47956300'],
       ['60.1600', '60.4200', '59.9200', '26434697']], dtype=object)

In [10]:
# Testing set
test_features

array([['24.5200', '24.8600', '24.4000', '49708700'],
       ['27.9700', '28.0500', '27.3700', '48011800'],
       ['34.8500', '35.2900', '34.4500', '114655600'],
       ...,
       ['17.0300', '17.2200', '16.6000', '70710700'],
       ['32.7200', '32.7800', '32.5900', '32860200'],
       ['25.3600', '25.4900', '25.2600', '49104100']], dtype=object)

# Random Forest Model

In [11]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 10, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

## Calculating Error that a model made on validation data (from train_test_split)

In [12]:
import numpy as np

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Converting test_lables to float for computation of error
test_labels = test_labels.astype(float)

# for i in range(len(predictions)) : print(type(predictions[i]))
# for i in range(len(predictions)) : print(type(test_labels[i]))

# Calculate the absolute errors
# we can use numpy substract or can perform arthemetic computation on ndarray's
# errors = np.substract(predictions,test_labels)
errors = predictions - test_labels
print(errors)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

[-0.134   0.274   0.259  ...  0.045  -0.107   0.0355]
Mean Absolute Error: -0.01


### Looking at importances of features

In [14]:
sorted(zip(X.columns, rf.feature_importances_),
        key=lambda x: x[1], reverse=True)

[('3. low', 0.5395612438495887),
 ('2. high', 0.2989852165332637),
 ('1. open', 0.16141103095427178),
 ('5. volume', 4.2508662875829294e-05)]

## Finally Predicting today's closing stock to test our model on today's stock

In [13]:
## test latest date 10/2/2019 using our model 

df_today = pd.DataFrame(stock_on_today)
df_today = df_today.transpose()

predictionStock_today = rf.predict(df_today[["1. open","2. high","3. low","5. volume"]].values)

actualStock_today = df_today["4. close"]

print("Predicted stock on 2nd April 2020 is -> ",predictionStock_today)
actualStock_today = actualStock_today.astype(float).values
print("Actual stock on 2nd April 2020 is  ->  ",actualStock_today)

error = np.subtract(predictionStock_today,actualStock_today)
print(error[0])

Predicted stock on 2nd April 2020 is ->  [153.88]
Actual stock on 2nd April 2020 is  ->   [153.83]
0.049999999999954525


# Conclusion

Here, we see that the Random Forest helps us predict very close to the actual future value in case of stock!