# Stock Market Prediction

# I. Packages

In [None]:
# importing packages
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
import utils as ut
from importlib import reload
# re-execute the already imported local `utils` module; presumably so that edits
# to it are picked up without restarting the kernel
reload(ut)

In [None]:
# Relax pandas' display settings so frames are shown without truncation:
# no cap on the number of columns/rows, width left unset (None), and up to
# 999 characters per cell.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', None),
    ('display.max_colwidth', 999),
):
    pd.set_option(option_name, option_value)

In [None]:
# Seed NumPy's global random generator so that NumPy-based randomness is
# repeatable between runs (uses the `np` alias imported at the top instead of
# importing a second, generically named `seed` into the notebook namespace).
# NOTE(review): this only seeds NumPy; the Keras backend may keep its own random
# state (weight initialisation, shuffling) - confirm whether it needs seeding
# too for fully reproducible training.
np.random.seed(5)

# II. Data Exploration
## II.1. Selecting a company ticker
Find the ticker symbol of the company whose closing stock price we want to predict. <br>

**dataset**: US Stocks & ETFs - Tickers, Company Info, Logos <br>
- In this dataset we can search for companies, whose stock price we want to predict. For example, I am going to select a company based on its industry.

In [None]:
# Load the company catalogue and discard the columns that are of no use when
# looking up a ticker.
unused_columns = ['website', 'logo', 'ceo']
df_us_stocks = pd.read_csv('data/companies.csv').drop(columns=unused_columns)

# print a summary of the remaining columns
df_us_stocks.info()

In [None]:
# peek at 5 randomly drawn rows of the catalogue
df_us_stocks.sample(n=5)

In [None]:
# distinct values found in the 'industry' column
pd.unique(df_us_stocks['industry'])

In [None]:
# TOP 5 companies of the 'Education' industry, largest market cap first
education_companies = df_us_stocks.loc[df_us_stocks['industry'] == 'Education']
education_companies.sort_values(by='market cap', ascending=False).head(5)

In [None]:
# the company to model, and its short name taken from the first catalogue row
# whose ticker matches
selected_company_ticker = 'TAL'
ticker_matches = df_us_stocks['ticker'] == selected_company_ticker
company_name = df_us_stocks.loc[ticker_matches, 'short name'].iloc[0]

## II.2. Explore the Stock Market Dataset
**dataset:** Huge Stock Market Dataset<br>

Don't forget that there are some days where we have no data - **stock market was closed** on these days.

In [None]:
# Read in the price data of the selected company for a given date interval
zip_file_path = 'data/huge_stock_market_data.zip'
# company ticker format should be: companyticker.us.txt (lowercase);
# it is derived from selected_company_ticker so that the company looked up in the
# catalogue and the price file that gets loaded cannot drift apart
company_ticker = selected_company_ticker.lower() + '.us.txt'
date_interval = ['2014-01-01', '2016-12-31']

df_stock_market = ut.read_huge_market_stock_data(zip_file_path, company_ticker, date_interval)
# peek at 5 randomly drawn rows
df_stock_market.sample(5)

In [None]:
# print a summary (columns, dtypes, non-null counts) of the loaded price data
df_stock_market.info()

### II.2.1. Closing Price History

In [None]:
# plot the 'Close' column; the company name is passed along (presumably used as a label/title)
ut.plot_closing_price_history(df_stock_market['Close'], company_name)

### II.2.2. Exponential Moving Average (EMA)

In [None]:
# Exponential moving average (span of 60 rows) of the closing price.
# The column is selected by name, as in every other cell of this notebook, instead
# of by position, so the result does not depend on the column order of the frame.
# adjust=False computes the average recursively.
df_stock_market['EMA_60'] = df_stock_market['Close'].ewm(span=60, adjust=False).mean()

In [None]:
# plot the closing price together with the 'EMA_60' column
ut.plot_closing_price_and_EMA(df_stock_market['Close'], df_stock_market['EMA_60'])

# III. Preparing data for LSTM

**1. Converting the data into array**

In [None]:
# keep a single-column frame holding only the closing prices
dataset = df_stock_market[['Close']]

# 2D numpy array of the same values (one row per trading day, one column)
data = dataset.to_numpy()

**2. Scaling the data.**

In [None]:
# Min-max scale the closing prices into the [0, 1] range; the fitted scaler is
# kept because it is needed later to map predictions back to prices.
# NOTE(review): the scaler is fitted on the full series, i.e. before the
# train/test split, so values from the test period influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(data)
scaled_data = scaler.transform(data)

**3. creating train & test data**

Each sample in the train and test sets is built from a moving (sliding) window over the closing prices. 

**inputs (x)**: the closing prices of the past 60 days<br>
**target (y)**: the closing price on the 61st day

In [None]:
# Build the train/test inputs and targets from the raw and the scaled data.
# NOTE(review): .7 is presumably the share of rows used for training and 60 the
# length of the look-back window - confirm against ut.create_train_test_split.
# The training data length is stored in a separate variable, as it is needed for plotting the results later.
training_data_len, x_train, y_train, x_test, y_test = ut.create_train_test_split(.7, 60, data, scaled_data)

**4. preparing data sets for LSTM**

LSTM is expecting a 3D dataset in the following format: [number of samples, number of time steps, and number of features]

In [None]:
# convert both input sets to 3D arrays with the same helper
x_train, x_test = (ut.create_3d_arrays(inputs) for inputs in (x_train, x_test))


# IV. Modelling
Create the predictive model using LSTM.

In [None]:
# Stacked LSTM regressor: two 50-unit LSTM layers, a 25-unit dense layer and a
# single-unit output layer.
timesteps = x_train.shape[1]
model = Sequential([
    # returns the whole sequence so that it can feed the next LSTM layer
    LSTM(units=50, return_sequences=True, input_shape=(timesteps, 1)),
    # returns only its last output
    LSTM(units=50, return_sequences=False),
    Dense(units=25),
    Dense(units=1),
])

In [None]:
# configure training: Adam optimizer minimising the mean squared error
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [None]:
# fit the model on the training data: 5 epochs, batches of 10 samples
model.fit(x=x_train, y=y_train, epochs=5, batch_size=10)

# V. Prediction

In [None]:
# model output for the test inputs (still on the normalized scale)
scaled_predictions = model.predict(x_test)

# map the normalized values back to real prices
predictions = scaler.inverse_transform(scaled_predictions)

In [None]:
# Root mean squared error between the predicted and the actual test prices.
# Both arrays are flattened first: predictions has shape (n, 1), and if y_test
# were 1-D with shape (n,), the subtraction would silently broadcast to an (n, n)
# matrix and produce a wrong RMSE.
# NOTE(review): assumes y_test holds unscaled prices - confirm in
# ut.create_train_test_split.
rmse = np.sqrt(np.mean((np.ravel(predictions) - np.ravel(y_test)) ** 2))
print('RMSE: ', rmse)

In [None]:
# plot the real and the predicted values
# NOTE(review): training_data_len presumably marks where the test period starts
# within `dataset` - confirm in ut.plot_result
ut.plot_result(dataset, training_data_len, predictions)