## Data Preprocessing

In [141]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from tensorflow.keras.layers import Dropout, Dense, LSTM
from tensorflow.keras import Sequential

In [36]:
df = pd.read_csv("/content/tatasteel.csv")
print(df.shape)
df.head()

(8625, 7)


Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume
0,0,2021-08-23 14:11:00+05:30,1380.35,1381.0,1379.35,1380.25,15564
1,1,2021-08-23 14:12:00+05:30,1380.5,1381.0,1379.25,1379.8,28503
2,2,2021-08-23 14:13:00+05:30,1379.8,1380.55,1378.05,1378.8,29879
3,3,2021-08-23 14:14:00+05:30,1378.8,1378.8,1377.15,1378.0,16814
4,4,2021-08-23 14:15:00+05:30,1378.3,1378.7,1375.35,1376.25,36478


In [41]:
try:
  df["timestamp"] = pd.to_datetime(df["timestamp"],format = "%Y-%m-%d %H:%M:%S+05:30")
except:
  print("Column is in datetime format already")

In [43]:
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume
0,0,2021-08-23 14:11:00,1380.35,1381.0,1379.35,1380.25,15564
1,1,2021-08-23 14:12:00,1380.5,1381.0,1379.25,1379.8,28503
2,2,2021-08-23 14:13:00,1379.8,1380.55,1378.05,1378.8,29879
3,3,2021-08-23 14:14:00,1378.8,1378.8,1377.15,1378.0,16814
4,4,2021-08-23 14:15:00,1378.3,1378.7,1375.35,1376.25,36478


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8625 entries, 0 to 8624
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  8625 non-null   int64         
 1   timestamp   8625 non-null   datetime64[ns]
 2   open        8625 non-null   float64       
 3   high        8625 non-null   float64       
 4   low         8625 non-null   float64       
 5   close       8625 non-null   float64       
 6   volume      8625 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 471.8 KB


In [44]:
print(df["timestamp"].dt.date.unique().shape)
df["timestamp"].dt.date.unique()

(24,)


array([datetime.date(2021, 8, 23), datetime.date(2021, 8, 24),
       datetime.date(2021, 8, 25), datetime.date(2021, 8, 26),
       datetime.date(2021, 8, 27), datetime.date(2021, 8, 30),
       datetime.date(2021, 8, 31), datetime.date(2021, 9, 1),
       datetime.date(2021, 9, 2), datetime.date(2021, 9, 3),
       datetime.date(2021, 9, 6), datetime.date(2021, 9, 7),
       datetime.date(2021, 9, 8), datetime.date(2021, 9, 9),
       datetime.date(2021, 9, 13), datetime.date(2021, 9, 14),
       datetime.date(2021, 9, 15), datetime.date(2021, 9, 16),
       datetime.date(2021, 9, 17), datetime.date(2021, 9, 20),
       datetime.date(2021, 9, 21), datetime.date(2021, 9, 22),
       datetime.date(2021, 9, 23), datetime.date(2021, 9, 24)],
      dtype=object)

In [136]:
X = []
Y = []
for date in df["timestamp"].dt.date.unique():
  #dataframe of date in a particular date
  mini_df = df[df["timestamp"].dt.date == date]
  #Taking first 30 min data of a particular day
  X.append(np.array(mini_df.iloc[:30]["close"]))
  #Computing day sentiment based on begin, end values
  y_day = 0 # 0 represents negative sentiment(day beginning close > day ending close)
  if(mini_df.iloc[-1]["close"] > mini_df.iloc[0]["close"]):
    y_day = 1
  Y.append(y_day)

X = np.array(X)
Y = np.array(Y)
#Data time series has only one attribute - Close price
X_lstm = np.reshape(X,(X.shape[0],X.shape[1],1)) #(No of days,30,1)

In [137]:
x_train,x_test,y_train,y_test = train_test_split(X_lstm,Y,test_size = 0.2)

In [100]:
x_train.shape

(19, 30, 1)

In [102]:
y_train

array([0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1])

In [104]:
x_test.shape

(5, 30, 1)

# LSTM for Market Sentiment Classification

In [126]:
N_HIDDEN = 16
num_attr = 1 #Only close price
SEQ_LENGTH = 30

model = Sequential()
model.add(LSTM(N_HIDDEN, return_sequences=False, activation='relu', input_shape=(SEQ_LENGTH, num_attr)))
model.add(Dropout(0.2))
# model.add(LSTM(N_HIDDEN, return_sequences=False, activation='relu'))
# model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [127]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 16)                1152      
                                                                 
 dropout_6 (Dropout)         (None, 16)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,169
Trainable params: 1,169
Non-trainable params: 0
_________________________________________________________________


In [129]:
%time
model.compile(loss="binary_crossentropy", optimizer='adam')  
history = model.fit(x_train, y_train, epochs=20)
#model.fit(x_train, y_train, batch_size=50, nb_epoch=20, validation_split=0.05)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 9.3 µs
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [131]:
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: {:.2f}".format(scores*100))

Accuracy: 69.66


# XGBoost Classification

In [147]:
xgb = XGBClassifier()

In [148]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3)

In [149]:
xgb.fit(x_train, y_train)

XGBClassifier()

In [150]:
# make predictions for test data
y_pred = xgb.predict(x_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 75.00%
