# Stock Market Movement using Transformers

Here, we analyse how world news affects the US stock market using NLP.
Note that the dataset contains the top 25 headlines for each day and whether the DJIA went up or down that day, i.e. positive or negative sentiment.


In [1]:


import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)



Label "1" means the DJIA Adj Close value rose or stayed the same.
Label "0" means the DJIA Adj Close value decreased.
                                                                                             

In [2]:
import pandas as pd
 
# Load the combined headlines / DJIA label table from the Kaggle dataset.
csv_path = '/kaggle/input/stocknews/Combined_News_DJIA.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check that the CSV loaded as expected.
df.head()

In [4]:
# Show column dtypes and non-null counts for the loaded frame.
df.info()

# Preprocessing the data 

In [5]:
# The date is not used by any later cell, so remove it before building the text input.
df = df.drop(columns=['Date'])
df.head()

In [6]:
# Keep a handle on the remaining column index and print it for reference.
df_columns = df.columns
print(df_columns)

In [12]:
# Headline columns merged into the model's text input; only the top headline is used here.
columns = ['Top1']

In [13]:
def _join_headlines(row):
    """Join one row's selected headline cells into a single '.'-separated string."""
    return '.'.join(row.values.astype(str))


# Build the text column that the rest of the notebook analyses and encodes.
df['combined_news'] = df[columns].apply(_join_headlines, axis=1)

In [14]:
# The source headline column(s) now live in combined_news, so drop the originals.
df = df.drop(columns=columns)

In [15]:
# Inspect the frame after replacing the selected headline column(s) with combined_news.
df.head()

In [18]:
# Preview the frame once more before the analysis section.
df.head()

# Data Analysis 



In [21]:
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline
import seaborn as sns
sns.set()

# Class balance: number of days per Label value, coloured by the same label.
ax = sns.countplot(data=df, x='Label', hue='Label')

Finding out the most common words when DJIA goes up or down


In [67]:
from collections import Counter

# Independent copies of the rows for each label, so cleaning them later does not touch df.
is_up_day = df['Label'] == 1
is_down_day = df['Label'] == 0
data_djia_up = df[is_up_day].copy()
data_djia_down = df[is_down_day].copy()

In [23]:
# Print the first two rows labelled 1 (DJIA up or unchanged).
print(data_djia_up[:2])

In [24]:
# Print the first two rows labelled 0 (DJIA down).
print(data_djia_down[:2])

# Cleaning the data and removing stopwords

In [25]:
import string
# Show the set of characters that the cleaning function below strips out.
print(string.punctuation)

In [26]:
from nltk.corpus import stopwords


In [28]:
def punctuation_stopwords_removal(news_article, stop_words=None):
    """Strip punctuation, lowercase, and drop stop words from one news string.

    Args:
        news_article: Raw headline text (a ``str``).
        stop_words: Optional iterable of lowercase words to discard. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        list[str]: The remaining lowercase tokens, in their original order.
    """
    if stop_words is None:
        # Load the list once per call; the original re-read it for every word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks instead of scanning a list.
    stop_words = frozenset(stop_words)

    # Delete every character in string.punctuation, then split on whitespace.
    without_punctuation = news_article.translate(
        str.maketrans('', '', string.punctuation)
    )
    lowered_words = (word.lower() for word in without_punctuation.split())
    return [word for word in lowered_words if word not in stop_words]

In [72]:
# Replace each "up" day's text with its cleaned token list, and keep the lists for counting.
cleaned_up_news = data_djia_up['combined_news'].apply(punctuation_stopwords_removal)
data_djia_up.loc[:, 'combined_news'] = cleaned_up_news
words_djia_up = data_djia_up['combined_news'].tolist()
print(data_djia_up[:1])

In [73]:
# Token lists for the first three "up" days.
words_djia_up[:3]

In [74]:
# Replace each "down" day's text with its cleaned token list, and keep the lists for counting.
cleaned_down_news = data_djia_down['combined_news'].apply(punctuation_stopwords_removal)
data_djia_down.loc[:, 'combined_news'] = cleaned_down_news
words_djia_down = data_djia_down['combined_news'].tolist()
print(words_djia_down[:2])

In [75]:
# Flatten the per-day token lists into one flat list of words per label.
djia_up_list = [word for day_tokens in words_djia_up for word in day_tokens]
djia_down_list = [word for day_tokens in words_djia_down for word in day_tokens]
        

In [77]:
# Report how many tokens were collected for each label.
print('DJIA up list : {}'.format(len(djia_up_list)))
print('DJIA down list : {}'.format(len(djia_down_list)))
# Show only a short sample; echoing the whole list floods the cell output.
djia_up_list[:20]

In [79]:
# Word frequencies per label, then the 30 most frequent words as small (word, count) tables.
top_n = 30
word_count_columns = ['word', 'count']

djia_up_counter, djia_down_counter = Counter(djia_up_list), Counter(djia_down_list)

djia_up_top_30_words = pd.DataFrame(
    djia_up_counter.most_common(top_n), columns=word_count_columns
)
top_30_words_down = pd.DataFrame(
    djia_down_counter.most_common(top_n), columns=word_count_columns
)


In [80]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Bar chart of the most frequent words on days the DJIA went up.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=djia_up_top_30_words, x='word', y='count', ax=ax)
ax.set_title('Top 30 words when DJIA goes up')
# Rotate the word labels so they do not overlap.
plt.xticks(rotation='vertical')

In [81]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Bar chart of the most frequent words on days the DJIA went down.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_30_words_down, x='word', y='count', ax=ax)
ax.set_title('Top 30 words when DJIA goes down')
# Rotate the word labels so they do not overlap.
plt.xticks(rotation='vertical')

# Using DistilBERT

In [83]:
# df still holds the raw (uncleaned) combined_news text; the cleaning above ran on copies.
df.head()

In [84]:
!pip install transformers

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Suppress every Python warning for the rest of the session (this also hides
# any warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

## Load HuggingFace DistilBERT model


In [89]:
# DistilBERT checkpoint name and the matching HuggingFace model/tokenizer classes.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Instantiate the pre-trained tokenizer and encoder from that checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Tokenization and Padding


In [90]:
# Encode every text as DistilBERT token ids.
# - add_special_tokens (plural) is the real keyword; the misspelt singular form
#   was not a recognised argument.
# - truncation=True caps each sequence at the tokenizer's maximum length so an
#   over-long text cannot overflow the model's position embeddings.
tokenized = df['combined_news'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True, truncation=True)
)

# Length of the longest encoded sequence; every row is padded up to it.
max_len = max((len(ids) for ids in tokenized.values), default=0)
print(max_len)

# Right-pad with id 0 so all rows share one length and stack into a 2-D array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [48]:
# Number of encoded texts (one entry per row of df).
print(tokenized.shape)

In [91]:
# Token ids of the first encoded text.
tokenized[:1]

### Masking


In [92]:
# 1 where the padded array holds a non-zero token id, 0 at the zero-filled padding positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

In [93]:
# Convert the padded ids and the mask to tensors, then run a single forward
# pass with gradient tracking disabled (the model is only used for inference).
input_ids, attention_mask = (
    torch.tensor(array) for array in (padded, attention_mask)
)
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [94]:
# Take the hidden state at sequence position 0 of the model's first output as
# each text's feature vector (the special start token when special tokens are added).
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()
labels = df['Label']

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Hold out a test split using sklearn's default proportions. The fixed seed
# makes the split, and every score reported below, reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [97]:
# Fit a default logistic-regression classifier on the DistilBERT features.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

In [98]:
# Mean accuracy of the classifier on the held-out split.
lr_clf.score(test_features, test_labels)

In [99]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Positive-class probabilities drive both the AUC and the curve. Scoring hard
# 0/1 predictions instead reduces the ROC to a single operating point and
# reports a different area than the curve that is plotted.
test_probabilities = lr_clf.predict_proba(test_features)[:, 1]
logit_roc_auc = roc_auc_score(test_labels, test_probabilities)
fpr, tpr, thresholds = roc_curve(test_labels, test_probabilities)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
# Without a legend the label carrying the AUC value is never drawn.
plt.legend(loc='lower right')
plt.show()

# Things to do to improve:
1. Make own web-scraping tool to collect the dataset.
2. Add Dense layers on top of the pre-trained BERT model and fine-tune it on the dataset.
3. Use a bigger BERT Model.
4. Fix glitches in the Dataset.
5. In addition, implement Adversarial Training and Ensemble Models.