In [1]:
#!pip install pandas numpy nltk matplotlib seaborn wordcloud
%run 1_data_load.ipynb

Open            float64
High            float64
Low             float64
Close           float64
Volume            int64
Dividends       float64
Stock Splits    float64
dtype: object


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import wordninja

#FinBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kasim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df_news['Time'] = pd.to_datetime(df_news['Time'], format='%d/%m/%Y')
df_news = df_news.rename(columns = {'Time': 'Date'})
print(df_news.dtypes)


Title              object
Tag                object
Date       datetime64[ns]
Content            object
dtype: object


In [4]:
df_price.sort_values('Date', inplace=True)

df_price = df_price.reset_index()
df_price = df_price[['Date', 'Open', 'Close', 'Volume']]
df_price['Date'] = df_price['Date'].dt.tz_localize(None)

df_price.rename(columns={'Close/Last': 'Close'}, inplace=True)
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%m/%d/%Y')

df_price['Prev_change'] = df_price['Close'].pct_change()

df_price['Sma_5'] = df_price['Close'].rolling(window=5).mean() # rolling avg for the last 5 days 

df_price['Volatility'] = df_price['Close'].rolling(window=5).std() # volatility for the last 5 days 

df_price.head(10)

Unnamed: 0,Date,Open,Close,Volume,Prev_change,Sma_5,Volatility
0,2015-06-01,5094.939941,5082.930176,1902120000,,,
1,2015-06-02,5063.470215,5076.52002,1729750000,-0.001261,,
2,2015-06-03,5098.47998,5099.22998,1852680000,0.004474,,
3,2015-06-04,5078.220215,5059.120117,1813960000,-0.007866,,
4,2015-06-05,5057.040039,5068.459961,1842890000,0.001846,5077.252051,15.181191
5,2015-06-08,5066.649902,5021.629883,1712210000,-0.00924,5064.991992,28.42496
6,2015-06-09,5013.129883,5013.870117,1754340000,-0.001545,5052.462012,35.097942
7,2015-06-10,5029.410156,5076.689941,1792980000,0.012529,5047.954004,28.397188
8,2015-06-11,5088.259766,5082.509766,1623950000,0.001146,5052.631934,32.348124
9,2015-06-12,5060.240234,5051.100098,1428900000,-0.00618,5049.159961,31.133389


In [5]:
# Kursveränderung berechnen
df_price['Change'] = np.round(
            (df_price['Open'] - df_price['Close']) / df_price['Open'], 5
        )

# Label anhand der Kursveränderung
# 1 - Strong gain (>= +20%)
# 2 - Significant gain (+13% to <20%)
# 3 - Moderate gain (+7% to <13%)
# 4 - Stable gain (3% to <+7%)
# 5 - Stable neutral gain (1.5% to <+3%)
# 6 - Neutral (volatility) (-1.5% to <+1.5%)
# 7 - Stable neutral loss (3% to <-1.5%)
# 8 - Stable loss (-7% to <-3%)
# 9 - Moderate loss (-13% to <-7%)
# 10 - Significant loss (-20% to <-13%)
# 11 - Strong loss (< -20%)
conditions = [
    ((df_price['Change'] >= 0.015)), # 1
    ((df_price['Change'] >= 0) & (df_price['Change'] < 0.015)), # 2
    ((df_price['Change'] >= -0.015) & (df_price['Change'] < 0)), # 3
    ((df_price['Change'] < -0.015)) # 4
]
labels = [1, 2, 3, 4]
df_price['Label'] = np.select(conditions, labels)

df_price.head(10)
df_price.groupby(by=['Label']).count()

Unnamed: 0_level_0,Date,Open,Close,Volume,Prev_change,Sma_5,Volatility,Change
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,153,153,153,153,153,153,153,153
2,780,780,780,780,779,778,778,780
3,1002,1002,1002,1002,1002,1000,1000,1002
4,120,120,120,120,120,120,120,120


In [6]:
# decoupling, case-correction, pyrtial hyphen-resolution

df_news['Content'] = df_news['Content'].str.replace(r'\.[^.]*$', '', regex=True)

def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', ' ', str(text)).lower()
    words = text.split()
    
    tokens = []
    for w in words:
            tokens.extend(wordninja.split(w))
    
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    return tokens

df_news['Cleaned Content'] = df_news['Content'].apply(clean_text)

In [7]:
df_news['Weekday'] = df_news['Date'].dt.day_name()

df_news['Impact Date'] = np.where(
    df_news['Weekday'] != 'Friday',
    df_news['Date'] + pd.Timedelta(days=1),
    df_news['Date'] + pd.Timedelta(days=3)
)
df_news['Impact Date'] = pd.to_datetime(df_news['Impact Date'], format='%m/%d/%Y')

In [8]:
# Beide DataFrames mit Datumsspalten angleichen
#df_news['date'] = df_news['Time'].dt.date
#df_price['date'] = df_price['Date'].dt.date

# Merge
merged_df = pd.merge(df_news, 
                     df_price[['Date', 'Label', 'Change', 'Sma_5', 'Volatility', 'Prev_change']], 
                     left_on = 'Impact Date', right_on = 'Date', how='inner'
                    )
merged_df.head(10)

#merged_df.groupby(by=['Label']).count()

Unnamed: 0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change,Sma_5,Volatility,Prev_change
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,2023-07-16,"TSX Slightly Down, Books Weekly GainsUnited St...","[tsx, slightly, books, weekly, gains, united, ...",Sunday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
1,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
2,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
3,Amazon Hits 43-week High,stocks,2023-07-14,Amazon Hits 43-week HighUnited States stocksAm...,"[amazon, hits, week, high, united, states, sto...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
4,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,2023-07-14,10-Year Treasury Yield Falls for 4th SessionUn...,"[year, treasury, yield, falls, session, united...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
5,DXY Approaches 100,Currency,2023-07-14,DXY Approaches 100United States CurrencyThe do...,"[approaches, united, states, currency, dollar,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
6,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
7,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
8,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299
9,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299


In [9]:
def simplify_label(label):
    if label < 0:
        return 'negative'
    else:
        return 'positiv'

merged_df['Simple Label'] = merged_df['Change'].apply(simplify_label)
df_news = merged_df
df_news.head(10)

#df_news[df_news['Weekday'].isin(['Friday'])].sort_values(by = "Date", ascending=True)
# max 2023 - 07 - 14
# min 2015 - 06 - 23

Unnamed: 0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change,Sma_5,Volatility,Prev_change,Simple Label
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,2023-07-16,"TSX Slightly Down, Books Weekly GainsUnited St...","[tsx, slightly, books, weekly, gains, united, ...",Sunday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
1,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
2,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
3,Amazon Hits 43-week High,stocks,2023-07-14,Amazon Hits 43-week HighUnited States stocksAm...,"[amazon, hits, week, high, united, states, sto...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
4,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,2023-07-14,10-Year Treasury Yield Falls for 4th SessionUn...,"[year, treasury, yield, falls, session, united...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
5,DXY Approaches 100,Currency,2023-07-14,DXY Approaches 100United States CurrencyThe do...,"[approaches, united, states, currency, dollar,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
6,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
7,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
8,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative
9,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative


In [10]:
# FinBert

model_name = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).numpy()[0]
    labels = ['negative', 'neutral', 'positive']
    return dict(zip(labels, probs))

merged_df['finbert_sentiment'] = merged_df['Title'].apply(lambda x: get_finbert_sentiment(x))
merged_df['finbert_label'] = merged_df['finbert_sentiment'].apply(lambda d: max(d, key=d.get))

merged_df.head(10)

Unnamed: 0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change,Sma_5,Volatility,Prev_change,Simple Label,finbert_sentiment,finbert_label
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,2023-07-16,"TSX Slightly Down, Books Weekly GainsUnited St...","[tsx, slightly, books, weekly, gains, united, ...",Sunday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.011293471, 'neutral': 0.9567599...",neutral
1,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.6081924, 'neutral': 0.36712483,...",negative
2,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.7537409, 'neutral': 0.06819165,...",negative
3,Amazon Hits 43-week High,stocks,2023-07-14,Amazon Hits 43-week HighUnited States stocksAm...,"[amazon, hits, week, high, united, states, sto...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.5418749, 'neutral': 0.046920035...",negative
4,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,2023-07-14,10-Year Treasury Yield Falls for 4th SessionUn...,"[year, treasury, yield, falls, session, united...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.018656207, 'neutral': 0.945734,...",neutral
5,DXY Approaches 100,Currency,2023-07-14,DXY Approaches 100United States CurrencyThe do...,"[approaches, united, states, currency, dollar,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.10120035, 'neutral': 0.04638974...",positive
6,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.5960674, 'neutral': 0.37919444,...",negative
7,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.037467457, 'neutral': 0.9188981...",neutral
8,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.9225229, 'neutral': 0.0413247, ...",negative
9,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,14035.376172,193.458381,0.009299,negative,"{'negative': 0.6081924, 'neutral': 0.36712483,...",negative


In [11]:
merged_df.groupby(by=['Simple Label']).count()
merged_df.to_csv('merged_df.csv', index=False)  

In [12]:
# Only text and labels
tokenized_df = merged_df