In [1]:
#!pip install pandas numpy nltk matplotlib seaborn wordcloud
%run 1_data_load.ipynb

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import wordninja

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kasim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df_news['Time'] = pd.to_datetime(df_news['Time'], format='%d/%m/%Y')
df_news = df_news.rename(columns = {'Time': 'Date'})
print(df_news.dtypes)

df_price['Date'] = pd.to_datetime(df_price['Date'], format='%m/%d/%Y')

Title              object
Tag                object
Date       datetime64[ns]
Content            object
dtype: object


In [4]:
# Kursveränderung berechnen
df_price['Change'] = np.round(
            (df_price['Open'] - df_price['Close/Last']) / df_price['Open'], 5
        )

# Label anhand der Kursveränderung
# 1 - Strong gain (>= +20%)
# 2 - Significant gain (+13% to <20%)
# 3 - Moderate gain (+7% to <13%)
# 4 - Stable gain (3% to <+7%)
# 5 - Stable neutral gain (1.5% to <+3%)
# 6 - Neutral (volatility) (-1.5% to <+1.5%)
# 7 - Stable neutral loss (3% to <-1.5%)
# 8 - Stable loss (-7% to <-3%)
# 9 - Moderate loss (-13% to <-7%)
# 10 - Significant loss (-20% to <-13%)
# 11 - Strong loss (< -20%)
conditions = [
    ((df_price['Change'] >= 0.015)), # 1
    ((df_price['Change'] >= 0) & (df_price['Change'] < 0.015)), # 2
    ((df_price['Change'] >= -0.015) & (df_price['Change'] < 0)), # 2
    ((df_price['Change'] < -0.015)) # 3
]
labels = [1, 2, 3, 4]
df_price['Label'] = np.select(conditions, labels)

df_price.groupby(by=['Label']).count()

Unnamed: 0_level_0,Date,Close/Last,Open,High,Low,Change
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,185,185,185,185,185,185
2,955,955,955,955,955,955
3,1236,1236,1236,1236,1236,1236
4,142,142,142,142,142,142


In [5]:
# decoupling, case-correction, pyrtial hyphen-resolution

def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', ' ', str(text)).lower()
    words = text.split()
    
    tokens = []
    for w in words:
            tokens.extend(wordninja.split(w))
    
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    return tokens

df_news['Cleaned Content'] = df_news['Content'].apply(clean_text)

In [6]:
df_news['Weekday'] = df_news['Date'].dt.day_name()

df_news['Impact Date'] = np.where(
    df_news['Weekday'] != 'Friday',
    df_news['Date'] + pd.Timedelta(days=1),
    df_news['Date'] + pd.Timedelta(days=3)
)
df_news['Impact Date'] = pd.to_datetime(df_news['Impact Date'], format='%m/%d/%Y')

In [7]:
# Beide DataFrames mit Datumsspalten angleichen
#df_news['date'] = df_news['Time'].dt.date
#df_price['date'] = df_price['Date'].dt.date

# Merge
merged_df = pd.merge(df_news, df_price[['Date', 'Label', 'Change']], left_on = 'Impact Date', right_on = 'Date', how='inner')
merged_df.head(10)

#merged_df.groupby(by=['Label']).count()

Unnamed: 0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,2023-07-16,"TSX Slightly Down, Books Weekly GainsUnited St...","[tsx, slightly, books, weekly, gains, united, ...",Sunday,2023-07-17,2023-07-17,3,-0.00672
1,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672
2,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",Friday,2023-07-17,2023-07-17,3,-0.00672
3,Amazon Hits 43-week High,stocks,2023-07-14,Amazon Hits 43-week HighUnited States stocksAm...,"[amazon, hits, week, high, united, states, sto...",Friday,2023-07-17,2023-07-17,3,-0.00672
4,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,2023-07-14,10-Year Treasury Yield Falls for 4th SessionUn...,"[year, treasury, yield, falls, session, united...",Friday,2023-07-17,2023-07-17,3,-0.00672
5,DXY Approaches 100,Currency,2023-07-14,DXY Approaches 100United States CurrencyThe do...,"[approaches, united, states, currency, dollar,...",Friday,2023-07-17,2023-07-17,3,-0.00672
6,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",Friday,2023-07-17,2023-07-17,3,-0.00672
7,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",Friday,2023-07-17,2023-07-17,3,-0.00672
8,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",Friday,2023-07-17,2023-07-17,3,-0.00672
9,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672


In [8]:
def simplify_label(label):
    if label < 0:
        return 'negative'
    else:
        return 'positiv'

merged_df['Simple Label'] = merged_df['Change'].apply(simplify_label)
df_news = merged_df
df_news.head(10)


#df_news[df_news['Weekday'].isin(['Friday'])].sort_values(by = "Date", ascending=True)
# max 2023 - 07 - 14
# min 2015 - 06 - 23

Unnamed: 0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change,Simple Label
0,"TSX Slightly Down, Books Weekly Gains",Stock Market,2023-07-16,"TSX Slightly Down, Books Weekly GainsUnited St...","[tsx, slightly, books, weekly, gains, united, ...",Sunday,2023-07-17,2023-07-17,3,-0.00672,negative
1,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
2,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
3,Amazon Hits 43-week High,stocks,2023-07-14,Amazon Hits 43-week HighUnited States stocksAm...,"[amazon, hits, week, high, united, states, sto...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
4,10-Year Treasury Yield Falls for 4th Session,Government Bond 10Y,2023-07-14,10-Year Treasury Yield Falls for 4th SessionUn...,"[year, treasury, yield, falls, session, united...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
5,DXY Approaches 100,Currency,2023-07-14,DXY Approaches 100United States CurrencyThe do...,"[approaches, united, states, currency, dollar,...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
6,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
7,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
8,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative
9,US Budget Deficit Widens More than Expected in...,Government Budget Value,2023-07-14,US Budget Deficit Widens More than Expected in...,"[budget, deficit, widens, expected, june, unit...",Friday,2023-07-17,2023-07-17,3,-0.00672,negative


In [9]:
merged_df.groupby(by=['Simple Label']).count()

Unnamed: 0_level_0,Title,Tag,Date_x,Content,Cleaned Content,Weekday,Impact Date,Date_y,Label,Change
Simple Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
negative,7585,7585,7585,7585,7585,7585,7585,7585,7585,7585
positiv,6052,6052,6052,6052,6052,6052,6052,6052,6052,6052


In [10]:
# Only text and labels
tokenized_df = merged_df[["Cleaned Content", "Label", "Simple Label"]]