In [1]:
#!pip install pandas numpy nltk matplotlib seaborn wordcloud
%run 1_data_load.ipynb

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import wordninja

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kasim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df_news['Time'] = pd.to_datetime(df_news['Time'], format='%d/%m/%Y')
df_news = df_news.rename(columns = {'Time': 'Date'})
print(df_news.dtypes)

df_price['Date'] = pd.to_datetime(df_price['Date'], format='%m/%d/%Y')

Title              object
Tag                object
Date       datetime64[ns]
Content            object
dtype: object


In [4]:
# Kursveränderung berechnen
df_price['Change'] = np.round(
            (df_price['Open'] - df_price['Close/Last']) / df_price['Open'], 5
        )

# Label anhand der Kursveränderung
# 1 - Strong gain (>= +20%)
# 2 - Significant gain (+13% to <20%)
# 3 - Moderate gain (+7% to <13%)
# 4 - Stable gain (3% to <+7%)
# 5 - Stable neutral gain (1.5% to <+3%)
# 6 - Neutral (volatility) (-1.5% to <+1.5%)
# 7 - Stable neutral loss (3% to <-1.5%)
# 8 - Stable loss (-7% to <-3%)
# 9 - Moderate loss (-13% to <-7%)
# 10 - Significant loss (-20% to <-13%)
# 11 - Strong loss (< -20%)
conditions = [
    (df_price['Change'] >= 0.20), # 1
    ((df_price['Change'] >= 0.13) & (df_price['Change'] < 0.20)), # 2
    ((df_price['Change'] >= 0.07) & (df_price['Change'] < 0.13)), # 3
    ((df_price['Change'] >= 0.03) & (df_price['Change'] < 0.07)), # 4
    ((df_price['Change'] >= 0.015) & (df_price['Change'] < 0.03)), # 5
    ((df_price['Change'] >= -0.015) & (df_price['Change'] < 0.015)), # 6
    ((df_price['Change'] >= -0.03) & (df_price['Change'] < -0.015)), # 7
    ((df_price['Change'] >= -0.07) & (df_price['Change'] < -0.03)), # 8
    ((df_price['Change'] >= -0.13) & (df_price['Change'] < -0.07)), # 9
    ((df_price['Change'] >= -0.20) & (df_price['Change'] < -0.13)), # 10
    (df_price['Change'] < -0.20) # 11
]
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
df_price['Label'] = np.select(conditions, labels)

df_price.groupby(by=['Label']).count()


Unnamed: 0_level_0,Date,Close/Last,Open,High,Low,Change
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,31,31,31,31,31,31
5,154,154,154,154,154,154
6,2191,2191,2191,2191,2191,2191
7,114,114,114,114,114,114
8,22,22,22,22,22,22
9,2,2,2,2,2,2
11,4,4,4,4,4,4


In [5]:
# decoupling, case-correction, pyrtial hyphen-resolution

def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', ' ', str(text)).lower()
    words = text.split()
    
    tokens = []
    for w in words:
            tokens.extend(wordninja.split(w))
    
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    return tokens

df_news['Cleaned Content'] = df_news['Content'].apply(clean_text)


In [6]:
# Beide DataFrames mit Datumsspalten angleichen
#df_news['date'] = df_news['Time'].dt.date
#df_price['date'] = df_price['Date'].dt.date

# Merge
merged_df = pd.merge(df_news, df_price[['Date', 'Label']], on='Date', how='inner')

merged_df.groupby(by=['Label']).count()

Unnamed: 0_level_0,Title,Tag,Date,Content,Cleaned Content
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,363,363,363,363,363
5,1057,1057,1057,1057,1057
6,11321,11321,11321,11321,11321
7,965,965,965,965,965
8,223,223,223,223,223
9,27,27,27,27,27
11,1,1,1,1,1


In [16]:
def simplify_label(label):
    if label < 6 :
        return 'positive'
    elif label > 6:
        return 'negative'
    else:
        return 'neutral'

merged_df['Simple Label'] = merged_df['Label'].apply(simplify_label)
merged_df.head(10)
df_news = merged_df
df_news.head(10)

df_news.sort_values(by = "Date", ascending=True)
# max 2023 - 07 - 14
# min 2015 - 06 - 23

Unnamed: 0,Title,Tag,Date,Content,Cleaned Content,Label,Simple Label
13956,Durable Goods Orders Fall More Than Expected,Durable Goods Orders,2015-06-23,Durable Goods Orders Fall More Than ExpectedUn...,"[durable, goods, orders, fall, expected, unite...",6,neutral
13955,New Home Sales at 7-Year High,New Home Sales,2015-06-23,New Home Sales at 7-Year HighUnited States New...,"[new, home, sales, year, high, united, states,...",6,neutral
13954,US Factory Activity at 20-Month Low,Manufacturing Pmi,2015-06-23,US Factory Activity at 20-Month LowUnited Stat...,"[factory, activity, month, low, united, states...",6,neutral
13953,US GDP Contracts Less Than Expected in Q1,Gdp Growth Rate,2015-06-24,US GDP Contracts Less Than Expected in Q1Unite...,"[gdp, contracts, less, expected, united, state...",6,neutral
13951,Personal Spending Rises The Most in Nearly 6 Y...,Personal Spending,2015-06-25,Personal Spending Rises The Most in Nearly 6 Y...,"[personal, spending, rises, nearly, years, uni...",6,neutral
...,...,...,...,...,...,...,...
13,US Core PPI Rises Less than Expected,Core Producer Prices MoM,2023-07-14,US Core PPI Rises Less than ExpectedUnited Sta...,"[core, ppi, rises, less, expected, united, sta...",6,neutral
14,Progressive earnings below expectations at 0.5...,Earnings,2023-07-14,Progressive earnings below expectations at 0.5...,"[progressive, earnings, expectations, usd, uni...",6,neutral
15,PepsiCo earnings above expectations at 2.09 USD,Earnings,2023-07-14,PepsiCo earnings above expectations at 2.09 US...,"[pepsico, earnings, expectations, usd, united,...",6,neutral
9,Visa Hits 24-week High,stocks,2023-07-14,Visa Hits 24-week HighUnited States stocksVisa...,"[visa, hits, week, high, united, states, stock...",6,neutral


In [None]:
merged_df.groupby(by=['Simple Label']).count()

In [None]:
# Only text and labels
tokenized_df = merged_df[["Cleaned Content", "Label", "Simple Label"]]