In [1]:
#%run 2.1_data_preparation_news.ipynb

Date      datetime64[ns]
Open             float64
Close            float64
Volume             int64
dtype: object


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kasim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Title              object
Tag                object
Date       datetime64[ns]
Content            object
dtype: object


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import pandas as pd
import numpy as np

tokenized_df = pd.read_csv('./merged_df.csv')

label_map = {'negative': -1, 'neutral': 0, 'positive': 1}
tokenized_df['finbert_label_num'] = tokenized_df['finbert_label'].map(label_map)

for col in ['Prev_change', 'Sma_5', 'Volatility', 'finbert_label_num']:
    tokenized_df[col] = tokenized_df[col].fillna(tokenized_df[col].mean())

tokenized_df['text'] = tokenized_df['Cleaned Content'].apply(lambda x: ' '.join(x))

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
X_text = vectorizer.fit_transform(tokenized_df['text'])

numeric_features = ['Prev_change', 'Sma_5', 'Volatility']
numeric_data = tokenized_df[numeric_features]
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_data)

X_combined = hstack([X_text, X_numeric])

y = tokenized_df['Simple Label']

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.85      0.84      0.84      1496
     positiv       0.81      0.82      0.81      1232

    accuracy                           0.83      2728
   macro avg       0.83      0.83      0.83      2728
weighted avg       0.83      0.83      0.83      2728



In [3]:
feature_names = vectorizer.get_feature_names_out().tolist() + numeric_features

coefficients = model.coef_[0]

coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
})

top_positive = coef_df.sort_values(by='coefficient', ascending=False).head(15)
top_negative = coef_df.sort_values(by='coefficient').head(15)

print(coef_df[coef_df['coefficient'].abs() > 0.5])

             feature  coefficient
121    activity data    -0.575420
182     added points    -0.526847
203       additional     0.722955
351              ago    -0.887933
430         aircraft    -0.504737
...              ...          ...
23314   worse market    -0.593941
23390           year    -0.733187
23529          years    -0.758633
23622  yields remain     0.585312
23684    Prev_change    -3.326505

[297 rows x 2 columns]
