In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../data/news_postprocess.csv", delimiter=";")
df.head()

Unnamed: 0,sentiment,text,words_count,neg,neu,pos,compound,sentiment_encoded
0,positive,"The apartment block will be well-located , in ...",22,0.121,0.711,0.167,0.0258,2
1,neutral,The sale of the food potato business is part o...,19,0.0,1.0,0.0,0.0,1
2,positive,The Group 's business is balanced by its broad...,17,0.0,1.0,0.0,0.0,2
3,positive,TeliaSonera said about $ 100 million will be i...,24,0.0,1.0,0.0,0.0,2
4,neutral,The mall is part of the Baltic Pearl developme...,33,0.0,1.0,0.0,0.0,1


In [5]:
# generate embeddings
model = SentenceTransformer("yiyanghkust/finbert-tone")
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

# check embeddings dimensions
print(embeddings.shape)

No sentence-transformers model found with name yiyanghkust/finbert-tone. Creating a new one with mean pooling.
Batches: 100%|██████████| 57/57 [00:11<00:00,  5.03it/s]

(1812, 768)





In [9]:
# combine other features
import numpy as np

other_features = df[['words_count', 'neg', 'neu', 'pos', 'compound']].values

X = np.hstack([embeddings, other_features])
y = df['sentiment_encoded'].values 

print(X.shape)
print(y.shape)

(1812, 773)
(1812,)


In [10]:
# dataset splitting
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 1159
Validation set size: 290
Test set size: 363


In [11]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# checking shapes
print(X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)

(1159, 773) (290, 773) (363, 773)


In [16]:
# models testing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier


models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

In [None]:
# training a evaluating each model

results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    result = {
        "Model" : name,
        "Accuracy": accuracy_score(y_test, y_pred)
    }
    results.append(result)

print("Training Finished")

Training Logistic Regression...
Training Random Forest...
Training Gradient Boosting...
Training SVM...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training MLP...


In [None]:
# Results
for result in results:
    print(f"{result['Model']}: Accuracy = {result['Accuracy']:.4f}")

Logistic Regression: Accuracy = 0.7906
Random Forest: Accuracy = 0.8154
Gradient Boosting: Accuracy = 0.8264
SVM: Accuracy = 0.8209
XGBoost: Accuracy = 0.8127
MLP: Accuracy = 0.8127
