In [99]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [100]:
# Load the post-processed news dataset; the file is semicolon-separated.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index -- consider index_col=0 after confirming how the CSV was written.
news_csv_path = "./data/news_postprocess.csv"
df = pd.read_csv(news_csv_path, sep=";")

# Preview the first rows (last expression, so it renders as the cell output).
df.head()

Unnamed: 0,sentiment,text,sentiment_encoded
0,positive,"The apartment block will be well-located , in ...",2
1,neutral,The sale of the food potato business is part o...,1
2,positive,The Group 's business is balanced by its broad...,2
3,positive,TeliaSonera said about $ 100 million will be i...,2
4,neutral,The mall is part of the Baltic Pearl developme...,1


In [101]:
# generate embeddings
# The checkpoint is loaded through SentenceTransformer; the recorded cell output
# reports that no sentence-transformers model exists under this name, so a new
# one is created with mean pooling.
model = SentenceTransformer("yiyanghkust/finbert-tone")

# Encode every entry of the `text` column, showing a progress bar while batching.
sentences = df['text'].tolist()
embeddings = model.encode(sentences, show_progress_bar=True)

No sentence-transformers model found with name yiyanghkust/finbert-tone. Creating a new one with mean pooling.
Batches: 100%|██████████| 57/57 [00:15<00:00,  3.66it/s]


In [102]:
# check embeddings dimensions: print the shape of the array returned by model.encode
embedding_shape = embeddings.shape
print(embedding_shape)

(1812, 768)


In [103]:
# dataset splitting
from sklearn.model_selection import train_test_split

# Two successive splits with identical options: the first holds out 20% of all
# rows as the test set, the second holds out 20% of the remainder as validation.
# NOTE(review): neither split is stratified on the label -- confirm that is intended.
sentiment_labels = df["sentiment_encoded"]
split_options = {"test_size": 0.2, "random_state": 42}

X_train_val, X_test, y_train_val, y_test = train_test_split(
    embeddings, sentiment_labels, **split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, **split_options
)

# Report how many rows ended up in each split.
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 1159
Validation set size: 290
Test set size: 363


In [104]:
# scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# checking shapes
scaled_shapes = (X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)
print(*scaled_shapes)

(1159, 768) (290, 768) (363, 768)


In [105]:
# logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# training: fit on the scaled training split (fit() returns the estimator itself)
log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    random_state=42,
).fit(X_train_scaled, y_train)

# evaluation: predict on the scaled test split and compare against y_test
class_names = ['negative', 'neutral', 'positive']
y_pred = log_reg.predict(X_test_scaled)

log_reg_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {log_reg_accuracy}")
print("Classification Report:")
log_reg_report = classification_report(y_test, y_pred, target_names=class_names)
print(log_reg_report)

Accuracy: 0.7851239669421488
Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.89      0.86       129
     neutral       0.74      0.68      0.71       107
    positive       0.77      0.76      0.77       127

    accuracy                           0.79       363
   macro avg       0.78      0.78      0.78       363
weighted avg       0.78      0.79      0.78       363



In [106]:
# svm model
from sklearn.svm import SVC

# training: linear-kernel SVC fitted on the scaled training split
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

# evaluation: predict on the scaled test split and compare against y_test
class_names = ['negative', 'neutral', 'positive']
y_pred = svm_model.predict(X_test_scaled)

svm_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
svm_report = classification_report(y_test, y_pred, target_names=class_names)
print(svm_report)

Accuracy: 0.8044077134986226
Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.89      0.86       129
     neutral       0.78      0.71      0.74       107
    positive       0.81      0.80      0.80       127

    accuracy                           0.80       363
   macro avg       0.80      0.80      0.80       363
weighted avg       0.80      0.80      0.80       363



In [107]:
# xgboost model
from xgboost import XGBClassifier

# Multi-class gradient-boosted classifier for the three sentiment labels.
# `use_label_encoder` is deliberately not passed: the recorded run reported
# 'Parameters: { "use_label_encoder" } are not used.', i.e. the installed
# xgboost ignores the argument and only emits a warning for it.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
)

# training: fit on the scaled training split
xgb.fit(X_train_scaled, y_train)

# evaluation: predict on the scaled test split and compare against y_test
y_pred = xgb.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8402203856749312
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.93      0.90       129
     neutral       0.77      0.80      0.79       107
    positive       0.88      0.78      0.82       127

    accuracy                           0.84       363
   macro avg       0.84      0.84      0.84       363
weighted avg       0.84      0.84      0.84       363



In [108]:
# mlp
from sklearn.neural_network import MLPClassifier

# training: one hidden layer of 128 ReLU units, optimised with Adam,
# fitted on the scaled training split (fit() returns the estimator itself)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
).fit(X_train_scaled, y_train)

# evaluation: predict on the scaled test split and compare against y_test
class_names = ['negative', 'neutral', 'positive']
y_pred = mlp_model.predict(X_test_scaled)

mlp_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {mlp_accuracy}")
print("Classification Report:")
mlp_report = classification_report(y_test, y_pred, target_names=class_names)
print(mlp_report)

Accuracy: 0.8154269972451791
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.92      0.89       129
     neutral       0.79      0.72      0.75       107
    positive       0.79      0.79      0.79       127

    accuracy                           0.82       363
   macro avg       0.81      0.81      0.81       363
weighted avg       0.81      0.82      0.81       363

