In [78]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [79]:
df = pd.read_csv("./data/news_postprocess.csv", delimiter=";")
df.head()

Unnamed: 0,Sentence,Sentiment,sentiment_encoded
0,"In 2009 , Comptel slipped to a net loss of EUR...",negative,0
1,Repeats sees 2008 EBITA above 18 pct of sales .,positive,2
2,$AAPL price momentum weakening going into the ...,negative,0
3,Why $MCD looks set to fall further. http://stk...,negative,0
4,Net cash flow from operating activities was a ...,neutral,1


In [80]:
# generate embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df['Sentence'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 81/81 [00:07<00:00, 11.10it/s]


In [81]:
# check embeddings dimensions
print(embeddings.shape)

(2580, 384)


In [82]:
# dataset splitting
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(embeddings, df["sentiment_encoded"], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 1651
Validation set size: 413
Test set size: 516


In [83]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# checking shapes
print(X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)

(1651, 384) (413, 384) (516, 384)


In [84]:
# logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# training
log_reg = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
log_reg.fit(X_train_scaled, y_train)

# evaluation
y_pred = log_reg.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.6763565891472868
Classification Report:
              precision    recall  f1-score   support

    negative       0.68      0.69      0.68       181
     neutral       0.66      0.67      0.66       172
    positive       0.70      0.67      0.69       163

    accuracy                           0.68       516
   macro avg       0.68      0.68      0.68       516
weighted avg       0.68      0.68      0.68       516



In [85]:
# svm model
from sklearn.svm import SVC

# training
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# evaluation
y_pred = svm_model.predict(X_test_scaled)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.6395348837209303
Classification Report:
              precision    recall  f1-score   support

    negative       0.61      0.67      0.64       181
     neutral       0.62      0.60      0.61       172
    positive       0.69      0.65      0.67       163

    accuracy                           0.64       516
   macro avg       0.64      0.64      0.64       516
weighted avg       0.64      0.64      0.64       516



In [86]:
# xgboost model
from xgboost import XGBClassifier

xgb = XGBClassifier(
    objective='multi:softmax',  
    num_class=3,              
    eval_metric='mlogloss',     
    use_label_encoder=False,
    random_state=42             
)

# training
xgb.fit(X_train_scaled, y_train)

# evaluation
y_pred = xgb.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7131782945736435
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.71      0.71       181
     neutral       0.69      0.71      0.70       172
    positive       0.74      0.72      0.73       163

    accuracy                           0.71       516
   macro avg       0.71      0.71      0.71       516
weighted avg       0.71      0.71      0.71       516



In [87]:
# mlp
from sklearn.neural_network import MLPClassifier

# training
mlp_model = MLPClassifier(hidden_layer_sizes=(128,), activation='relu', solver='adam', max_iter=500, random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# evaluation
y_pred = mlp_model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.6918604651162791
Classification Report:
              precision    recall  f1-score   support

    negative       0.68      0.70      0.69       181
     neutral       0.67      0.69      0.68       172
    positive       0.73      0.69      0.71       163

    accuracy                           0.69       516
   macro avg       0.69      0.69      0.69       516
weighted avg       0.69      0.69      0.69       516

