In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [27]:
df = pd.read_csv("./data/tweets_post_process.csv", delimiter=";")

df.head()

Unnamed: 0,text,sentiment,cleaned_text,sentiment_encoded
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,rt robertbeadles yo enter to win 1000 monarch ...,2
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,srilanka surcharge on fuel removed the surchar...,0
2,Net issuance increases to fund fiscal programs...,positive,net issuance increases to fund fiscal programs...,2
3,RT @bentboolean: How much of Amazon's traffic ...,positive,rt bentboolean how much of amazons traffic is ...,2
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,amd ryzen 4000 desktop cpus looking great and ...,2


In [28]:
# load sentence transformers model
model = SentenceTransformer('all-mpnet-base-v2')
# generate sentence embeddings
embeddings = model.encode(df['cleaned_text'].tolist(), show_progress_bar=True)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 41/41 [00:18<00:00,  2.23it/s]


In [29]:
# check embeddings dimensions
print(embeddings.shape)

(1300, 768)


In [64]:
from sklearn.model_selection import train_test_split

# splitting the dataset into train and test and validation sets 
X_train_val, X_test, y_train_val, y_test = train_test_split(embeddings, df["sentiment_encoded"], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 832
Validation set size: 208
Test set size: 260


In [65]:
from sklearn.preprocessing import StandardScaler
# scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# checking shapes
print(X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)

(832, 768) (208, 768) (260, 768)


In [None]:
# testing logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# training
log_reg = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
log_reg.fit(X_train_scaled, y_train)

# evaluation
y_pred = log_reg.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.5230769230769231
              precision    recall  f1-score   support

    negative       0.50      0.47      0.49        78
     neutral       0.54      0.50      0.52        88
    positive       0.53      0.59      0.56        94

    accuracy                           0.52       260
   macro avg       0.52      0.52      0.52       260
weighted avg       0.52      0.52      0.52       260



In [74]:
# testing svm model
from sklearn.svm import SVC

# training
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# evaluation
y_pred = svm_model.predict(X_test_scaled)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.5307692307692308
Classification Report:
              precision    recall  f1-score   support

    negative       0.51      0.62      0.55        78
     neutral       0.55      0.44      0.49        88
    positive       0.54      0.54      0.54        94

    accuracy                           0.53       260
   macro avg       0.53      0.53      0.53       260
weighted avg       0.53      0.53      0.53       260



In [75]:
# testing xgboost model
from xgboost import XGBClassifier

xgb = XGBClassifier(
    objective='multi:softmax',  
    num_class=3,              
    eval_metric='mlogloss',     
    use_label_encoder=False,
    random_state=42             
)

# training
xgb.fit(X_train_scaled, y_train)

# evaluation
y_pred = xgb.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.5307692307692308
Classification Report:
              precision    recall  f1-score   support

    negative       0.57      0.41      0.48        78
     neutral       0.57      0.57      0.57        88
    positive       0.48      0.60      0.53        94

    accuracy                           0.53       260
   macro avg       0.54      0.52      0.53       260
weighted avg       0.54      0.53      0.53       260



In [76]:
# testing mlp
from sklearn.neural_network import MLPClassifier

# training
mlp_model = MLPClassifier(hidden_layer_sizes=(128,), activation='relu', solver='adam', max_iter=500, random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# evaluation
y_pred = mlp_model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.5423076923076923
Classification Report:
              precision    recall  f1-score   support

    negative       0.54      0.51      0.53        78
     neutral       0.56      0.51      0.54        88
    positive       0.53      0.60      0.56        94

    accuracy                           0.54       260
   macro avg       0.54      0.54      0.54       260
weighted avg       0.54      0.54      0.54       260

