In [36]:
# libraries
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import Dataset

import sys

sys.path.append("../scripts")
import functions as f

.csv with all manual labels applied

In [37]:
# Load the file
reddit = joblib.load('/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle/reddit_labelled-sample.pkl')

In [38]:
# Apply preprocessing
reddit["processed_text"] = reddit["text"].apply(f.token_and_lemmatize_rob)

# Features and labels
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(reddit["processed_text"])  # Tfidf vectorization
y = reddit["label"]

In [39]:
# split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=13,
    stratify=y
)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

Train and test RoBERTa (Robustly Optimized BERT)

In [40]:
# Convert sparse matrices to dense and prepare datasets
X_train_text = [" ".join(map(str, row)) for row in X_train.toarray()]
X_test_text = [" ".join(map(str, row)) for row in X_test.toarray()]

train_data = {"text": X_train_text, "labels": y_train.tolist()}
test_data = {"text": X_test_text, "labels": y_test.tolist()}

train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

In [41]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    # Tokenize using Hugging Face tokenizer
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [42]:
train_dataset = train_dataset.map(f.tokenize_function, batched=True)
test_dataset = test_dataset.map(f.tokenize_function, batched=True)

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

In [43]:
# Load and train the model
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=3
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Decode predictions and calculate accuracy
decoded_preds = encoder.inverse_transform(preds)
decoded_labels = encoder.inverse_transform(y_test)

accuracy = accuracy_score(decoded_labels, decoded_preds)
print(f"RoBERTa model accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(
    classification_report(
        decoded_labels, decoded_preds, target_names=encoder.classes_, zero_division=0
    )
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/396 [00:00<?, ?it/s]

{'loss': 1.0246, 'grad_norm': 6.180790901184082, 'learning_rate': 1.9494949494949496e-05, 'epoch': 0.08}
{'loss': 0.9633, 'grad_norm': 7.70405387878418, 'learning_rate': 1.8989898989898993e-05, 'epoch': 0.15}
{'loss': 0.983, 'grad_norm': 10.537549018859863, 'learning_rate': 1.8484848484848487e-05, 'epoch': 0.23}
{'loss': 0.9486, 'grad_norm': 3.4349822998046875, 'learning_rate': 1.797979797979798e-05, 'epoch': 0.3}
{'loss': 0.8945, 'grad_norm': 3.660184621810913, 'learning_rate': 1.7474747474747475e-05, 'epoch': 0.38}
{'loss': 0.9968, 'grad_norm': 6.397351264953613, 'learning_rate': 1.6969696969696972e-05, 'epoch': 0.45}
{'loss': 1.0389, 'grad_norm': 3.283048391342163, 'learning_rate': 1.6464646464646466e-05, 'epoch': 0.53}
{'loss': 0.9613, 'grad_norm': 5.859060287475586, 'learning_rate': 1.595959595959596e-05, 'epoch': 0.61}
{'loss': 0.989, 'grad_norm': 4.006072521209717, 'learning_rate': 1.5454545454545454e-05, 'epoch': 0.68}
{'loss': 0.9185, 'grad_norm': 6.259837627410889, 'learning_

  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.9830149412155151, 'eval_runtime': 29.8044, 'eval_samples_per_second': 8.891, 'eval_steps_per_second': 1.141, 'epoch': 1.0}
{'loss': 1.0235, 'grad_norm': 2.4713938236236572, 'learning_rate': 1.2929292929292931e-05, 'epoch': 1.06}
{'loss': 0.9829, 'grad_norm': 2.539842367172241, 'learning_rate': 1.2424242424242425e-05, 'epoch': 1.14}
{'loss': 0.9567, 'grad_norm': 3.476973533630371, 'learning_rate': 1.191919191919192e-05, 'epoch': 1.21}
{'loss': 1.068, 'grad_norm': 5.59661865234375, 'learning_rate': 1.1414141414141415e-05, 'epoch': 1.29}
{'loss': 1.0023, 'grad_norm': 3.5393593311309814, 'learning_rate': 1.0909090909090909e-05, 'epoch': 1.36}
{'loss': 0.9332, 'grad_norm': 5.222990036010742, 'learning_rate': 1.0404040404040405e-05, 'epoch': 1.44}
{'loss': 0.8812, 'grad_norm': 3.184427499771118, 'learning_rate': 9.8989898989899e-06, 'epoch': 1.52}
{'loss': 0.9933, 'grad_norm': 2.643828868865967, 'learning_rate': 9.393939393939396e-06, 'epoch': 1.59}
{'loss': 1.0486, 'grad_nor

  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.9684506058692932, 'eval_runtime': 31.0305, 'eval_samples_per_second': 8.54, 'eval_steps_per_second': 1.096, 'epoch': 2.0}
{'loss': 1.0659, 'grad_norm': 6.061112403869629, 'learning_rate': 6.363636363636364e-06, 'epoch': 2.05}
{'loss': 0.9926, 'grad_norm': 3.2398083209991455, 'learning_rate': 5.858585858585859e-06, 'epoch': 2.12}
{'loss': 0.9473, 'grad_norm': 3.6849136352539062, 'learning_rate': 5.353535353535354e-06, 'epoch': 2.2}
{'loss': 0.9115, 'grad_norm': 2.412933111190796, 'learning_rate': 4.848484848484849e-06, 'epoch': 2.27}
{'loss': 1.0707, 'grad_norm': 7.528298854827881, 'learning_rate': 4.343434343434344e-06, 'epoch': 2.35}
{'loss': 0.9221, 'grad_norm': 2.6718850135803223, 'learning_rate': 3.8383838383838385e-06, 'epoch': 2.42}
{'loss': 0.9357, 'grad_norm': 3.609076499938965, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}
{'loss': 1.0159, 'grad_norm': 6.547508239746094, 'learning_rate': 2.8282828282828286e-06, 'epoch': 2.58}
{'loss': 0.9081, 'grad_nor

  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.9636731743812561, 'eval_runtime': 30.6682, 'eval_samples_per_second': 8.641, 'eval_steps_per_second': 1.109, 'epoch': 3.0}
{'train_runtime': 1332.0663, 'train_samples_per_second': 2.378, 'train_steps_per_second': 0.297, 'train_loss': 0.9789928498894277, 'epoch': 3.0}


  0%|          | 0/34 [00:00<?, ?it/s]

RoBERTa model accuracy: 0.51
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        31
     neutral       0.51      1.00      0.67       134
    positive       0.00      0.00      0.00       100

    accuracy                           0.51       265
   macro avg       0.17      0.33      0.22       265
weighted avg       0.26      0.51      0.34       265



In [33]:
# Define the save path
save_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models"

# Save the trained model
roberta_model.save_pretrained(save_path)

# Save the tokenizer
tokenizer.save_pretrained(save_path)

# Save the LabelEncoder as a pickle object
joblib.dump(
    encoder,
    f"{save_path}/roberta_label_encoder.pkl",
)

['/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/roberta_label_encoder.pkl']

In [34]:
# Load the model from the file
nb_classifier = joblib.load(
    "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/custom_nb_model.pkl"
)

In [35]:
# Compare to SAME, UNALTERED custom Naive Bayes model from above applied to this test data subset.
# Extra testing!
# predict
y_pred = nb_classifier.predict(X_test)

# decode
custom_nb_pred = encoder.inverse_transform(y_pred)

y_test_decoded = encoder.inverse_transform(y_test)

accuracy = accuracy_score(y_test_decoded, custom_nb_pred)
print(f"Test set accuracy: {accuracy}")

# Detailed performance metrics
print("Classification Report:")
print(
    classification_report(
        y_test_decoded, custom_nb_pred, 
        target_names=encoder.classes_, 
        zero_division=0
    )
)

Test set accuracy: 0.43018867924528303
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        31
     neutral       0.47      0.67      0.55       134
    positive       0.33      0.24      0.28       100

    accuracy                           0.43       265
   macro avg       0.27      0.30      0.28       265
weighted avg       0.36      0.43      0.39       265



Compare, contrast and discuss these results