In [5]:
# libraries
import pandas as pd
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizers import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np

import sys
sys.path.append("../scripts")
import functions as f

In [6]:
# load full processed dataset
pickle_folder = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle"
pickle_path = os.path.join(pickle_folder, "reddit.pkl")
# NOTE: joblib.load unpickles the file, so only point this at trusted data.
reddit = joblib.load(pickle_path)

# Every later cell reads and writes `data`, so bind it here; otherwise the
# notebook fails with a NameError on a fresh kernel. A copy is used so the
# label columns added later do not modify the loaded `reddit` frame.
data = reddit.copy()

In [7]:
# Initialize tools
# Hugging Face `tokenizers` tokenizer, loaded by the name "bert-base-uncased"
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
# spaCy's English stop words, materialized as a set for membership tests
stop_words = set(STOP_WORDS)
# NLTK WordNet lemmatizer, used to lemmatize each kept token
lemmatizer = WordNetLemmatizer()

# Preprocessing helper built on the Hugging Face tokenizer
def token_and_lemmatize_nb(text):
    """Lower-case, tokenize, filter and lemmatize `text`, returning one string.

    Relies on the module-level `tokenizer`, `stop_words` and `lemmatizer`.

    Args:
        text: The raw string to process (must support `.lower()`).

    Returns:
        The lemmatized tokens that are purely alphabetic and not stop words,
        joined by single spaces. Empty string if no token survives.
    """
    encoding = tokenizer.encode(text.lower())

    kept = []
    for token in encoding.tokens:
        # Drop anything that is not purely alphabetic, and drop stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [8]:
# features: run the preprocessing helper over every raw text and keep the
# result on the frame as `text_processed`
processed_text = data["text"].apply(token_and_lemmatize_nb)
data["text_processed"] = processed_text

X = data["text_processed"]

KeyboardInterrupt: 

In [8]:
# vectorize
# NOTE(review): this fits a brand-new TF-IDF vocabulary on the data being
# labelled, while the Naive Bayes classifier applied to `X` is loaded from disk
# already trained. Presumably the classifier expects the feature columns of the
# vectorizer it was trained with; if so, that fitted vectorizer should be
# loaded and `.transform(...)` used here instead. TODO confirm.
vectorizer = TfidfVectorizer(max_features=1000)
# Read from the column instead of `X = f(X)` so the cell can be re-run safely:
# the original fed the sparse matrix back into the vectorizer on a second run.
X = vectorizer.fit_transform(data["text_processed"])

In [9]:
# Restore the previously saved Naive Bayes classifier from disk
nb_model_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/custom_nb_model.pkl"
nb_classifier = joblib.load(nb_model_path)

In [12]:
# Classify the TF-IDF matrix `X` with the Naive Bayes model, then map the
# encoded predictions back to their original labels
nb_predictions = nb_classifier.predict(X)

# Rebuild the label encoder from the class array stored next to the model
nb_classes = np.load(
    "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/nb_label_encoder.pkl",
    allow_pickle=True,
)
encoder = LabelEncoder()
encoder.classes_ = nb_classes

data["label_nb"] = encoder.inverse_transform(nb_predictions)

In [14]:
# Persist the Naive Bayes-labelled frame, first as CSV and then as a pickle
nb_csv_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/master/reddit_labelled_nb.csv"
data.to_csv(nb_csv_path, index=False)

pickle_path = os.path.join(pickle_folder, "reddit_labelled_nb.pkl")
joblib.dump(data, pickle_path)


In [None]:
# Load the label classes stored in roberta_label_encoder.pkl into a LabelEncoder.
# A fresh instance is created here so this cell no longer depends on the
# `encoder` object built in the Naive Bayes cell having been run first.
# NOTE: allow_pickle=True unpickles the file, so only load trusted artifacts.
encoder = LabelEncoder()
encoder.classes_ = np.load(
    "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models/roberta_label_encoder.pkl",
    allow_pickle=True,
)

# Tokenize and predict function
# Loaded (model, tokenizer) pairs keyed by directory, so that repeated calls
# (e.g. row by row through `Series.apply`) do not reload them from disk.
_ROBERTA_CACHE = {}


def _get_roberta(save_path):
    """Return the `(model, tokenizer)` pair stored at `save_path`, loading it once."""
    if save_path not in _ROBERTA_CACHE:
        model = RobertaForSequenceClassification.from_pretrained(save_path)
        model.eval()  # switch the model to evaluation mode before inference
        _ROBERTA_CACHE[save_path] = (
            model,
            RobertaTokenizer.from_pretrained(save_path),
        )
    return _ROBERTA_CACHE[save_path]


def predict_with_roberta(text):
    """Classify one text with the saved RoBERTa model and return its decoded label.

    The model and tokenizer are loaded from the models directory on the first
    call and reused afterwards. The prediction is decoded with the module-level
    `encoder`, which must already hold the matching classes.

    Args:
        text: The string to classify. Inputs are truncated to 512 tokens.

    Returns:
        The label produced by `encoder.inverse_transform` for the class with
        the highest logit.
    """
    save_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/models"
    roBERTa_model, roberta_tokenizer = _get_roberta(save_path)

    # Tokenize the input text
    inputs = roberta_tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    )

    # Pass through the model without tracking gradients
    with torch.no_grad():
        logits = roBERTa_model(**inputs).logits
        predicted_label = torch.argmax(logits, dim=1).item()  # index of top class

    # Decode the numerical prediction to the original label
    return encoder.inverse_transform([predicted_label])[0]

In [8]:
# Apply the prediction function to the raw "text" column, one row at a time,
# and store the returned labels in "label_rob".
# NOTE(review): this calls `predict_with_roberta` from the imported `functions`
# module (`f`), not the function of the same name defined in this notebook —
# confirm which of the two is intended.
data["label_rob"] = data["text"].apply(f.predict_with_roberta)


In [9]:
# save again: write the frame with both label columns as CSV and as a pickle
labelled_csv_path = "/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/master/reddit_labelled.csv"
data.to_csv(labelled_csv_path, index=False)

pickle_path = os.path.join(pickle_folder, "reddit_labelled.pkl")
joblib.dump(data, pickle_path)

['/Users/seshat/Documents/GitHub/labor_sentiment_analysis/data/pickle/reddit_labelled.pkl']

Compare the Naive Bayes (`label_nb`) and RoBERTa (`label_rob`) labels