In [None]:
import pandas as pd

# Load the raw employee-review export
df = pd.read_csv('employee_reviews.csv', encoding='latin-1')

# Metadata and star-rating columns that are not part of the review text
columns_to_drop = [
    'company', 'location', 'dates', 'job-title', 'overall-ratings',
    'work-balance-stars', 'culture-values-stars', 'carrer-opportunities-stars',
    'comp-benefit-stars', 'senior-mangemnet-stars', 'helpful-count', 'link'
]

# Drop the unwanted columns (reassigned rather than mutated in place so that
# re-reading the cell shows exactly which frame each step produces)
df = df.drop(columns=columns_to_drop)

# Free-text fields that are merged into one document per review
text_columns = ['summary', 'pros', 'cons', 'advice-to-mgmt']

# Normalise the advice column: a missing value and the literal placeholder
# "none" (any casing, surrounding whitespace ignored) both become ''.
# The fill has to happen BEFORE the string conversion: str(NaN) is the text
# "nan", which a later fillna('') can no longer remove and which would then be
# appended to the combined review text.
advice = df['advice-to-mgmt'].fillna('').astype(str).str.strip()
df['advice-to-mgmt'] = advice.mask(advice.str.lower() == 'none', '')

# Combine text fields into one column. astype(str) guards against a text
# column that was parsed as numeric, which '. '.join would reject.
df['full_text'] = df[text_columns].fillna('').astype(str).agg('. '.join, axis=1)

# Drop the original text columns
df = df.drop(columns=text_columns)

# Save cleaned dataset
df.to_csv('cleaned_dataset.csv', index=False)

print("Dataset cleaned and saved as 'cleaned_dataset.csv'")

In [None]:
!pip install transformers datasets nltk scikit-learn -q

import pandas as pd
import numpy as np
import torch
import nltk
import re
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from nltk import pos_tag, word_tokenize

# Tokenizer and POS-tagger resources used by word_tokenize / pos_tag
for nltk_resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Load the sentiment-annotated reviews, discard rows that have no review text
# and renumber the remaining rows from zero.
df = (
    pd.read_csv("cleaned_dataset_with_sentiment.csv")
    .dropna(subset=["full_text"])
    .reset_index(drop=True)
)

def pos_chunk_text(text):
    """Return ``text`` with every token rewritten as ``word/TAG``.

    The text is split with ``word_tokenize``, tagged with ``pos_tag`` and the
    annotated tokens are joined back together with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    annotated = (f"{token}/{tag}" for token, tag in tagged_tokens)
    return " ".join(annotated)

# POS-annotate every review. NOTE(review): no later cell visible here reads
# 'pos_tagged' -- confirm it is still needed before paying for the tagging.
df['pos_tagged'] = df['full_text'].map(pos_chunk_text)

In [None]:
#Tokenization helper
def tokenize_dataset(df, tokenizer):
    """Convert ``df`` into a tokenized ``Dataset``.

    Only the ``full_text``, ``sentiment_score`` and ``label`` columns are
    carried over. ``full_text`` is encoded with ``tokenizer``, truncated and
    padded to a fixed length of 512 tokens.

    Args:
        df: DataFrame containing the three columns listed above.
        tokenizer: Callable tokenizer applied to batches of ``full_text``.

    Returns:
        The mapped ``Dataset`` with the tokenizer outputs added.
    """
    selected = df[['full_text', 'sentiment_score', 'label']]
    hf_dataset = Dataset.from_pandas(selected)

    def _encode_batch(batch):
        # Called with batched=True, so batch['full_text'] holds several texts
        return tokenizer(
            batch['full_text'],
            truncation=True,
            padding="max_length",
            max_length=512,
        )

    return hf_dataset.map(_encode_batch, batched=True)



In [None]:
#Load Models and Tokenizers
# Short display name -> checkpoint identifier passed to from_pretrained
model_names = {
    "roberta-base": "roberta-base",
    "twitter-roberta" : "cardiffnlp/twitter-roberta-base-sentiment"
}

models, tokenizers = {}, {}

# NOTE(review): "roberta-base" is presumably a base checkpoint without a
# fine-tuned classification head -- confirm; if so, its scores before
# fine-tuning are not meaningful.
for name, path in model_names.items():
    tokenizers[name] = AutoTokenizer.from_pretrained(path)
    classifier = AutoModelForSequenceClassification.from_pretrained(path, num_labels=3)
    models[name] = classifier.to(device)

def convert_score_to_class(score, negative_max=-0.33, neutral_max=0.33):
    """Bucket a continuous sentiment score into a class id.

    Args:
        score: Numeric sentiment score.
        negative_max: Scores less than or equal to this are Negative.
        neutral_max: Scores above ``negative_max`` and less than or equal to
            this are Neutral; anything larger is Positive.

    Returns:
        0 for Negative, 1 for Neutral, 2 for Positive.

    Raises:
        ValueError: If ``score`` is NaN. NaN compares False against both
            thresholds and would otherwise be reported as Positive.
    """
    # NaN is the only value that is not equal to itself
    if score != score:
        raise ValueError("sentiment score is NaN and cannot be mapped to a class")

    if score <= negative_max:
        return 0  # Negative
    if score <= neutral_max:
        return 1  # Neutral
    return 2  # Positive

from sklearn.model_selection import train_test_split

# Rows without a sentiment score cannot be assigned a meaningful class, so
# they are removed before labelling (a no-op when every row has a score).
df = df.dropna(subset=['sentiment_score']).reset_index(drop=True)
df['label'] = df['sentiment_score'].apply(convert_score_to_class)

# 80/20 split with a fixed seed so the held-out rows are reproducible
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [None]:
# Print the held-out split produced by train_test_split for a quick visual check
print(test_df)

In [None]:
#Train Function
def train_model(model_name, fine_tune=False):
    """Prepare the tokenized test split for ``model_name`` and optionally fine-tune.

    Args:
        model_name: Key into the module-level ``tokenizers`` and ``models``
            dictionaries.
        fine_tune: When True, fine-tune the model on ``train_df`` with a
            ``Trainer``; when False the model is returned as it currently is.

    Returns:
        Tuple ``(model, tokenizer, test_dataset)`` where ``test_dataset`` is
        the tokenized ``test_df`` exposing ``input_ids``, ``attention_mask``
        and ``label`` as torch tensors.

    Note:
        The object stored in ``models[model_name]`` is the one that gets
        trained, so after a fine-tuning call that dictionary entry refers to
        the fine-tuned model.
    """
    tokenizer = tokenizers[model_name]
    model = models[model_name]

    tensor_columns = ['input_ids', 'attention_mask', 'label']

    test_dataset = tokenize_dataset(test_df, tokenizer)
    test_dataset.set_format(type='torch', columns=tensor_columns)

    if fine_tune:
        # The training split is only consumed by the Trainer, so it is
        # tokenized here instead of on every (evaluation-only) call.
        train_dataset = tokenize_dataset(train_df, tokenizer)
        train_dataset.set_format(type='torch', columns=tensor_columns)

        training_args = TrainingArguments(
            output_dir=f'./results_{model_name}',
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir=f'./logs_{model_name}',
            logging_steps=10,
            load_best_model_at_end=True,
            report_to="none",
            # Mixed precision is only requested when CUDA is present so the
            # function also works on the CPU fallback chosen for `device`.
            fp16=torch.cuda.is_available()
        )

        # NOTE(review): the held-out test split doubles as the per-epoch
        # evaluation set that drives best-checkpoint selection; a separate
        # validation split would keep the final test scores unbiased.
        # NOTE(review): newer transformers releases may expect
        # `processing_class=` instead of `tokenizer=` -- confirm against the
        # installed version.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
        )

        trainer.train()

    return model, tokenizer, test_dataset


In [None]:
#Evaluation Function
def evaluate(model, tokenizer, dataset):
    """Run ``model`` over ``dataset`` one example at a time and print metrics.

    Args:
        model: Classification model; it is switched to eval mode, called with
            ``input_ids`` and ``attention_mask`` and must return an object
            with a ``logits`` attribute.
        tokenizer: Not used by this function; kept so existing calls still work.
        dataset: Iterable of items providing ``input_ids``, ``attention_mask``
            and ``label``. Assumes the first two are 1-D torch tensors (each
            is unsqueezed into a batch of one) -- TODO confirm.

    Returns:
        Tuple ``(labels, preds)`` of plain-int lists in dataset order.
    """
    model.eval()
    class_names = ["negative", "neutral", "positive"]
    # Send inputs to wherever the model actually lives rather than relying on
    # a module-level device variable staying in sync with it.
    model_device = next(model.parameters()).device

    preds, labels = [], []
    for item in dataset:
        input_ids = item["input_ids"].unsqueeze(0).to(model_device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(model_device)
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        preds.append(torch.argmax(output.logits, dim=1).cpu().item())
        # Stored as a plain int so downstream code does not receive 0-d tensors
        labels.append(int(item["label"]))

    # Passing the label ids explicitly keeps the report valid even when one of
    # the three classes never occurs in `labels` or `preds`.
    print(classification_report(
        labels,
        preds,
        labels=list(range(len(class_names))),
        target_names=class_names,
    ))
    print("Accuracy:", accuracy_score(labels, preds))
    print("F1 Score:", f1_score(labels, preds, average='weighted'))
    return labels, preds


In [None]:
#Error Analysis
def error_analysis(df, labels, preds, only_errors=False):
    """Build a per-example table of predictions next to the true labels.

    Args:
        df: DataFrame with ``full_text`` and ``sentiment_score`` columns whose
            row order matches ``labels`` / ``preds`` positionally.
        labels: True class ids (ints or 0-d tensors; converted with ``int``).
        preds: Predicted class ids, same order as ``labels``.
        only_errors: When True, keep only rows where the prediction differs
            from the true label. Defaults to False, i.e. every row is kept.

    Returns:
        DataFrame with columns ``Text``, ``Predicted``, ``True Label`` and
        ``Sentiment Score``. The columns are present even when no row is
        emitted.
    """
    columns = ['Text', 'Predicted', 'True Label', 'Sentiment Score']

    # Read from the frame that was passed in (not a module-level global) and
    # pull each column out once instead of materialising every row.
    texts = df['full_text'].tolist()
    scores = df['sentiment_score'].tolist()

    rows = []
    for pos, (true_label, predicted) in enumerate(zip(labels, preds)):
        true_label = int(true_label)
        predicted = int(predicted)
        if only_errors and true_label == predicted:
            continue
        rows.append({
            'Text': texts[pos],
            'Predicted': predicted,
            'True Label': true_label,
            'Sentiment Score': scores[pos],
        })
    return pd.DataFrame(rows, columns=columns)

In [None]:
#Train & Compare Models
all_results = {}
for model_name in model_names:
    # Every model goes through the same two stages: score the weights as they
    # currently are, then fine-tune and score again.
    stages = (
        ("pretrained", False, f"\n🚀 Evaluating: {model_name} (pretrained only)"),
        ("finetuned", True, f"\n🎯 Fine-tuning: {model_name}"),
    )
    for stage, fine_tune, banner in stages:
        print(banner)
        model, tokenizer, test_data = train_model(model_name, fine_tune=fine_tune)
        labels, preds = evaluate(model, tokenizer, test_data)
        all_results[f'{model_name}_{stage}'] = (labels, preds)

        # Write the per-example predictions of this stage to its own CSV
        error_df = error_analysis(test_df.reset_index(), labels, preds)
        error_df['True Label'] = error_df['True Label'].astype(int)
        error_df.to_csv(f'error_analysis_{model_name}_{stage}.csv', index=False)


In [None]:
# manual testing
# Alternative source, kept for reference:
#checkpoint_path = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE(review): this path differs from the './results_<model>' output_dir used
# during training -- presumably the checkpoint was copied there; verify.
checkpoint_path = "./drive/MyDrive/results_twitter-roberta/checkpoint-20259"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model = model.to(device)

def predict_sentiment(text, model, tokenizer):
    """Classify one text and print the class probabilities, highest first.

    Args:
        text: The text to classify. The result is reduced with ``.item()``,
            so exactly one sequence is expected.
        model: Classification model returning an object with ``logits``;
            it is put into eval mode before inference.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)`` and ``**`` unpacking.

    Returns:
        Tuple ``(sentiment, sentiment_score)``: the name of the top class
        ("negative", "neutral" or "positive") and a numpy array with the
        softmax probability of each class in that order.
    """
    sentiment_labels = ["negative", "neutral", "positive"]

    # Inference mode, consistent with evaluate(); avoids dropout noise when a
    # model that was just trained is passed in.
    model.eval()
    # Follow the model's own device instead of a module-level variable so the
    # inputs always end up where the weights are.
    model_device = next(model.parameters()).device

    inputs = tokenizer(
        text, padding=True, truncation=True, return_tensors="pt", max_length=512
    ).to(model_device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    sentiment = sentiment_labels[predicted_class]

    sentiment_score = torch.softmax(logits, dim=1).squeeze().cpu().numpy()

    # Class indices sorted by descending probability
    ranking = np.argsort(sentiment_score)[::-1]
    for rank, class_idx in enumerate(ranking):
        label = sentiment_labels[class_idx]
        prob = sentiment_score[class_idx]
        print(f"{rank + 1}) {label} {np.round(float(prob), 4)}")

    return sentiment, sentiment_score

# Quick manual check of predict_sentiment on a hand-written sentence
custom_text = "Leaving whilst its dark is fun. #not #sucks"
sentiment, sentiment_score = predict_sentiment(
    text=custom_text, model=model, tokenizer=tokenizer
)

print(f"Sentiment: {sentiment}")
print(f"Sentiment Scores: {sentiment_score}")