In [1]:
!pip install transformers torch numpy pandas scikit-learn



# Model loading and preparation

In [2]:
# Let's define a function for cleaning the text inputs
import re

# Define a function for preprocessing the texts
def preprocess_text(text):
    """Clean a raw tweet before it is handed to the tokenizer.

    Steps, in order: strip URLs, strip ``@user`` mentions and bare ``#``
    characters (the hashtag word itself is kept), then lowercase.

    Args:
        text: The raw text. ``None`` and float ``NaN`` (how pandas represents
            an empty cell) are treated as empty text.

    Returns:
        The cleaned, lowercased string; ``""`` when the input is missing.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove URLs (the http branch also covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'@\w+|#', '', text)
    # Lowercasing
    return text.lower()

In [3]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub identifier of the pretrained checkpoint used throughout the notebook
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# The tokenizer and the classifier are both loaded from that same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [4]:
# Class names in the order of the model's output logits:
# index 0 is negative, 1 is neutral, 2 is positive.
labels = ["negative", "neutral", "positive"]

# RoBERTa-base position embeddings cover 512 tokens per sequence. Longer
# inputs are truncated to this length instead of crashing the forward pass.
MAX_INPUT_TOKENS = 512


def predict_sentiment(text):
  """Classify the sentiment of a single piece of text.

  The text is cleaned with ``preprocess_text``, tokenized (truncated to
  ``MAX_INPUT_TOKENS``), run through the model, and the class with the
  highest softmax probability is returned.

  Args:
    text: The raw text to classify.

  Returns:
    A ``(label, confidence)`` tuple: ``label`` is one of ``labels`` and
    ``confidence`` is the winning class probability as a percentage
    rounded to two decimals.
  """
  # Preprocess the text and encode it
  cleaned_text = preprocess_text(text)
  encoded_input = tokenizer(
      cleaned_text,
      return_tensors='pt',
      truncation=True,
      max_length=MAX_INPUT_TOKENS,
  )

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    output = model(**encoded_input)
    probabilities = F.softmax(output.logits, dim=1)

  # A single text was encoded, so the batch has exactly one row
  scores = probabilities[0].numpy()
  index = int(np.argmax(scores))
  confidence = np.round(scores[index] * 100, 2)

  # Return the outputs
  return labels[index], confidence


In [5]:
# Sanity check: a bare, ambiguous word
predict_sentiment("Okay")

('neutral', 50.67)

In [6]:
# Sanity check: a negated first-person statement
predict_sentiment("I'm not okay")

('negative', 92.94)

In [7]:
# Sanity check: the same statement without the negation
predict_sentiment("I'm okay")

('positive', 80.45)

# Dataset prediction results

In [8]:
import pandas as pd

In [9]:
# Load the labelled test cases (read_csv already returns a DataFrame)
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a previously
# saved row index - confirm before dropping it or switching to index_col=0.
df = pd.read_csv("sentiment_test_cases.csv")

In [10]:
# Preview the first rows (head() returns five by default)
df.head()

Unnamed: 0,expected_sentiment,text
0,positive,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,positive,Reading my kindle2... Love it... Lee childs i...
2,positive,"Ok, first assesment of the #kindle2 ...it fuck..."
3,positive,@kenburbary You'll love your Kindle2. I've had...
4,positive,@mikefish Fair enough. But i have the Kindle2...


In [11]:
# Run the model on every text. Each call returns a (label, confidence) tuple,
# and zip(*...) splits those tuples into one sequence per new column.
df['predicted_sentiment'], df['confidence'] = zip(
    *df['text'].apply(predict_sentiment)
)

In [12]:
# Check the predictions (pandas abbreviates the display of a long frame)
df

Unnamed: 0,expected_sentiment,text,predicted_sentiment,confidence
0,positive,@stellargirl I loooooooovvvvvveee my Kindle2. ...,positive,98.90
1,positive,Reading my kindle2... Love it... Lee childs i...,positive,98.92
2,positive,"Ok, first assesment of the #kindle2 ...it fuck...",positive,82.86
3,positive,@kenburbary You'll love your Kindle2. I've had...,positive,98.93
4,positive,@mikefish Fair enough. But i have the Kindle2...,positive,98.00
...,...,...,...,...
493,neutral,Ask Programming: LaTeX or InDesign?: submitted...,neutral,89.77
494,negative,"On that note, I hate Word. I hate Pages. I hat...",negative,97.63
495,positive,Ahhh... back in a *real* text editing environm...,positive,93.90
496,negative,"Trouble in Iran, I see. Hmm. Iran. Iran so far...",negative,50.15


In [13]:
# Share of rows whose predicted label equals the expected label
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(
    y_true=df['expected_sentiment'],
    y_pred=df['predicted_sentiment'],
)
print("Accuracy:", accuracy)

Accuracy: 0.8373493975903614


In [14]:
# Collect the rows where the predicted label differs from the expected one,
# so the mistakes can be studied for future improvements
mismatches = df.loc[df['expected_sentiment'].ne(df['predicted_sentiment'])]

# Show the misclassified rows
mismatches


Unnamed: 0,expected_sentiment,text,predicted_sentiment,confidence
9,positive,how can you not love Obama? he makes jokes abo...,negative,42.18
15,positive,"#lebron best athlete of our generation, if not...",neutral,48.38
18,negative,"@ludajuice Lebron is a Beast, but I'm still ch...",positive,89.16
34,negative,US planning to resume the military tribunals a...,neutral,76.14
37,negative,@sekseemess no. I'm not itchy for now. Maybe l...,neutral,48.48
...,...,...,...,...
466,neutral,Missed this insight-filled May column: One sma...,positive,70.58
480,negative,Fighting with LaTex. Again...,neutral,56.63
481,negative,@Iheartseverus we love you too and don't want ...,neutral,37.94
485,negative,Monday already. Iran may implode. Kitchen is a...,positive,42.55


In [15]:
# Lastly, save the data frame, including the prediction columns added above;
# index=False keeps the row index out of the written file
df.to_csv("accomplished_sentiments.csv", index=False)