In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces and each token is inspected:

    * a token that starts with ``@`` and is longer than one character
      becomes ``@user``;
    * a token that starts with ``http`` becomes ``http``;
    * every other token is kept unchanged.

    The tokens are then joined back with single spaces, so the spacing of
    the input is preserved.
    """
    def _placeholder(token):
        # A lone "@" is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))
# Hugging Face checkpoint used for the tokenizer, the config and the model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PyTorch model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

# Score one example sentence.
text = preprocess(
    "Even though ChatGPT can be very useful, however it could also lead to unprecedented problems!"
)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# Take the first row of the first model output, convert it to NumPy and
# normalise it with softmax so the scores sum to 1.
scores = softmax(output[0][0].detach().numpy())

# Class indices ordered from highest to lowest score.
ranking = np.argsort(scores)[::-1]
for position, label_id in enumerate(ranking, start=1):
    label = config.id2label[label_id]
    print(f"{position}) {label} {np.round(float(scores[label_id]), 4)}")


Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.7236
2) neutral 0.2287
3) positive 0.0477


In [20]:
# Getting the data
import pandas as pd
import torch

# How many tweets (taken from the top of the file) are scored.
N_TWEETS = 300

dataset = pd.read_csv('tweets.csv')
tweets = dataset['text']
predictions = []

# This is inference only: disabling autograd stops PyTorch from building a
# gradient graph on every forward pass, which saves memory and time without
# changing the scores.
with torch.no_grad():
    # .iloc makes it explicit that the first N_TWEETS rows are selected by position.
    for tweet in tweets.iloc[:N_TWEETS]:
        text = preprocess(tweet)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        # First row of the first model output, normalised with softmax.
        scores = softmax(output[0][0].detach().numpy())
        predictions.append(scores)

In [36]:
import torch
import numpy as np
from collections import Counter

# Numerical class labels:
# 0 -> negative
# 1 -> neutral
# 2 -> positive

# Index of the highest-scoring class for each scored tweet.
max_predictions = np.argmax(predictions, axis=1)

# Get the count of each element in the array.
# .tolist() turns the NumPy integers into plain Python ints, so the Counter
# keys print as `2` rather than `np.int64(2)` on NumPy >= 2 and behave like
# ordinary dictionary keys downstream.
counts = Counter(max_predictions.tolist())

# Print the counts
print(counts)

# Percentages #
# Get the total number of elements in the array
total = len(max_predictions)

# Share of each class, keyed by the numerical label. Note that the values are
# fractions of the total (between 0 and 1) rounded to two decimals, not
# values on a 0-100 scale.
percentages = {key: round(value / total, 2) for key, value in counts.items()}

Counter({2: 137, 1: 129, 0: 34})


In [37]:
# Data exporting to csv

import csv

# Human-readable names for the numerical class labels.
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# newline="" leaves line-ending handling to the csv module.
with open("results.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)

    # Header row
    writer.writerow(["Element", "Count", "Percentage"])

    # One row per class present in `counts`: label name, count and the
    # matching entry from `percentages`. Rows are produced lazily, in the
    # iteration order of `counts`.
    writer.writerows(
        (label_map[label_id], n_tweets, percentages[label_id])
        for label_id, n_tweets in counts.items()
    )
