In [1]:
from transformers import pipeline, AutoTokenizer
import pandas as pd
from tqdm import tqdm # For progress bar

# The tokenizer and the classification pipeline are both built from the same
# Hugging Face checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment"
)
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
)

def analyze_sentiment(dataframe, max_length=512):
    """Classify every comment in ``dataframe`` and attach the predictions.

    Each value of the ``comment_body`` column is passed to the module-level
    ``sentiment_pipeline``. The top prediction's label and score are stored in
    two columns, ``sentiment`` and ``score``.

    Args:
        dataframe: pandas DataFrame with a ``comment_body`` column of text.
        max_length: Maximum number of tokens kept per comment; longer
            comments are truncated by the pipeline's own tokenizer.

    Returns:
        The same ``dataframe`` object that was passed in. It is modified in
        place: the ``sentiment`` and ``score`` columns are added or
        overwritten.
    """
    sentiments = []
    scores = []
    for comment in tqdm(dataframe['comment_body'], desc="Analyzing Sentiments"):
        if not isinstance(comment, str):
            # Non-text cells (e.g. the NaN pandas uses for an empty CSV field)
            # cannot be tokenized; record them as missing instead of aborting
            # a long run part-way through.
            sentiments.append(None)
            scores.append(float('nan'))
            continue
        # Hand the untouched comment to the pipeline and let its tokenizer
        # truncate. Tokenizing, decoding back to text and re-tokenizing does
        # not reliably reproduce the same tokens, and would leave the second
        # tokenization without any length limit.
        result = sentiment_pipeline(comment, truncation=True, max_length=max_length)
        top_prediction = result[0]
        sentiments.append(top_prediction['label'])
        scores.append(top_prediction['score'])
    # Add the sentiment analysis results and scores to the dataframe
    dataframe['sentiment'] = sentiments
    dataframe['score'] = scores
    return dataframe

# Read the three per-team comment exports (paths are relative to the
# notebook's working directory).
rdf, cdf, mdf = [
    pd.read_csv(source_path)
    for source_path in (
        'rangers_comments_sentiment_analysis',
        'cowboys_comments_sentiment_analysis',
        'mavericks_comments_sentiment_analysis',
    )
]

# Score each frame in turn. analyze_sentiment returns the frame it was given,
# so every *_df name refers to the same object as its source frame.
rangers_df, cowboys_df, mavericks_df = [
    analyze_sentiment(team_frame) for team_frame in (rdf, cdf, mdf)
]

  _torch_pytree._register_pytree_node(





  _torch_pytree._register_pytree_node(


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Analyzing Sentiments: 100%|██████████| 9245/9245 [28:41<00:00,  5.37it/s]  
Analyzing Sentiments: 100%|██████████| 29818/29818 [1:35:37<00:00,  5.20it/s]  
Analyzing Sentiments: 100%|██████████| 21875/21875 [1:10:02<00:00,  5.21it/s]


In [2]:
# Spot-check the first rows of the scored Rangers frame.
rangers_df.head(n=5)

Unnamed: 0,comment_body,sentiment,score
0,14 1/2 minutes of pure sex,LABEL_1,0.763947
1,14 minutes…I’m not going to wa- OH I REMEMBER ...,LABEL_1,0.513736
2,Those Garcia HRs hit harder today,LABEL_1,0.758604
3,Crazy he is just 35. Older yes but to think he...,LABEL_1,0.549139
4,"Man, it was fun watching this guy just throw c...",LABEL_2,0.911701


In [10]:
# Translate the model's generic class ids into readable sentiment names.
LABEL_NAMES = {'LABEL_0': 'Negative', 'LABEL_1': 'Neutral', 'LABEL_2': 'Positive'}

# Relabel every frame from its OWN 'sentiment' column. Assigning one frame's
# column to another would align on the row index and silently attach the wrong
# team's predictions (and NaN where the indexes do not overlap).
for scored_df in (rangers_df, cowboys_df, mavericks_df):
    scored_df['sentiment'] = scored_df['sentiment'].replace(LABEL_NAMES)

rangers_df.head()

Unnamed: 0,comment_body,sentiment,score
0,14 1/2 minutes of pure sex,Neutral,0.763947
1,14 minutes…I’m not going to wa- OH I REMEMBER ...,Neutral,0.513736
2,Those Garcia HRs hit harder today,Neutral,0.758604
3,Crazy he is just 35. Older yes but to think he...,Neutral,0.549139
4,"Man, it was fun watching this guy just throw c...",Positive,0.911701


In [12]:
# Write each team's labelled frame to its own CSV; index=False keeps the
# DataFrame row index out of the files.
for export_frame, export_path in (
    (rangers_df, 'bert_rangers_scores.csv'),
    (cowboys_df, 'bert_cowboys_scores.csv'),
    (mavericks_df, 'bert_mavericks_scores.csv'),
):
    export_frame.to_csv(export_path, index=False)