In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
import numpy as np
import pytz

# Read the CSV file
df = pd.read_csv('2musk_tweets.csv', encoding='utf-8')

# Append the row entries to an array
data_array = df.values.tolist()

# Define the processing function
def process_tweet(tweet):
    if isinstance(tweet, float):
        return ""
    tweet_words = []
    for word in tweet.split(' '):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        elif word.startswith('http'):
            word = "http"
        tweet_words.append(word)
    tweet_processed = " ".join(tweet_words)
    return tweet_processed

# Apply the processing function to the 'Tweet' column and store the results in a new column
df['Tweet'] = df['Tweet'].apply(process_tweet)

# Formatting
current_timezone = pytz.timezone('UTC')
df['Time (US/Pacific)'] = df['Time (+0)'].dt.tz_localize(current_timezone).dt.tz_convert('US/Pacific')
df = df.drop(columns=['Time (+0)'])
df = df.rename(columns={'Time (US/Pacific)': 'Time'})

print(df)
df.to_csv('3processed_tweets.csv', index=False, encoding='utf-8')

AttributeError: Can only use .dt accessor with datetimelike values

In [3]:
# load model and tokenizer
roberta = "cardiffnlp/roberta-base-tweet-sentiment-en" # https://huggingface.co/cardiffnlp/roberta-base-tweet-sentiment-en/tree/main
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)
labels = ['Negative', 'Neutral', 'Positive']

df['Sentiment'] = ''
df['Negative Score'] = ''
df['Neutral Score'] = ''
df['Positive Score'] = ''

# Perform sentiment analysis
for index, row in df.iterrows():
    tweet = row['Tweet']
    encoded_tweet = tokenizer(tweet, return_tensors='pt')
    output = model(**encoded_tweet)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    df.at[index, 'Sentiment'] = labels[scores.argmax()]
    df.at[index, 'Negative Score'] = scores[0]
    df.at[index, 'Neutral Score'] = scores[1]
    df.at[index, 'Positive Score'] = scores[2]

# # Print sentiment labels and scores
# for i in range(len(scores)):
#     label = labels[i]
#     score = scores[i]
#     print("Tweet:", tweet)
#     print("Sentiment:", label)
#     print("Score:", score)
#     print()

print(df)
df.to_csv('4sentiment.csv', index=False, encoding='utf-8')

               Time (+0)                                              Tweet   
0    2023-05-11 10:11:16                    @user @user @user Probably true  \
1    2023-05-11 10:08:59          @user Major update coming in a few months   
2    2023-05-11 09:32:11  RT @user Starship is the largest, most capable...   
3    2023-05-11 09:24:48  @user Then he added “Silicon is for n00bs, Gal...   
4    2023-05-11 09:17:47                   @user @user @user @user @user 😂💯   
..                   ...                                                ...   
345  2023-05-04 18:04:34                       @user Thanks for the 3 Doge!   
346  2023-05-04 18:01:57  @user True, electronics all want DC, not AC. S...   
347  2023-05-04 17:58:59                   @user Welcome back to Twitter! 💕   
348  2023-05-04 17:56:08                    May the 4th be with you ❤️ http   
349  2023-05-04 17:48:32  @user @user Yes, but they are not to blame for...   

    Sentiment Negative Score Neutral Score Positive