In [None]:
import os
import pickle
import random

import pandas as pd
import requests

# Load the pickled tweet table.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('world_cup_tweets.pkl', 'rb') as f:
    data = pickle.load(f)

# Number of tweets to score and the seed that makes the draw repeatable.
SAMPLE_SIZE = 300
SEED = 42

# `data` must expose a `Tweet_processed` column (presumably a DataFrame).
tweets = data.Tweet_processed.to_list()
# A dedicated seeded generator gives the same sample on every run without
# touching the global `random` state; capping at len(tweets) avoids the
# ValueError that random.sample raises when asked for more items than exist.
tweets = random.Random(SEED).sample(tweets, min(SAMPLE_SIZE, len(tweets)))

In [None]:
# Hugging Face Inference API configuration for the sentiment model.
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Read the access token from the HF_TOKEN environment variable so that a real
# credential never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback when the variable is unset.
hf_token = os.environ.get("HF_TOKEN", "YOUR OWN TOKEN")

# Endpoint and auth header shared by every request made through `analysis`.
API_URL = "https://api-inference.huggingface.co/models/" + model
headers = {"Authorization": "Bearer %s" % (hf_token)}

def analysis(data, timeout=120):
    """Send text to the hosted sentiment model and return its decoded reply.

    Parameters
    ----------
    data
        Value sent as the ``inputs`` field of the JSON payload.
    timeout : float, optional
        Seconds to wait for the server before giving up. The request asks the
        service to wait for the model (``wait_for_model=True``), so the
        default is deliberately generous.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    payload = dict(inputs=data, options=dict(wait_for_model=True))
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors here instead of returning an error payload that only
    # fails later, and less clearly, when the caller indexes into it.
    response.raise_for_status()
    return response.json()

In [None]:
# The following code performs topic-based sentiment analysis.
# Besides identifying the highest-scoring label, we also need to store
# every sentiment probability score for each tweet for later calculations.

import pandas as pd

# One record per successfully scored tweet: the text plus one column per
# sentiment label returned by the model.
tweets_analysis = []
# Tweets whose request or response parsing failed, kept for inspection/retry.
failed_tweets = []

for tweet in tweets:
    try:
        # The reply is indexed at 0 to get the list of {'label', 'score'}
        # entries for this single input.
        sentiment_result = analysis(tweet)[0]
        sentiment_probabilities = {label['label']: label['score'] for label in sentiment_result}
    except Exception as e:
        # Best effort: skip the tweet, but say which one failed and why.
        # A bare print(e) often shows only "0" (from KeyError: 0).
        failed_tweets.append(tweet)
        print("Skipping tweet %r: %s: %s" % (tweet, type(e).__name__, e))
    else:
        tweets_analysis.append({'tweet': tweet, **sentiment_probabilities})

if failed_tweets:
    print("%d of %d tweets could not be scored" % (len(failed_tweets), len(tweets)))

In [None]:
# Use a fixed column layout rather than the keys of the scoring loop's last
# `sentiment_probabilities` dict: that variable does not exist when every
# request failed, and its key order follows whichever tweet was scored last.
# These are the label names the later per-topic aggregation relies on.
sentiment_labels = ['negative', 'neutral', 'positive']
df = pd.DataFrame.from_records(tweets_analysis, columns=['tweet'] + sentiment_labels)
df = df.reset_index(drop=True)

# Persist the per-tweet scores so the slow API step need not be repeated.
df.to_pickle('world_cup_tweets_sentiment_score.pkl')
# Preview goes last: only a cell's final expression is displayed.
df.head()

In [None]:
from bertopic import BERTopic

# BERTopic documents its input as a list of strings, so pass a plain list
# instead of a pandas Series (whose index plays no role here).
texts = df['tweet'].tolist()

# calculate_probabilities=True makes fit_transform also return the
# document-topic probabilities (`probs`) alongside the topic assignments.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(texts)

In [None]:
# Per-document information reported by the fitted topic model for `texts`,
# wrapped in a DataFrame for the merge that follows.
document_info = topic_model.get_document_info(texts)
df2 = pd.DataFrame(document_info)
df2.head()

In [None]:
# Attach the topic information to the sentiment scores row by row.
# Joining on the tweet text is many-to-many whenever the same text occurs more
# than once (retweets, repeated slogans): each copy would match every other
# copy, multiplying rows and skewing the per-topic means. `df2` was produced
# from df['tweet'] in order, so a positional join is exact.
assert df['tweet'].tolist() == df2['Document'].tolist(), \
    "df and df2 are not row-aligned; re-run the topic-model cells"
df_merge = pd.concat(
    [df.reset_index(drop=True), df2.drop(columns='Document').reset_index(drop=True)],
    axis=1,
)
df_merge.head()

In [None]:
# Mean of each sentiment probability over the tweets assigned to a topic,
# with `Topic` kept as an ordinary column.
df_topic_sentiment = (
    df_merge
    .groupby('Topic', as_index=False)[['neutral', 'positive', 'negative']]
    .mean()
)
df_topic_sentiment

In [None]:
# Combine the model's per-topic info table with the mean sentiment scores,
# keeping only topics present in both frames.
freq = topic_model.get_topic_info()
df_freq = pd.DataFrame(freq)
df_new = df_freq.merge(df_topic_sentiment, on='Topic', how='inner')
df_new

In [None]:
# Record, for every topic row, the largest of its three mean sentiment scores.
score_cols = ['neutral', 'positive', 'negative']
df_new['highest_score'] = df_new.loc[:, score_cols].max(axis='columns')

# Map a row's scores to the name of its dominant sentiment.
def get_sentiment(row):
    """Return the sentiment label whose score equals ``row['highest_score']``.

    ``row`` must provide 'positive', 'negative' and 'highest_score' entries.
    'positive' is checked first, then 'negative'; anything else, including a
    row where neither matches, is labelled 'neutral'.
    """
    top_score = row['highest_score']
    for label in ('positive', 'negative'):
        if row[label] == top_score:
            return label
    return 'neutral'

# Label every topic row with its dominant sentiment and store it in a new column.
df_new['sentiment'] = df_new.apply(get_sentiment, axis='columns')

df_new