In [1]:
import os
import pickle
import random

import pandas as pd
import requests

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('world_cup_tweets.pkl', 'rb') as f:
    data = pickle.load(f)

tweets = data.Tweet_processed.to_list()

# Sample with a dedicated, seeded generator so a fresh kernel run selects the
# same tweets every time, and cap the sample size so an input with fewer than
# 300 tweets does not make random.sample raise ValueError.
tweets = random.Random(42).sample(tweets, min(300, len(tweets)))

In [2]:
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Take the token from the HF_TOKEN environment variable when it is set so a
# real credential never has to be written into the notebook; the original
# placeholder string remains the fallback value.
hf_token = os.environ.get("HF_TOKEN", "YOUR OWN TOKEN")

# Endpoint and authorization header used by every request to the hosted model.
API_URL = "https://api-inference.huggingface.co/models/" + model
headers = {"Authorization": f"Bearer {hf_token}"}

def analysis(data, timeout=120):
    """POST one input to the hosted model and return the decoded JSON reply.

    Args:
        data: Value sent as the ``inputs`` field of the request payload.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The JSON-decoded body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with an HTTP error status.
    """
    payload = {"inputs": data, "options": {"wait_for_model": True}}
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error body back to the
    # caller as if it were a prediction.
    response.raise_for_status()
    return response.json()

In [None]:
import pandas as pd

# One record per successfully scored tweet: the text plus one column per label.
tweets_analysis = []
# Tweets whose request or parsing failed, kept so the failures can be inspected.
failed_tweets = []

for tweet in tweets:
    try:
        # The response is indexed with [0] to reach the list of label/score
        # dicts for the single input that was sent.
        sentiment_result = analysis(tweet)[0]
        sentiment_probabilities = {label['label']: label['score'] for label in sentiment_result}
        tweets_analysis.append({'tweet': tweet, **sentiment_probabilities})
    except Exception as e:
        # Best effort: keep going, but say which tweet failed and show the
        # exception type (a bare KeyError would otherwise print as just "0").
        failed_tweets.append(tweet)
        print(f"Sentiment analysis failed for {tweet!r}: {e!r}")

print(f"Scored {len(tweets_analysis)} of {len(tweets)} tweets; {len(failed_tweets)} failed.")

In [None]:
# Derive the label columns from the collected records themselves rather than
# from a variable left over from the last loop iteration, so this cell also
# works when the final request failed or when no tweet was scored at all.
label_columns = sorted(
    {key for record in tweets_analysis for key in record if key != 'tweet'}
)
df = pd.DataFrame.from_records(tweets_analysis, columns=['tweet'] + label_columns)

# Persist the scores, then end the cell with the preview so it is displayed.
df.to_pickle('world_cup_tweets_sentiment_score.pkl')
df.head()

In [2]:
# Reload the sentiment scores from the pickle file written by the scoring step.
df = pd.read_pickle('world_cup_tweets_sentiment_score.pkl')

In [3]:
df.head()

Unnamed: 0,tweet,neutral,positive,negative
0,Im soo happy the world cup is finally over. Geez,0.01302,0.981011,0.005969
1,Messi and @BYJUs in the World Cup finals. Its ...,0.171005,0.825542,0.003453
2,Swanson: Argentinas World Cup win sparks a par...,0.562505,0.431108,0.006387
3,MacArthur World Cup Fun Felicidades Argentina...,0.019768,0.978542,0.001691
4,Aftermath of the World Cup Final. Congratulati...,0.015951,0.98169,0.00236


In [10]:
from bertopic import BERTopic

# BERTopic documents its input as a list of strings, so pass a plain list
# instead of a pandas Series (whose index could otherwise come into play).
texts = df['tweet'].tolist()

# NOTE(review): no random seed is fixed for the topic model, so topic ids and
# sizes may differ between runs - confirm whether that matters downstream.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(texts)

Batches: 100%|██████████| 9/9 [00:04<00:00,  2.02it/s]
2023-04-06 22:21:27,218 - BERTopic - Transformed documents to Embeddings
2023-04-06 22:21:29,804 - BERTopic - Reduced dimensionality
2023-04-06 22:21:29,839 - BERTopic - Clustered reduced embeddings


In [11]:
# Per-document topic information from the fitted model for the entries in `texts`.
df2 = pd.DataFrame(topic_model.get_document_info(texts))
df2.head()

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,Im soo happy the world cup is finally over. Geez,0,0_cup_world_the_is,cup - world - the - is - over - to - of - was ...,1.0,True
1,Messi and @BYJUs in the World Cup finals. Its ...,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,0.218229,False
2,Swanson: Argentinas World Cup win sparks a par...,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,1.0,False
3,MacArthur World Cup Fun Felicidades Argentina...,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,0.296636,False
4,Aftermath of the World Cup Final. Congratulati...,-1,-1_the_world_cup_to,the - world - cup - to - of - and - for - mess...,0.096605,False


In [12]:
# `df2` was produced from the tweets of `df` in the same order, so the two
# frames line up row by row. Joining by position instead of by tweet text
# avoids the row multiplication an inner merge causes when the same text
# occurs more than once (k identical tweets would turn into k*k rows and
# distort the per-topic averages computed later).
assert len(df) == len(df2), "expected one topic-info row per scored tweet"
df_merge = pd.concat(
    [
        df.reset_index(drop=True),
        df2.drop(columns='Document').reset_index(drop=True),
    ],
    axis=1,
)
df_merge.head()

Unnamed: 0,tweet,neutral,positive,negative,Topic,Name,Top_n_words,Probability,Representative_document
0,Im soo happy the world cup is finally over. Geez,0.01302,0.981011,0.005969,0,0_cup_world_the_is,cup - world - the - is - over - to - of - was ...,1.0,True
1,Messi and @BYJUs in the World Cup finals. Its ...,0.171005,0.825542,0.003453,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,0.218229,False
2,Swanson: Argentinas World Cup win sparks a par...,0.562505,0.431108,0.006387,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,1.0,False
3,MacArthur World Cup Fun Felicidades Argentina...,0.019768,0.978542,0.001691,1,1_argentina_world_cup_messi,argentina - world - cup - messi - the - to - i...,0.296636,False
4,Aftermath of the World Cup Final. Congratulati...,0.015951,0.98169,0.00236,-1,-1_the_world_cup_to,the - world - cup - to - of - and - for - mess...,0.096605,False


In [13]:
# Average each sentiment probability within every topic, then turn the
# Topic index back into an ordinary column.
df_topic_sentiment = (
    df_merge
    .groupby('Topic')[['neutral', 'positive', 'negative']]
    .mean()
    .reset_index()
)
df_topic_sentiment

Unnamed: 0,Topic,neutral,positive,negative
0,-1,0.292008,0.563412,0.14458
1,0,0.338219,0.493514,0.168267
2,1,0.136186,0.796156,0.067658
3,2,0.254474,0.646871,0.098654
4,3,0.382726,0.303542,0.313732
5,4,0.57748,0.415198,0.007322
6,5,0.365797,0.525797,0.108406


In [14]:
# Attach the per-topic sentiment means to the model's topic overview table,
# matching rows on the shared Topic column.
freq = topic_model.get_topic_info()
df_freq = pd.DataFrame(freq)
df_new = df_freq.merge(df_topic_sentiment, on='Topic', how='inner')
df_new

Unnamed: 0,Topic,Count,Name,neutral,positive,negative
0,-1,112,-1_the_world_cup_to,0.292008,0.563412,0.14458
1,0,53,0_cup_world_the_is,0.338219,0.493514,0.168267
2,1,43,1_argentina_world_cup_messi,0.136186,0.796156,0.067658
3,2,28,2_messi_the_world_cup,0.254474,0.646871,0.098654
4,3,19,3_qatar_the_cup_world,0.382726,0.303542,0.313732
5,4,13,4_france_argentina_2022_penalties,0.57748,0.415198,0.007322
6,5,11,5_hat_trick_mbappe_in,0.365797,0.525797,0.108406


In [15]:
# Row-wise maximum of the three mean sentiment scores, stored as a new column.
score_cols = ['neutral', 'positive', 'negative']
df_new['highest_score'] = df_new.loc[:, score_cols].max(axis='columns')

def get_sentiment(row):
    """Return the sentiment label whose score equals ``row['highest_score']``.

    ``'positive'`` is checked first and ``'negative'`` second; ``'neutral'``
    is returned when neither matches, so ties resolve in that order.
    """
    for candidate in ('positive', 'negative'):
        if row[candidate] == row['highest_score']:
            return candidate
    return 'neutral'

# Label every topic row by running get_sentiment across the rows.
df_new['sentiment'] = df_new.apply(get_sentiment, axis='columns')

df_new

Unnamed: 0,Topic,Count,Name,neutral,positive,negative,highest_score,sentiment
0,-1,112,-1_the_world_cup_to,0.292008,0.563412,0.14458,0.563412,positive
1,0,53,0_cup_world_the_is,0.338219,0.493514,0.168267,0.493514,positive
2,1,43,1_argentina_world_cup_messi,0.136186,0.796156,0.067658,0.796156,positive
3,2,28,2_messi_the_world_cup,0.254474,0.646871,0.098654,0.646871,positive
4,3,19,3_qatar_the_cup_world,0.382726,0.303542,0.313732,0.382726,neutral
5,4,13,4_france_argentina_2022_penalties,0.57748,0.415198,0.007322,0.57748,neutral
6,5,11,5_hat_trick_mbappe_in,0.365797,0.525797,0.108406,0.525797,positive


In [17]:
# Keep every row except the one for topic -1 (BERTopic's outlier bucket).
df_filtered = df_new.loc[df_new["Topic"].ne(-1)]


In [31]:
df_filtered

Unnamed: 0,Topic,Count,Name,neutral,positive,negative,highest_score,sentiment
1,0,53,0_cup_world_the_is,0.338219,0.493514,0.168267,0.493514,positive
2,1,43,1_argentina_world_cup_messi,0.136186,0.796156,0.067658,0.796156,positive
3,2,28,2_messi_the_world_cup,0.254474,0.646871,0.098654,0.646871,positive
4,3,19,3_qatar_the_cup_world,0.382726,0.303542,0.313732,0.382726,neutral
5,4,13,4_france_argentina_2022_penalties,0.57748,0.415198,0.007322,0.57748,neutral
6,5,11,5_hat_trick_mbappe_in,0.365797,0.525797,0.108406,0.525797,positive


In [None]:
import matplotlib.pyplot as plt

# Draw one stand-alone pie chart per topic row.
labels = ["Neutral", "Positive", "Negative"]
for _, topic_row in df_filtered.iterrows():
    topic, name = topic_row["Topic"], topic_row["Name"]
    # Scale the mean scores to percentages, in the same order as the labels.
    sizes = [topic_row[column] * 100 for column in ("neutral", "positive", "negative")]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.title(f"Topic {topic}: {name}")
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Size the grid from the number of topics instead of assuming exactly six
# topics numbered 0-5: the topic count comes from the fitted model, and any
# topic beyond the hard-coded ids would silently be left out of the figure.
n_cols = 3
topic_rows = [row for _, row in df_filtered.iterrows()]
n_rows = max(1, -(-len(topic_rows) // n_cols))  # ceiling division, at least one row

# squeeze=False keeps `axs` two-dimensional even for a single row.
# Six topics give the same 2 x 3 layout and 12 x 12 inch figure as before.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 6 * n_rows), squeeze=False)
axes = axs.ravel()
labels = ["Neutral", "Positive", "Negative"]

# Fill the grid left to right, top to bottom, in the row order of df_filtered.
for ax, row in zip(axes, topic_rows):
    topic = row["Topic"]
    name = row["Name"]
    sizes = [row["neutral"] * 100, row["positive"] * 100, row["negative"] * 100]
    ax.pie(sizes, labels=labels, autopct='%1.1f%%')
    ax.set_title(f"Topic {topic}: {name}")

# Hide grid cells that did not receive a topic so no empty axes are drawn.
for ax in axes[len(topic_rows):]:
    ax.set_visible(False)

# Save before showing: the figure is written while it is still open.
fig.savefig("topic_based_sentiment_distribution.png")
plt.show()

