In [2]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic).
pwd

'/root/emotion_classification'

In [3]:
import re
# from urlextract import URLExtract
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch
from scipy.special import expit
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import udf
# from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from tqdm import tqdm
from collections import Counter

In [4]:
# Parse tweet / conversation identifiers as strings so the long numeric ids are
# kept verbatim, and parse a column named "theta" as float64.
dtypes = {
    'id': str,
    'conversation_id': str,
    'referenced_tweets.replied_to.id': str,
    "theta": np.float64,
}

# Load the reply table and the original-post ("posters") table with the same
# dtype overrides.
replies = pd.read_csv(
    '/root/emotion_classification/replies_with_sentiment.csv',
    dtype=dtypes,
)
posters = pd.read_csv(
    "/root/emotion_classification/posters_with_sentiment.csv",
    dtype=dtypes,
)

In [5]:
# Row counts of the two raw tables, as (posters, replies).
posters.shape[0], replies.shape[0]


(2261178, 2277727)

In [6]:
# Keep only the first row seen for each post id (keep='first' is the pandas
# default), then display how many posts remain.
posters = posters.drop_duplicates(subset=['id'])
posters.shape[0]

2123847

In [7]:
# Attach each reply to the post whose id equals the reply's conversation_id.
# Column names present in both tables receive pandas' default suffixes:
# _x for the reply side (left) and _y for the post side (right).
# validate='many_to_one' makes pandas raise MergeError if a post id occurs more
# than once on the right side, instead of silently duplicating reply rows.
merged_df = pd.merge(
    replies,
    posters,
    left_on='conversation_id',
    right_on='id',
    how='inner',
    validate='many_to_one',
)

In [8]:
# Number of reply/post pairs produced by the inner join.
merged_df.shape[0]

1928962

In [9]:
# Remove the replied-to id and the post-side id column; after the join on
# conversation_id == id, id_y carries the same value as conversation_id.
merged_df = merged_df.drop(
    ['referenced_tweets.replied_to.id', 'id_y'],
    axis='columns',
)

In [10]:
# Preview the first five joined rows.
merged_df.head(n=5)


Unnamed: 0,id_x,conversation_id,author_id_x,formatted_text_x,theta_x,accounts_followed_x,sentiment_x,author_id_y,formatted_text_y,topic,context_annotations,event,theta_y,accounts_followed_y,sentiment_y
0,1216975183296073729,1216782463810396161,21779437,{@CharlesMBlow@} {@dilanesper@} I am heartbrok...,1.0,48,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0
1,1216824578422398976,1216782463810396161,1954842048,{@CharlesMBlow@} to many candidates &amp; Bern...,1.0,28,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0
2,1216842811049463815,1216782463810396161,17921569,{@CharlesMBlow@} Most are for Biden. That's wh...,1.0,87,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0
3,1216789249825148929,1216782463810396161,20267054,{@CharlesMBlow@} Duvall Patrick is in the race.,1.0,5,1,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0
4,1216863946184982528,1216782463810396161,3045832460,"{@CharlesMBlow@} No, it's not. They didn't run...",1.0,33,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0


In [11]:
# Discard pairs whose post-side theta is missing, then display the remaining
# row count.
merged_df = merged_df.dropna(axis=0, how='any', subset=['theta_y'])
merged_df.shape[0]

1790532

In [12]:
# IsCPI is 1 when the reply author's theta label differs from the post author's
# theta label, else 0. (CPI presumably stands for "cross-partisan interaction"
# -- TODO confirm.)
# NaN compares unequal to every value, so a bare `!=` would flag any pair with a
# missing theta as CPI; only pairs where both labels are present may be flagged.
merged_df['IsCPI'] = (
    merged_df['theta_x'].notna()
    & merged_df['theta_y'].notna()
    & (merged_df['theta_x'] != merged_df['theta_y'])
).astype(int)


In [13]:
# IsCPI holds 0/1 values, so summing the column counts the flagged rows.
count_ones = merged_df.loc[:, 'IsCPI'].sum()
print(f"Number of rows where are CPI: {count_ones}")

Number of rows where are CPI: 873209


In [14]:
# Subset of reply/post pairs flagged with IsCPI == 1.
cpiPair = merged_df.loc[merged_df["IsCPI"].eq(1)]

In [15]:
# Preview the first five flagged pairs.
cpiPair.head(n=5)

Unnamed: 0,id_x,conversation_id,author_id_x,formatted_text_x,theta_x,accounts_followed_x,sentiment_x,author_id_y,formatted_text_y,topic,context_annotations,event,theta_y,accounts_followed_y,sentiment_y,IsCPI
0,1216975183296073729,1216782463810396161,21779437,{@CharlesMBlow@} {@dilanesper@} I am heartbrok...,1.0,48,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
1,1216824578422398976,1216782463810396161,1954842048,{@CharlesMBlow@} to many candidates &amp; Bern...,1.0,28,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
2,1216842811049463815,1216782463810396161,17921569,{@CharlesMBlow@} Most are for Biden. That's wh...,1.0,87,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
3,1216789249825148929,1216782463810396161,20267054,{@CharlesMBlow@} Duvall Patrick is in the race.,1.0,5,1,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
4,1216863946184982528,1216782463810396161,3045832460,"{@CharlesMBlow@} No, it's not. They didn't run...",1.0,33,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1


In [16]:
# Frequencies of the 20 most common topics: first over every reply/post pair,
# then over the IsCPI == 1 subset. The two tables are printed with one blank
# line between them.
event_counts = merged_df['topic'].value_counts().iloc[:20]
cpi_counts = cpiPair['topic'].value_counts().iloc[:20]
print(event_counts, cpi_counts, sep="\n\n")


topic
news_&_social_concern       1137787
diaries_&_daily_life         221723
sports                        77364
other_hobbies                 59323
fitness_&_health              53108
film_tv_&_video               47245
celebrity_&_pop_culture       38607
business_&_entrepreneurs      31304
food_&_dining                 30721
music                         21128
science_&_technology          15121
learning_&_educational        12916
travel_&_adventure            11446
fashion_&_style                7819
arts_&_culture                 7214
family                         5819
gaming                         5001
relationships                  4026
youth_&_student_life           2860
Name: count, dtype: int64

topic
news_&_social_concern       607322
diaries_&_daily_life         80837
sports                       38955
other_hobbies                27860
fitness_&_health             24327
film_tv_&_video              16622
business_&_entrepreneurs     16381
celebrity_&_pop_culture      146

In [24]:
# Per-event row counts over all reply/post pairs and over the CPI subset.
event_counts_merged = merged_df['event'].value_counts()
event_counts_cpiPair = cpiPair['event'].value_counts()

# Turn each tally into a two-column frame: 'event' plus its count column.
# Naming the index explicitly keeps the key column called 'event' on every
# pandas version; before pandas 2.0, reset_index() on a value_counts() result
# produced a column named 'index', which the merge below could not find.
df_event_counts_merged = (
    event_counts_merged.rename_axis('event').reset_index(name='merged_count')
)
df_event_counts_cpiPair = (
    event_counts_cpiPair.rename_axis('event').reset_index(name='cpiPair_count')
)

# Join the two tallies on the event name; the inner join keeps only events that
# occur in both tables.
df_combined_event_counts = pd.merge(
    df_event_counts_merged,
    df_event_counts_cpiPair,
    on='event',
    how='inner',
)
df_combined_event_counts['cpiPair_count'] = df_combined_event_counts['cpiPair_count'].astype(int)

# Display the first 30 rows of the combined table.
df_combined_event_counts.head(30)

Unnamed: 0,event,merged_count,cpiPair_count
0,Donald Trump,321239,202787
1,COVID-19,262783,137935
2,2020 US Presidential Election,33071,18267
3,Entertainment,26093,12350
4,Joe Biden,22908,11222
5,Services,18467,6372
6,Food,12573,4363
7,Entertainment industry,11647,4124
8,Elizabeth Warren,10203,4388
9,Nancy Pelosi,9378,5536


In [26]:
# Distribution of the post-side theta label: all pairs first, then the CPI
# subset.
# NOTE: this rebinds event_counts_merged / event_counts_cpiPair, which the
# per-event tally cell also assigns; after this cell they hold theta counts.
event_counts_merged = merged_df['theta_y'].value_counts()
event_counts_cpiPair = cpiPair['theta_y'].value_counts()
print(event_counts_merged, event_counts_cpiPair, sep="\n")

theta_y
1.0    835664
2.0    518754
0.0    436114
Name: count, dtype: int64
theta_y
0.0    344234
2.0    310225
1.0    218750
Name: count, dtype: int64


In [30]:
# Count CPI pairs for every (reply sentiment, post sentiment) combination.
# Sentiment codes -- 0: negative, 1: neutral, 2: positive
sentiment_pairs_count = (
    cpiPair
    .groupby(['sentiment_x', 'sentiment_y'])
    .size()
    .rename('counts')
    .reset_index()
)
# Print the resulting table.
print(sentiment_pairs_count)

   sentiment_x  sentiment_y  counts
0            0            0  212970
1            0            1  192497
2            0            2   65799
3            1            0   92886
4            1            1  144693
5            1            2   41110
6            2            0   25636
7            2            1   56112
8            2            2   41506


In [28]:
# Preview the first five flagged pairs again.
cpiPair.head(n=5)

Unnamed: 0,id_x,conversation_id,author_id_x,formatted_text_x,theta_x,accounts_followed_x,sentiment_x,author_id_y,formatted_text_y,topic,context_annotations,event,theta_y,accounts_followed_y,sentiment_y,IsCPI
0,1216975183296073729,1216782463810396161,21779437,{@CharlesMBlow@} {@dilanesper@} I am heartbrok...,1.0,48,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
1,1216824578422398976,1216782463810396161,1954842048,{@CharlesMBlow@} to many candidates &amp; Bern...,1.0,28,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
2,1216842811049463815,1216782463810396161,17921569,{@CharlesMBlow@} Most are for Biden. That's wh...,1.0,87,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
3,1216789249825148929,1216782463810396161,20267054,{@CharlesMBlow@} Duvall Patrick is in the race.,1.0,5,1,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
4,1216863946184982528,1216782463810396161,3045832460,"{@CharlesMBlow@} No, it's not. They didn't run...",1.0,33,0,20772763,The fact that the black and latino candidates ...,news_&_social_concern,"[{""domain"": {""id"": ""10"", ""name"": ""Person"", ""de...",Charles Blow,0.0,72.0,0,1
