In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# Load the sentiment-labelled rows, then carve out one frame per label value
# (-1 -> neg_df, 0 -> neu_df, 1 -> pos_df).
df = pd.read_csv('res_sentiment.csv')
neg_df, neu_df, pos_df = (df[df['sentiment'] == label] for label in (-1, 0, 1))

def topic_analysis(df, n_components, top_n):
    """Fit an LDA topic model on the texts in ``df`` and print each topic's top terms.

    Each row's token list is joined into a single string, vectorised with
    TF-IDF over 1- to 3-grams, and fed to ``LatentDirichletAllocation``
    (seeded with ``random_state=42``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tokens`` column. Each cell is parsed with
        ``ast.literal_eval`` and is assumed to be the string form of a list
        of token strings, e.g. ``"['happy', 'new', 'year']"`` -- TODO confirm
        against the code that writes ``res_sentiment.csv``.
    n_components : int
        Number of topics to fit.
    top_n : int
        Number of highest-weighted terms printed for every topic.

    Returns
    -------
    None
        Results are printed as ``topic :<k>`` followed by the list of terms.

    Raises
    ------
    ValueError, SyntaxError
        If a ``tokens`` cell is not a valid Python literal.
    """
    import ast  # local import so the function does not rely on another cell

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute arbitrary code the way a bare eval() would.
    contents = [' '.join(ast.literal_eval(tokens)) for tokens in df['tokens'].to_list()]

    tfidf = TfidfVectorizer(ngram_range=(1, 3))
    x = tfidf.fit_transform(contents)

    model = LatentDirichletAllocation(n_components=n_components, random_state=42)
    model.fit(x)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and only fall back on very old installations.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    for idx, topic in enumerate(model.components_, start=1):
        # argsort is ascending, so walk it backwards to take the top_n
        # largest weights in descending order.
        top_indices = topic.argsort()[: -top_n - 1:-1]
        # str() keeps the printed list as plain quoted strings regardless of
        # whether feature_names is a list or a numpy array.
        top_words = [str(feature_names[i]) for i in top_indices]
        print(f'topic :{idx}')
        print(top_words)

# Run the same 10-topic / 10-term analysis on every sentiment subset,
# printing a heading before each run so the outputs can be told apart.
for heading, subset in (
    ('positive', pos_df),
    ('neutrality', neu_df),
    ('negative', neg_df),
):
    print(heading)
    topic_analysis(subset, 10, 10)

positive
topic :1
['china', 'welcome china', 'welcome', 'chinese', 'world', 'happy', 'us', 'good', 'great', 'good china']
topic :2
['china', 'chinese', 'best', 'autumn', 'love', 'new', 'album', 'year', 'thank', 'beauty autumn']
topic :3
['new year', 'chinese new year', 'chinese new', 'happy chinese new', 'happy chinese', 'year', 'new', 'happy', 'chinese', 'china']
topic :4
['china', 'best', 'kpop', 'best selling', 'selling', 'chinese', 'selling kpop', 'best selling kpop', 'china year', 'champ']
topic :5
['things happening china', 'happening china', 'things happening', 'happening', 'china', 'things', 'incredible things', 'incredible things happening', 'incredible', 'chinese']
topic :6
['chinese', 'china', 'love', 'like', 'one', 'album', 'made', 'first', 'thank', 'year']
topic :7
['china', 'chinese', 'best', 'best selling', 'selling', 'year', 'food', 'digital', 'like', 'world']
topic :8
['china', 'chinese', 'new', 'first', 'rank', 'happy', 'us', 'jimin bts', 'congratulations jimin bts', 

In [37]:
from read_data import read_csv

In [38]:
# Load the tweet CSV through the project's read_data.read_csv helper.
# NOTE: this rebinds `df`, so the frame previously loaded from
# 'res_sentiment.csv' is no longer reachable under that name.
df = read_csv('outputs/chinese_culture_2021-01-01_2023-05-13.csv')

In [39]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,boysenberry45,@boysenberry451,2021-01-01T15:23:35.000Z,"boysenberry45\n@boysenberry451\n·\nJan 1, 2021","In Chinese culture, the Great Egret represents...",,7.0,15.0,218.0,['https://pbs.twimg.com/media/Eqp-WPRXUAIGTEX?...,https://twitter.com/boysenberry451/status/1345...
1,freya,@hotbruvshit,2021-01-01T00:11:30.000Z,"freya\n@hotbruvshit\n·\nJan 1, 2021",when ur white parents raised u with chinese cu...,,,,,['https://pbs.twimg.com/media/Eqmu_U7VkAAIboS?...,https://twitter.com/hotbruvshit/status/1344798...
2,Shami,@Shami_Das,2021-01-01T10:17:40.000Z,"Shami\n@Shami_Das\n·\nJan 1, 2021",Replying to \n@globaltimesnews,,,,,[],https://twitter.com/Shami_Das/status/134495091...
3,rachel tensions,@joeytalks2much,2021-01-01T00:58:12.000Z,"rachel tensions\n@joeytalks2much\n·\nJan 1, 2021",10.) Over The Moon\n\nAmbitious storytelling w...,,1.0,,,['https://pbs.twimg.com/media/Eqm5rQMXAAA-3gN?...,https://twitter.com/joeytalks2much/status/1344...
4,"stan lulu, luobo, laosan, laosi",@refreshxing,2021-01-01T05:36:51.000Z,"stan lulu, luobo, laosan, laosi\n@refreshxing\...","Me as non chinese finds this is so amusing, an...",,,,,[],https://twitter.com/refreshxing/status/1344880...


In [40]:
# Keep only the rows that have a retweet count and renumber the index from 0.
tmp = (
    df[df['Retweets'].notna()]
    .reset_index(drop=True)
)

In [41]:
def convert_to_int(x):
    """Convert a displayed engagement count to an integer.

    Accepts plain counts (``'218'``), counts with thousands separators
    (``'1,234'``) and abbreviated counts with a ``K``/``M``/``B`` suffix
    (``'1.2K'`` -> ``1200``). Values that are already numeric are rounded to
    the nearest integer.

    Parameters
    ----------
    x : str or number
        The raw count.

    Returns
    -------
    int
        The count as a whole number.

    Raises
    ------
    ValueError
        If the text (after removing separators and suffix) is not a number.
    """
    if not isinstance(x, str):
        # Already numeric, e.g. the column was parsed as float on load.
        return int(round(x))

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    text = x.replace(',', '').strip()
    scale = 1
    if text and text[-1].upper() in multipliers:
        scale = multipliers[text[-1].upper()]
        text = text[:-1]

    # Parse with float() rather than eval(): the input is scraped text and
    # must never be executed. round() absorbs binary floating-point error
    # (e.g. 1.001 * 1000 == 1000.9999999999999).
    return int(round(float(text) * scale))

In [42]:
# Turn every raw retweet value into a number, one element at a time.
tmp['Retweets'] = tmp['Retweets'].map(convert_to_int)

In [43]:
# Retain only the tweets that were retweeted more than 50 times.
tmp = tmp.loc[tmp['Retweets'].gt(50)]

In [44]:
# Write the filtered tweets to disk without the index column.
tmp.to_csv(
    path_or_buf='outputs/chinese_culture_2021-01-01_2023-05-13_retweet_50.csv',
    index=False,
)