In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import matplotlib.pyplot as plt

In [36]:
# Choose the topic by editing this single stem; the graph pickle and the
# per-user score table share it:
#   'topic_tweet1_covid' | 'topic_tweet2_war' | 'topic_tweet3_climate'
topic_stem = 'topic_tweet1_covid'

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load pickles produced by this project's own preprocessing.
with open('twi22/processed/' + topic_stem + '.pkl', 'rb') as f:
    G = pickle.load(f)
tweet = pd.read_csv('twi22/processed/' + topic_stem + '_score.csv')

# Index the score table by user id so graph node ids can be used as row labels.
tweet.set_index('uid', inplace=True)
print("Nodes: ", len(G.nodes()), "Len tweet: ", len(tweet))
print("Edges: ", len(G.edges()))

Nodes:  195491 Len tweet:  195491
Edges:  1341914


In [18]:
# with human inf vs. without human inf
# Non-bot users are split by whether they follow at least one account that is
# labelled 'human' AND has a sentiment score above the threshold; the two
# groups' own scores are then compared with an independent two-sample t-test.
from scipy import stats

# One dict lookup per edge instead of building a row Series with tweet.loc.
score_of = tweet['StScore'].to_dict()

opi_b, opi_o = [], []
for node in G.nodes:
    if G.nodes[node]['label']=='bot':
        continue
    follows_strong_human = any(
        score_of[followed] > 0.8    # sentiment threshold
        and G.nodes[followed]['label'] == 'human'
        for followed in G.successors(node)
    )
    if follows_strong_human:
        opi_b.append(score_of[node])
    else:
        opi_o.append(score_of[node])


mu_b, mu_o = np.mean(opi_b), np.mean(opi_o)
t_stats, p_value = stats.ttest_ind(opi_b, opi_o)
print("ave b: {:.4f}, ave o: {:.4f}, p val: {:.4f}".format(mu_b, mu_o, p_value))

ave b: 0.1609, ave o: 0.1052, p val: 0.0000


In [9]:
# with bot inf vs. without bot inf
# Non-bot users are split by whether they follow at least one account that is
# labelled 'bot' AND has a sentiment score below the (negative) threshold; the
# two groups' own scores are then compared with an independent two-sample t-test.
from scipy import stats

# One dict lookup per edge instead of building a row Series with tweet.loc.
score_of = tweet['StScore'].to_dict()

opi_b, opi_o = [], []
for node in G.nodes:
    if G.nodes[node]['label']=='bot':
        continue
    follows_strong_bot = any(
        score_of[followed] < -0.8    # sentiment threshold
        and G.nodes[followed]['label'] == 'bot'
        for followed in G.successors(node)
    )
    if follows_strong_bot:
        opi_b.append(score_of[node])
    else:
        opi_o.append(score_of[node])


mu_b, mu_o = np.mean(opi_b), np.mean(opi_o)
t_stats, p_value = stats.ttest_ind(opi_b, opi_o)
print("ave b: {:.4f}, ave o: {:.4f}, p val: {:.4f}".format(mu_b, mu_o, p_value))

ave b: 0.1530, ave o: 0.1892, p val: 0.2763


## Analyzing the Influence Receivers

In [38]:

# Settings for this run.
# TOPIC_TAG only affects the output file names and must match the topic that
# is currently loaded in G / tweet (t1 = covid, t2 = war, t3 = climate).
TOPIC_TAG = 't1'
# Negative threshold: influence sources are followings with score < threshold
# ("neg" files). Positive threshold: followings with score > threshold ("pos").
SENT_THRESHOLD = -0.8    # sentiment threshold

RECEIVER_COLUMNS = ['id', 'my_sent', 'num_inf_source', 'num_following', 'num_ego_edge']
direction = 1 if SENT_THRESHOLD > 0 else -1
polarity = 'pos' if SENT_THRESHOLD > 0 else 'neg'
# One dict lookup per edge instead of building a row Series with tweet.loc.
score_of = tweet['StScore'].to_dict()

treat_rows, control_rows = [], []
for node in G.nodes:
    if G.nodes[node]['label']=='bot':
        continue
    following_nodes = list(G.successors(node))
    # Labels of the followed accounts whose sentiment passes the threshold.
    source_labels = [
        G.nodes[followed]['label']
        for followed in following_nodes
        if score_of[followed]*direction > SENT_THRESHOLD*direction
    ]
    n_human = source_labels.count('human')
    n_bot = source_labels.count('bot')

    # different from treat/control split in impact estimation:
    # treat = only bot sources, control = only human sources; users with both
    # kinds of source (or with none) are left out.
    if n_bot and not n_human:
        rows = treat_rows
    elif n_human and not n_bot:
        rows = control_rows
    else:
        continue

    rows.append({
        'id': node,
        'my_sent': score_of[node],
        'num_inf_source': n_human + n_bot,
        'num_following': len(following_nodes),
        # Edge count of the radius-1 ego graph; equals the sum of out-degrees.
        'num_ego_edge': nx.ego_graph(G, node).number_of_edges(),
    })

# Explicit columns keep the frames well-formed even when a group is empty.
treat_sent = pd.DataFrame(treat_rows, columns=RECEIVER_COLUMNS)
control_sent = pd.DataFrame(control_rows, columns=RECEIVER_COLUMNS)

for receivers in (treat_sent, control_sent):
    receivers['inf_s_rate'] = receivers['num_inf_source']/receivers['num_following']
    receivers['ego_rate'] = receivers['num_following']/receivers['num_ego_edge']

# y1 = receivers with bot sources only, y0 = receivers with human sources only.
# ("Reciver" spelling kept so existing downstream file names still match.)
treat_sent.to_csv(f'{TOPIC_TAG}_{polarity}_y1InfReciver.csv', index=False)
control_sent.to_csv(f'{TOPIC_TAG}_{polarity}_y0InfReciver.csv', index=False)
    


In [8]:
# 1'30  (author's note; presumably the cell's run time - confirm)
# Raw edge lists: rel_dt1 is the "retweeted" relation, rel_dt2 the "post" relation.
rel_dt1, rel_dt2 = map(pd.read_csv, ('edges/retweeted.csv', 'edges/post.csv'))

In [10]:
# 3'12  (author's note; presumably the cell's run time - confirm)
# Both joins keep the "post" table on the left, so its source_id becomes
# 'source_id_x' in the results.
retweet = rel_dt2.merge(rel_dt1, on='target_id', how='inner')
retweeted = rel_dt2.merge(rel_dt1, left_on='target_id', right_on='source_id', how='inner')

In [23]:
# Index both join results by 'source_id_x' (the source_id of the left, "post",
# table) so that .index.value_counts() later gives per-id row counts.
# Guarded so that re-running this cell is a no-op instead of a KeyError.
for _joined in (retweet, retweeted):
    if _joined.index.name != 'source_id_x':
        _joined.set_index('source_id_x', inplace=True)

## Analyzing the Influencers

In [24]:
# run six dataset at once
# A run is identified by dt = 10*topic + polarity:
#   topic    1 / 2 / 3 = covid / war / climate
#   polarity 0 = positive influencers (score > 0.8), 1 = negative (score < -0.8)
TOPIC_STEMS = {1: 'topic_tweet1_covid', 2: 'topic_tweet2_war', 3: 'topic_tweet3_climate'}
# Column of each edge file whose values are matched against the influencer ids.
EDGE_ID_COLUMN = {'membership': 'target_id', 'mentioned': 'target_id',
                  'post': 'source_id', 'like': 'source_id', 'pinned': 'source_id'}

# Relation counts do not depend on the topic, so each edge file is read and
# counted once. Insertion order here is the column order of the CSVs.
relation_counts = {
    'retweet': retweet.index.value_counts(),
    'retweeted': retweeted.index.value_counts(),
}
for rel, id_col in EDGE_ID_COLUMN.items():
    relation_counts[rel] = pd.read_csv('edges/'+rel+'.csv', usecols=[id_col])[id_col].value_counts()


def _with_relation_counts(influencers):
    """Add one integer column per relation: occurrences of each id, 0 if absent."""
    for rel, counts in relation_counts.items():
        influencers[rel] = influencers['id'].map(counts).fillna(0).astype(int)
    return influencers


for topic, stem in TOPIC_STEMS.items():
    # Load each topic once and reuse it for both polarities.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # pickles produced by this project's own preprocessing.
    with open('twi22/processed/' + stem + '.pkl', 'rb') as f:
        G = pickle.load(f)
    tweet = pd.read_csv('twi22/processed/' + stem + '_score.csv')

    # After reversing, G.successors(node) yields the nodes that pointed at
    # `node` in the stored graph, i.e. its followers.
    G = G.reverse()

    tweet.set_index('uid', inplace=True)
    scores = tweet['StScore']
    score_of = scores.to_dict()   # fast scalar lookups inside the node loop

    for polarity, opinion, times in [('pos', 0.8, 1), ('neg', -0.8, -1)]:
        dt = 10*topic + (polarity == 'neg')
        print("Processing ", dt)

        collected = {
            label: {'id': [], 'followers': [], 'follower_sent': []}
            for label in ('bot', 'human')
        }
        for node in G.nodes:
            if score_of[node]*times > opinion*times: # positive or negative
                label = G.nodes[node]['label']
                if label not in collected:
                    continue
                followers_nodes = list(G.successors(node))
                # NOTE(review): both tables count followers labelled 'human'.
                # The original human branch stored this count in a variable
                # named `bots_followers`, so counting bot followers may have
                # been intended there - confirm before interpreting the
                # 'followers' column of the human table.
                human_followers = sum(
                    G.nodes[follower]['label']=='human' for follower in followers_nodes
                )
                collected[label]['id'].append(node)
                collected[label]['followers'].append(human_followers)
                # Mean score over ALL followers (any label); NaN when there are none.
                collected[label]['follower_sent'].append(scores.loc[followers_nodes].mean())

        print("finish counting followers")
        bots_follower_sent = _with_relation_counts(pd.DataFrame(collected['bot']))
        humans_follower_sent = _with_relation_counts(pd.DataFrame(collected['human']))

        bots_follower_sent.to_csv(f't{topic}_{polarity}_botInfluencer.csv', index=False)
        humans_follower_sent.to_csv(f't{topic}_{polarity}_humanInfluencer.csv', index=False)


Processing  10
finish counting followers
Processing  11
finish counting followers
Processing  20
finish counting followers
Processing  21
finish counting followers
Processing  30
finish counting followers
Processing  31
finish counting followers


## Influencers' tweet

In [12]:
# save their tweet example
# inf_id = pd.read_csv('twi22/influencers/t1_pos_botinfluencer.csv')
# tweet = pd.read_csv('twi22/processed/topic_tweet1_covid.csv')
# inf_id = pd.read_csv('twi22/influencers/t2_pos_botinfluencer.csv')
# tweet = pd.read_csv('twi22/processed/topic_tweet2_war.csv')
# inf_id = pd.read_csv('twi22/influencers/t3_pos_botinfluencer.csv')
# tweet = pd.read_csv('twi22/processed/topic_tweet3_climate.csv')

# tweet['uid'] = 'u'+tweet['uid'].astype(str)
# tweet_inf = pd.merge(inf_id, tweet, left_on='id', how='inner', right_on='uid')[['uid', 'tweet']]
# tweet_inf.to_csv('t1_pos_bot_tweet.csv', index=False)

## Influencers' Attitude Variances

In [48]:
# Influencer tables for one dataset (here: t3 / negative).
# NOTE(review): the export cell writes 't3_neg_botInfluencer.csv' into the
# working directory; the paths below differ in directory and letter case, so
# the files were presumably moved/renamed by hand - confirm.
bot_inf, human_inf = map(pd.read_csv, (
    'twi22/influencers/t3_neg_botinfluencer.csv',
    'twi22/influencers/t3_neg_humaninfluencer.csv',
))
bot_inf_id, human_inf_id = bot_inf['id'].to_list(), human_inf['id'].to_list()

# Raw tweets of the same topic; uid gets a 'u' prefix before it is used as the
# lookup key (presumably to match the 'id' format of the influencer tables).
tweet = pd.read_csv('twi22/processed/topic_tweet3_climate.csv').assign(
    uid=lambda frame: 'u'+frame['uid'].astype(str)
)
tweet_ylabel = tweet.set_index('uid')

# VADER analyzer used to score individual tweets in the next cell.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
StScore = SentimentIntensityAnalyzer()

In [49]:
def _tweet_sentiment_variances(user_ids):
    """Return one value per user id: the variance of the VADER compound
    scores of that user's tweets.

    np.var is the population variance (ddof=0), so a user with a single tweet
    contributes 0.
    """
    variances = []
    for uid in user_ids:
        user_tweets = tweet_ylabel.loc[uid]['tweet']
        # .loc gives a plain string for a user with one row and a Series otherwise.
        texts = [user_tweets] if isinstance(user_tweets, str) else user_tweets.to_list()
        compounds = [StScore.polarity_scores(text)['compound'] for text in texts]
        variances.append(np.var(compounds))
    return variances


var_all_bot = _tweet_sentiment_variances(bot_inf_id)
var_all_human = _tweet_sentiment_variances(human_inf_id)

In [50]:
from scipy import stats

# Independent two-sample t-test on the per-user variances (human vs. bot).
_, p_val = stats.ttest_ind(var_all_human, var_all_bot)

mean_var_human = np.mean(var_all_human)
mean_var_bot = np.mean(var_all_bot)
print(mean_var_human, mean_var_bot)
print(p_val)

0.13806034355955424 0.11767900256785836
0.01994190475933364


## Tweets Text Analysis

In [74]:
def _influencer_ids(csv_path):
    """Read one influencer table and return its 'id' column as a list."""
    return pd.read_csv(csv_path)['id'].to_list()


def _load_topic_tweets(csv_path):
    """Read one topic's tweet table and index it by uid with a 'u' prefix added."""
    frame = pd.read_csv(csv_path)
    frame['uid'] = 'u'+frame['uid'].astype(str)
    frame.set_index('uid', inplace=True)
    return frame


# Name suffix = <topic><polarity>: topic 1/2/3, polarity 0 = pos, 1 = neg.
b_inf_id10 = _influencer_ids('twi22/influencers/t1_pos_botinfluencer.csv')
h_inf_id10 = _influencer_ids('twi22/influencers/t1_pos_humaninfluencer.csv')
b_inf_id11 = _influencer_ids('twi22/influencers/t1_neg_botinfluencer.csv')
h_inf_id11 = _influencer_ids('twi22/influencers/t1_neg_humaninfluencer.csv')
b_inf_id20 = _influencer_ids('twi22/influencers/t2_pos_botinfluencer.csv')
h_inf_id20 = _influencer_ids('twi22/influencers/t2_pos_humaninfluencer.csv')
b_inf_id21 = _influencer_ids('twi22/influencers/t2_neg_botinfluencer.csv')
h_inf_id21 = _influencer_ids('twi22/influencers/t2_neg_humaninfluencer.csv')
b_inf_id30 = _influencer_ids('twi22/influencers/t3_pos_botinfluencer.csv')
h_inf_id30 = _influencer_ids('twi22/influencers/t3_pos_humaninfluencer.csv')
b_inf_id31 = _influencer_ids('twi22/influencers/t3_neg_botinfluencer.csv')
h_inf_id31 = _influencer_ids('twi22/influencers/t3_neg_humaninfluencer.csv')

tweet1 = _load_topic_tweets('twi22/processed/topic_tweet1_covid.csv')
tweet2 = _load_topic_tweets('twi22/processed/topic_tweet2_war.csv')
tweet3 = _load_topic_tweets('twi22/processed/topic_tweet3_climate.csv')

def collect_all_tweet(tweet, human_inf_id, bot_inf_id):
    """Gather the tweet texts of the given human and bot influencers.

    Parameters
    ----------
    tweet : DataFrame indexed by user id with a 'tweet' column; an id may
        occur on several rows (one row per tweet).
    human_inf_id, bot_inf_id : iterables of user ids to look up.

    Returns
    -------
    (human_tweet_list, bot_tweet_list) : two flat lists of tweet strings, in
        the order of the given ids.

    Raises
    ------
    KeyError if an id is not present in ``tweet``'s index.

    Missing tweet texts (NaN) are skipped instead of raising or leaking
    non-string values into the result.
    """
    def _texts_of(user_ids):
        texts = []
        for uid in user_ids:
            user_tweets = tweet.loc[uid, 'tweet']   # str for one row, Series for several
            if isinstance(user_tweets, pd.Series):
                texts.extend(user_tweets.dropna().to_list())
            elif isinstance(user_tweets, str):
                texts.append(user_tweets)
            # anything else is a missing value for a single-row user: skip it
        return texts

    return _texts_of(human_inf_id), _texts_of(bot_inf_id)

# hlXY / blXY = tweets of human / bot influencers for topic X, polarity Y
# (0 = pos, 1 = neg). Each topic's two polarities share one tweet table.
(hl10, bl10), (hl11, bl11) = (
    collect_all_tweet(tweet1, humans, bots)
    for humans, bots in ((h_inf_id10, b_inf_id10), (h_inf_id11, b_inf_id11))
)
(hl20, bl20), (hl21, bl21) = (
    collect_all_tweet(tweet2, humans, bots)
    for humans, bots in ((h_inf_id20, b_inf_id20), (h_inf_id21, b_inf_id21))
)
(hl30, bl30), (hl31, bl31) = (
    collect_all_tweet(tweet3, humans, bots)
    for humans, bots in ((h_inf_id30, b_inf_id30), (h_inf_id31, b_inf_id31))
)

In [215]:
def ana_text(hl, bl, data_type):
    """Print marker statistics for human (hl) and bot (bl) tweet lists.

    For each marker the total number of occurrences and the occurrences per
    tweet are printed for both lists. The printed labels are kept as in
    earlier outputs: 'at' counts '#', 'tt' counts '@', 'rt' counts 'http'.

    Parameters
    ----------
    hl, bl : lists of tweet strings.
    data_type : label printed in the first line.

    An empty list yields a rate of nan instead of a ZeroDivisionError.
    """
    def _rate(total, n_tweets):
        return total/n_tweets if n_tweets else float('nan')

    print("data: ", data_type)
    h_len, b_len = len(hl), len(bl)
    print("Num tweet (H & B): ", h_len, b_len)
    for short, title, marker in (('at', 'At', '#'), ('tt', 'Tt', '@'), ('rt', 'Rt', 'http')):
        h_total = sum(text.count(marker) for text in hl)
        b_total = sum(text.count(marker) for text in bl)
        print("Num " + short + " (H & B): ", h_total, b_total,
              title + " rate (H & B)", _rate(h_total, h_len), _rate(b_total, b_len))

# Report the marker statistics for all six datasets, in dataset order.
for tag, human_tweets, bot_tweets in [
    ('10', hl10, bl10),
    ('11', hl11, bl11),
    ('20', hl20, bl20),
    ('21', hl21, bl21),
    ('30', hl30, bl30),
    ('31', hl31, bl31),
]:
    ana_text(human_tweets, bot_tweets, data_type=tag)


data:  10
Num tweet (H & B):  383694 19508
Num at (H & B):  439215 40268 At rate (H & B) 1.1447012463056498 2.064178798441665
Num tt (H & B):  407710 24948 Tt rate (H & B) 1.0625915443035336 1.2788599548903015
Num rt (H & B):  278150 14067 Rt rate (H & B) 0.7249266342449974 0.721088784088579
data:  11
Num tweet (H & B):  204582 8574
Num at (H & B):  172655 9355 At rate (H & B) 0.8439403271060015 1.0910893398647072
Num tt (H & B):  190556 10729 Tt rate (H & B) 0.9314406937071688 1.2513412642873805
Num rt (H & B):  114940 3547 Rt rate (H & B) 0.5618285088619722 0.413692558898997
data:  20
Num tweet (H & B):  249524 6909
Num at (H & B):  180194 8667 At rate (H & B) 0.7221509754572707 1.2544507164567955
Num tt (H & B):  273943 9878 Tt rate (H & B) 1.0978623298760841 1.4297293385439283
Num rt (H & B):  105194 2935 Rt rate (H & B) 0.42157868581779706 0.42480822116080474
data:  21
Num tweet (H & B):  887730 16941
Num at (H & B):  566905 18115 At rate (H & B) 0.6386007006634901 1.0692993329791

In [106]:
import nltk
from nltk.corpus import stopwords
from collections import Counter

nltk.download('stopwords')
# The corpus file id is lower-case; 'English' only resolves on
# case-insensitive file systems (e.g. Windows) and fails elsewhere.
stop_words = set(stopwords.words('english'))

# NOTE(review): tweet_list is not assigned anywhere in the visible notebook;
# it presumably was one of the collected tweet lists (the output below looks
# like bot climate tweets, e.g. bl30) - assign it explicitly before this cell.
texts = []
for text in tweet_list:
    lowered = (word.lower() for word in text.split())   # lower-case once per word
    texts.extend(word for word in lowered if word not in stop_words)
word_count = Counter(texts)
# Prints every distinct word, most frequent first.
for word, count in word_count.most_common():
    print(f"{word}: {count}")
#print(word_count)

[nltk_data] Error loading stopwords: <urlopen error [WinError 10061]
[nltk_data]     由于目标计算机积极拒绝，无法连接。>


climate: 3228
rt: 2308
&amp;: 1093
change: 852
#climatechange: 727
-: 493
#climate: 454
us: 451
#climateaction: 440
new: 439
join: 369
global: 316
action: 313
need: 307
#cop26: 273
help: 258
energy: 244
#climatecrisis: 241
great: 226
people: 218
one: 206
world: 202
like: 201
carbon: 197
learn: 196
read: 192
work: 185
change.: 183
make: 173
get: 167
must: 163
see: 163
take: 159
report: 155
solutions: 155
support: 154
time: 148
future: 148
first: 139
today: 138
emissions: 131
here:: 129
want: 128
next: 128
know: 126
many: 125
green: 125
justice: 125
impacts: 123
#sustainability: 122
thank: 120
change,: 120
better: 117
good: 117
register: 115
part: 112
latest: 112
impact: 111
last: 111
also: 111
#climateemergency: 109
#cleanenergy: 108
event: 108
check: 107
health: 107
leaders: 105
@labourinexile: 105
@corbyn_project: 104
looking: 103
working: 103
adaptation: 101
week: 100
way: 100
clean: 98
we're: 97
data: 97
@jubilee4climate: 97
important: 96
#environment: 96
it’s: 95
sustainable: 95
sa

In [209]:
# analysis of bot influencers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Hand-picked stop list (function words plus tweet artefacts such as 'rt',
# 'amp', 'https'). It replaces the NLTK-based stop_words set for every
# TfidfVectorizer created from here on.
stop_words = (
    'the to of and in a for rt is on with are you our we this that '
    'from at have amp be i by it your as has how can more about not '
    'my if an but us their who what been so was were https all'
).split()

# Tf-idf matrix of the bot influencers' tweets for dataset 30.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(np.array(bl30))

In [210]:
# Fit a 10-topic LDA on the current tf-idf matrix and print each topic's
# ten highest-weighted terms.
ldab = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', random_state=101)
ldab.fit(tfidf_matrix)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(ldab.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10-1:-1]]))
    print()
    

Topic #0:
student expertise climatewwf organization thx ghgsat enable resulting himalayan projected

Topic #1:
fridaysforfuture partner co joining travel mangrovecleanupmumbai climatestrike gretathunberg protecting methane

Topic #2:
climatepledge girls include explain self view adding awards features normal

Topic #3:
requires speed hottest 14th pitch failed publichealth 1fzkiay0at proposal ca

Topic #4:
worldbank renewableenergysystems mastercard dividend solarenergysolutions solarenergycompany allintradelimited letnaturepayyourelectricitybills reducedbills pst

Topic #5:
founder professor finalists seminar gswf2021 advancing humanitarian passion explains represent

Topic #6:
cleantech upelections2022 essay जलव नह वर तन रह पर ymnjrmntju

Topic #7:
co climate change climatechange will cop26 action new climateaction join

Topic #8:
link firms greenwashing cop21 markruffalo joebiden suggest missing usembassynepal lateral

Topic #9:
labourinexile corbyn_project jubilee4climate unitetheun

In [211]:
# Sum each topic's weight over all documents to see which topic dominates.
# tfidf_matrix must still be the matrix ldab was fitted on.
topic_dist = ldab.transform(tfidf_matrix)
topic_coverage = topic_dist.sum(axis=0)
for i in range(len(topic_coverage)):
    print(i, topic_coverage[i])

0 196.6229304743293
1 502.236823889242
2 234.83343749928713
3 217.50428163730288
4 199.6973371039569
5 215.94099109369216
6 212.45372810493072
7 4093.92088594305
8 189.52097549230697
9 247.26860876190128


In [212]:
# Topic 7 has the largest summed coverage in the cell above.
ptb = ldab.components_[7] # most important topic
top_term_idx = ptb.argsort()[::-1][:50]   # 50 highest-weighted terms, descending
print([feature_names[i] for i in top_term_idx])

['co', 'climate', 'change', 'climatechange', 'will', 'cop26', 'action', 'new', 'climateaction', 'join', 'global', 'world', 'energy', 'here', 'out', 'today', 'need', 'justice', 'up', 'now', 'do', 'help', 'climatecrisis', 're', 'people', 'time', 'crisis', 'future', 'carbon', 'great', 'they', 'learn', 'solutions', 'one', 'report', 'work', 'must', 'like', 'or', 'just', 'read', 'its', 'impacts', 'sustainability', 'support', 'emissions', 'environment', 'year', 'health', 'get']


In [154]:
# analysis of human influencers
# Tf-idf + 10-topic LDA on the human influencers' tweets for dataset 10.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(np.array(hl10))
lda = LatentDirichletAllocation(n_components=10, max_iter=3, learning_method='online', random_state=101)
lda.fit(tfidf_matrix)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10-1:-1]]))
    print()

Topic #0:
review eu tuesday listen leadership weekly los icymi del town

Topic #1:
illness sars cov continues record sarscov2 announce finds persons blood

Topic #2:
1st dashboard 23 drug huge grateful cost calling potus coverage

Topic #3:
women 24 indiafightscorona air provides initiative game often particularly focused

Topic #4:
department tools queen sa security district certain request overall actions

Topic #5:
order stop canada website clinical approved 21 sharing center omicronvariant

Topic #6:
co covid 19 covid19 vaccine new will cases health pandemic

Topic #7:
hands nyc side outbreaks along wash hold girls organizations san

Topic #8:
hand alert regarding nytimes location maps faculty lockdowns eligibility 98

Topic #9:
de la en el 28 que le un co learned



In [155]:
# Summed topic weights over all documents of the matrix `lda` was fitted on.
topic_dist = lda.transform(tfidf_matrix)
topic_coverage = topic_dist.sum(axis=0)
for i in range(len(topic_coverage)):
    print(i, topic_coverage[i])

0 19446.537250449386
1 18389.060781101485
2 19628.343133089984
3 17074.111997817672
4 16473.190368513002
5 21119.83597862936
6 220123.8159076791
7 16778.306034632686
8 16102.118414888415
9 18558.680133203346


In [156]:
# Tf-idf + 10-topic LDA on the human influencers' tweets for dataset 20.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(np.array(hl20))
lda2 = LatentDirichletAllocation(n_components=10, max_iter=3, learning_method='online', random_state=101)
lda2.fit(tfidf_matrix)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda2.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10-1:-1]]))
    print()

Topic #0:
ukraine co russia minister russian humanitarian indian president today security

Topic #1:
pray heart co russiaukraineconflict praying serious friday services voice small

Topic #2:
el co ministry agree ukrainerussianwar sharing resolve themselves shazam russiaukraine

Topic #3:
de la en co ucrania di le il que ucraina

Topic #4:
talks advisory et supports joined les co flight evacuate des

Topic #5:
ukrainerussiawar proud co cities narendramodi number scale affected group press

Topic #6:
co standwithukraine february place courage amazing supply happy coverage link

Topic #7:
children show news co human hours romania keep message rights

Topic #8:
ukraine russia co russian people will putin they support war

Topic #9:
crypto assets address words donations god prayers trussliz offer grateful



In [157]:
# Summed topic weights over all documents of the matrix `lda2` was fitted on.
topic_dist = lda2.transform(tfidf_matrix)
topic_coverage = topic_dist.sum(axis=0)
for i in range(len(topic_coverage)):
    print(i, topic_coverage[i])

0 36624.86154109508
1 12725.233016710266
2 12790.316592106425
3 15274.829425216485
4 12034.698083232583
5 15108.883019832978
6 15171.801330866541
7 21308.64768225387
8 93428.22106291239
9 15056.50824577242


In [158]:
# Tf-idf + 10-topic LDA on the human influencers' tweets for dataset 30.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(np.array(hl30))
lda3 = LatentDirichletAllocation(n_components=10, max_iter=3, learning_method='online', random_state=101)
lda3.fit(tfidf_matrix)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda3.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-10-1:-1]]))
    print()

Topic #0:
vulnerability pm noaa arctic epa nuclear michaelemann seen worst war

Topic #1:
potus citizensclimate ceo congratulations ukraine house decade goal happy russia

Topic #2:
dr learning paris early applications push details contribute gmt authors

Topic #3:
video urban temperature award review agree innovative submit words nasa

Topic #4:
love bipartisan putting rural groups undpclimate saving advocate funded fun

Topic #5:
present de rapidly agreed closing mit staff degrees window pact

Topic #6:
co climate change climatechange will new action cop26 energy today

Topic #7:
march tune others supply wildfires host wildlife discussed findings central

Topic #8:
council digital featuring friday inspired al signed egypt price truly

Topic #9:
strike front colleagues activist prevent company 50 trying kind links



In [159]:
# Summed topic weights over all documents of the matrix `lda3` was fitted on.
topic_dist = lda3.transform(tfidf_matrix)
topic_coverage = topic_dist.sum(axis=0)
for i in range(len(topic_coverage)):
    print(i, topic_coverage[i])

0 9030.418173406653
1 8615.295758966027
2 7955.024870978863
3 8280.414107284694
4 7055.885726143998
5 6974.1443629546575
6 87729.88057040265
7 8343.59998000023
8 6832.322956119709
9 7090.013493743009


In [190]:
# most important topic:
# Indices are read off the coverage outputs above (largest summed weight per
# model); they are hard-coded and must be re-checked if a model is refitted.
pt1, pt2, pt3 = lda.components_[6], lda2.components_[8], lda3.components_[6]

In [191]:
def _fit_vocabulary(texts):
    """Fit a tf-idf vectorizer on texts; return (vectorizer, matrix, term list)."""
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    matrix = vectorizer.fit_transform(np.array(texts))
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = list(vectorizer.get_feature_names_out())
    else:
        names = vectorizer.get_feature_names()
    return vectorizer, matrix, names


# Rebuild the vocabulary of each human corpus so that the component vectors
# pt1 / pt2 / pt3 can be mapped back to terms. This relies on the same
# stop_words and the same texts as when lda / lda2 / lda3 were fitted.
_, _, feature_names1 = _fit_vocabulary(hl10)
_, _, feature_names2 = _fit_vocabulary(hl20)
# As before, the vectorizer and matrix of the last corpus (hl30) stay bound.
tfidf_vectorizer, tfidf_matrix, feature_names3 = _fit_vocabulary(hl30)

In [198]:
# 35 highest-weighted terms of the dominant topic of lda3, in descending order.
[feature_names3[term_idx] for term_idx in pt3.argsort()[::-1][:35]]

['co',
 'climate',
 'change',
 'climatechange',
 'will',
 'new',
 'action',
 'cop26',
 'energy',
 'today',
 'climateaction',
 'join',
 'out',
 'now',
 'world',
 'global',
 'here',
 'need',
 'report',
 'help',
 're',
 'crisis',
 'people',
 'up',
 'impacts',
 'do',
 'future',
 'work',
 'they',
 'ipcc',
 'time',
 'one',
 'carbon',
 'learn',
 'read']