# import and load 

In [2]:
#%%
import os

import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import seaborn as sns



# Root folder of the COP22 dataset. Set the COP22_DATA_DIR environment variable
# to run the notebook on another machine; the default is the location that was
# hard-coded when the notebook was written, so existing runs are unaffected.
COP22_DATA_DIR = os.environ.get('COP22_DATA_DIR', '/Volumes/boot420/Users/data/climate_network/cop22')

full_network_path = f'{COP22_DATA_DIR}/networks/cop22_retweets.gml'
topics_file = f'{COP22_DATA_DIR}/cache/topics_cop22.csv'
projected_path = f'{COP22_DATA_DIR}/networks/cop22_reply_projected.gml'

# presumably a small test graph (name suggests a toy network); it is not loaded in this cell
test_network = '/Users/alessiogandelli/dev/internship/tweets-to-topic-network/data/networks/toy_test.gml'

# load network
g = ig.read(full_network_path, format='gml')

# topic table; later cells use its 'Topic' and 'Name' columns
topic_name = pd.read_csv(topics_file)

  return reader(f, *args, **kwds)


# data preparation

In [19]:
# %%
# Vertex subsets of the graph: users (bipartite == 0), tweets (bipartite == 1)
# and the tweets flagged as originals (is_retweet == 'original').
user = g.vs.select(bipartite=0)
tweets = g.vs.select(bipartite=1)
original_tweets = g.vs.select(is_retweet='original')

# One row per tweet vertex: dataframe column -> vertex attribute it is read from.
tweet_columns = {'tweets': 'label', 'author': 'author', 'topic': 'topics'}
df_tweets = pd.DataFrame(
    {column: tweets[attribute] for column, attribute in tweet_columns.items()}
)

 
# One row per original tweet, with the number of retweets it received.
original_tweets_indegree = g.degree(original_tweets, mode='in')
df_original = (
    pd.DataFrame({
        'tweets': original_tweets['label'],
        'indegree': original_tweets_indegree,
        'author': original_tweets['author'],
        'topic': original_tweets['topics'],
    })
    # the in-degree also counts the edge from the author, so drop one to keep only retweets
    .assign(indegree=lambda frame: frame['indegree'] - 1)
    .rename(columns={'indegree': 'retweets'})
)
# Per-author summary of the original tweets: how many originals each user wrote
# and how many retweets those originals collected in total.
df_rt_user = df_original.groupby('author').agg({'tweets': 'count', 'retweets': 'sum'})
# average number of retweets per original tweet, rounded to a whole number
df_rt_user['rt_per_tweet'] = (df_rt_user['retweets'] / df_rt_user['tweets']).round()

# Original users whose tweets were never retweeted.
# The filter used to be `> 0`, which selected the users WITH retweets, i.e. the
# opposite of what the variable name and its comment describe.
original_user_no_rt = df_rt_user[df_rt_user['retweets'] == 0]

# Summed activity of the N most retweeted producers, for several cut-offs N.
# The ranking does not depend on N, so sort once instead of once per cut-off.
ranked_by_retweets = df_rt_user.sort_values('retweets', ascending=False)
top_users = {n: ranked_by_retweets.head(n).sum() for n in [10000, 1000, 100, 10, 3]}

# one row per cut-off; keep only the summed tweet and retweet counts
df_top_users = pd.DataFrame.from_dict(top_users, orient='index')[['tweets', 'retweets']]


# stats 

## basic 

Out of 454k tweets, most are copies (retweets). We call *original tweets* the tweets that are not retweets, i.e. the ones actually written by their author.

Original tweets are 110k (24%) and retweets are 344k (76%). The original tweets were written by a small subset of the 134k users: only 19k users (15%) wrote original tweets.

Out of the 110k original tweets, only 50k have at least one retweet, so around 400k tweets are either one of these retweeted originals or a copy of one.

All the original tweets with retweets were written by fewer than 9k users.

We call *producers* the users who write original tweets and *retweeters* the users who spread them.

In [26]:
# original tweets that were retweeted at least once
retweeted_originals = df_original[df_original['retweets'] > 0]

# (label, value) pairs printed one per line
summary_counts = [
    ('Number of users: ', len(user)),
    ('Number of tweets: ', len(tweets)),
    ('number of original tweets: ', len(original_tweets)),
    ('original tweets with retweets: ', len(retweeted_originals)),
    ('user that tweeted original tweets: ', len(set(original_tweets['author']))),
    ('user that tweeted original tweets with retweets: ', len(set(retweeted_originals['author']))),
]
for label, value in summary_counts:
    print(label, value)


Number of users:  134212
Number of tweets:  454754
number of original tweets:  110043
original tweets with retweets:  50728
user that tweeted original tweets:  19583
user that tweeted original tweets with retweets:  8690


## top users 

There are few producers and many retweeters. It is interesting to notice that the top n most retweeted producers alone account for a large portion of the network.

- with the top 10k producers we can generate about 436k tweets (their originals plus all the retweets of them)
- with the top 100 producers we can generate about 219k tweets

In [5]:
# Display the summed tweets and retweets of the top-N producers (the row index is N).
df_top_users

Unnamed: 0,tweets,retweets
10000,91174.0,344711.0
1000,44785.0,299225.0
100,18819.0,200070.0
10,10470.0,108128.0
3,8116.0,62663.0


In [47]:
# Index the topic table by topic id so its 'Name' column can be aligned with per-topic frames.
# Guarded and reassigned (instead of inplace=True) so that re-running this cell does not
# raise KeyError once 'Topic' has already become the index.
if 'Topic' in topic_name.columns:
    topic_name = topic_name.set_index('Topic')

## topics


By analyzing the topics we can find which ones are retweeted the most and which ones are produced the most. Topics related to air pollution turn out to be the most retweeted.

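To measure this, I compute for each topic the ratio between its share of all tweets (or of the retweets only) and its share of the original tweets. A ratio above 1 means the topic gains share through retweeting; a ratio below 1 means it loses share.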

In [51]:
# How often each topic occurs among the original tweets and among all tweets.
original_topic = df_original.value_counts('topic')
tweets_topic = df_tweets.value_counts('topic')

df_topics = pd.DataFrame({'original': original_topic, 'tweets': tweets_topic})
# retweets per topic = all tweets minus the originals
df_topics['retweets'] = df_topics['tweets'] - df_topics['original']

# Share of every topic inside each of the three groups.
for count_column, share_column in [
    ('original', 'original_prob'),
    ('tweets', 'tweets_prob'),
    ('retweets', 'retweets_prob'),
]:
    df_topics[share_column] = df_topics[count_column] / df_topics[count_column].sum()

# Ratio of a topic's share among all tweets / among retweets to its share among
# the originals; values above 1 mean the topic is amplified by retweeting.
baseline_share = df_topics['original_prob']
df_topics['increment'] = df_topics['tweets_prob'].div(baseline_share)
df_topics['rt_increment'] = df_topics['retweets_prob'].div(baseline_share)

# human-readable topic label, aligned on the topic id index
df_topics['topic_name'] = topic_name['Name']

df_topics.sort_values('rt_increment', ascending=False)


Unnamed: 0_level_0,original,tweets,retweets,original_prob,tweets_prob,retweets_prob,increment,rt_increment,topic_name
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25.0,509,7100,6591,0.004625,0.015613,0.019120,3.375409,4.133718,25_airpollution_unicef_children_pneumonia
83.0,140,1846,1706,0.001272,0.004059,0.004949,3.190726,3.890078,83_reef_coral_barrier_corals
66.0,209,2592,2383,0.001899,0.005700,0.006913,3.001059,3.639863,66_pollution_cookstoves_delhi_air
41.0,358,4184,3826,0.003253,0.009201,0.011099,2.828099,3.411687,41_kimoon_ban_ki_moon
33.0,429,3960,3531,0.003898,0.008708,0.010243,2.233695,2.627530,33_hottest_record_wmo_year
...,...,...,...,...,...,...,...,...,...
88.0,117,120,3,0.001063,0.000264,0.000009,0.248188,0.008185,88_mysollars_omg_ridley48a_teajunkie1
22.0,529,536,7,0.004807,0.001179,0.000020,0.245186,0.004224,22_easiest_beforetheflood_learn_act
19.0,607,613,6,0.005516,0.001348,0.000017,0.244376,0.003156,19_study_evaporative_climate_cooling
21.0,536,540,4,0.004871,0.001187,0.000012,0.243789,0.002382,21_earn_certificate_offset_footprint


In [49]:
# Same per-topic table, ordered by the number of original tweets (largest first).
# NOTE(review): the saved output under this cell has a `delta` column and lacks
# `retweets_prob` / `rt_increment`, so it appears to predate the current
# df_topics definition — re-run the notebook top to bottom to refresh it.
df_topics.sort_values('original', ascending=False)


Unnamed: 0_level_0,original,tweets,retweets,original_prob,tweets_prob,delta,increment,topic_name
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,44182,164857,120675,0.401498,0.362519,0.038979,0.902917,-1_2a_tcot_climatecounts_development
0.0,18039,73160,55121,0.163927,0.160878,0.003049,0.981402,0_conflictsofinterest_pollutersout_kick_stopco...
1.0,3328,20692,17364,0.030243,0.045502,-0.015259,1.504545,1_renewable_energy_100_renewables
2.0,3199,14819,11620,0.029070,0.032587,-0.003516,1.120961,2_climatejustice_justice_champions_climateaction
3.0,2835,14602,11767,0.025763,0.032110,-0.006347,1.246365,3_agriculture_food_farmers_foodsecurity
...,...,...,...,...,...,...,...,...
93.0,113,359,246,0.001027,0.000789,0.000237,0.768780,93_ndcpartnership_ndcs_ndc_cpi
94.0,112,121,9,0.001018,0.000266,0.000752,0.261429,94_crowdsourced_freewordcentre_cop22poem_words
95.0,111,490,379,0.001009,0.001078,-0.000069,1.068216,95_lighting_philipslight_akon_energyefficient
96.0,109,126,17,0.000991,0.000277,0.000713,0.279724,96_thanking_governors_coast_bold


# multinet