# import and load 

In [2]:
#%%
import networkx as nx
import igraph as ig
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import uunet.multinet as ml
import json


full_network_path = '/Volumes/boot420/Users/data/climate_network/cop22/networks/cop22_retweets.gml'
topics_file = '/Volumes/boot420/Users/data/climate_network/cop22/cache/topics_cop22.csv'
projected_path = '/Volumes/boot420/Users/data/climate_network/cop22/networks/cop22_retweets_ml.gml'

test_network = '/Users/alessiogandelli/dev/internship/tweets-to-topic-network/data/networks/toy_test.gml'

# load network
g = ig.read(full_network_path, format='gml')

mln = ml.read(projected_path)
topic_name = pd.read_csv(topics_file)
topic_label = json.load(open('/Volumes/boot420/Users/data/climate_network/cop22/cache/labels_cop22.json'))

  return reader(f, *args, **kwds)


In [6]:
topic_label

{'-1': 'Climate Change News and Movement',
 '0': 'Climate Change Action',
 '1': 'Climate Change News Updates',
 '2': "Women's Leadership in Climate Change"}

# data preparation

In [3]:
# %%
# subset of nodes : users, tweets, original tweets
tweets = g.vs.select(bipartite=1)
user = g.vs.select(bipartite=0)
original_tweets = g.vs.select(is_retweet='original')

#create dataframe of tweets 
df_tweets = pd.DataFrame({'tweets': tweets['label'], 
                          'author': tweets['author'], 
                          'topic': tweets['topics']}
                          )

 
# create dataframe with original tweets 
original_tweets_indegree = g.degree(original_tweets, mode='in')
df_original = pd.DataFrame({'tweets': original_tweets['label'], 
                                            'indegree': original_tweets_indegree, 
                                            'author': original_tweets['author'],
                                            'topic': original_tweets['topics']})

df_original['indegree'] = df_original['indegree'] - 1 # do not count the edge from the author, only retweets 
df_rt_user = df_original.rename(columns={'indegree': 'retweets'})
# dataframe with users that creates original tweets
df_rt_user = df_rt_user.groupby('author').aggregate({'tweets':'count', 'retweets':'sum'})
df_rt_user['rt_per_tweet'] = round(df_rt_user['retweets'] / df_rt_user['tweets'])

original_user_no_rt = df_rt_user[df_rt_user['retweets'] > 0] # oringal users with 0 retweets 

top_users = {}
for i in [10000,1000,100,10,3]:

    top_users[i] =  df_rt_user.sort_values('retweets', ascending=False).head(i).sum()
    
df_top_users = pd.DataFrame.from_dict(top_users, orient='index')[['tweets', 'retweets']]


In [4]:
#count how many topics a user tweets about
df_tweets.groupby('author').aggregate({'tweets':'count', 'topic':'nunique'}).sort_values('topic', ascending=False).head(100)




Unnamed: 0_level_0,tweets,topic
author,Unnamed: 1_level_1,Unnamed: 2_level_1
PlaneteVivante,179,11
inventwitt,619,11
rjber15,528,11
FabyLizarragaG,227,11
laila_cakes,431,11
...,...,...
IntiqabRawoof,141,10
Habitat4_2036,219,10
gezgintrk,154,10
HabitatCO2lutio,164,10


# stats 

## basic 

Out of 454k tweets, most of them are copies (retweets). we call the tweets that are not retweeted but wrote by someone original tweets.

original tweets are 110k (24%) and retweets are 344k (76%), these original tweets have been written by a subset of all the 130k users, only 19k users (15%) wrote original tweets.

out of the 110k original tweets only 50k have at least one retweet, so there are around 400k tweets that are or original or a copy of the originals

all the original tweets with retweets are make by less than 9k people

we call producers the users that produce tweets and retweeters the ones who spread.

In [5]:
print('Number of users: ', len(user))
print('Number of tweets: ', len(tweets))
print('number of original tweets: ', len(original_tweets))
print('original tweets with retweets: ', len(df_rt_user[df_rt_user['retweets'] > 0]))
print('user that tweeted original tweets: ', len(set(original_tweets['author'])))
print('user that tweeted original tweets with retweets: ', len(set(df_rt_user[df_rt_user['retweets'] > 0])))


Number of users:  134212
Number of tweets:  454754
number of original tweets:  110043
original tweets with retweets:  8690
user that tweeted original tweets:  19583
user that tweeted original tweets with retweets:  3


## top users 

there are few producers (authorities) and many retweeters, it is interesting to notice how taking the n top retweet users we can reach a big portion of the network.

with 10k producers we can generate 454k tweets 
with 100 producers we can generate 218k tweets

find hubs!! users that are retweeting producers a lot 

In [117]:
df_top_users

Unnamed: 0,tweets,retweets
10000,91174.0,344711.0
1000,44785.0,299225.0
100,18819.0,200070.0
10,10470.0,108128.0
3,8116.0,62663.0


In [47]:
topic_name.set_index('Topic', inplace=True)

## topics


analyzing the topic we can investigate the ones that are more retweeted and the one more produce, we can see that topics related to air pollution are the most retweeted 

to achieve this i calculate the percentage increment on the share each topic have in the original set of tweets versus the all set of tweets 

In [118]:
original_topic = df_original.value_counts('topic')
tweets_topic = df_tweets.value_counts('topic')

df_topics = pd.DataFrame({'original': original_topic, 'tweets': tweets_topic})
df_topics['retweets'] = df_topics['tweets'] - df_topics['original']
df_topics['original_prob'] =  df_topics['original'] / df_topics['original'].sum() 
df_topics['tweets_prob'] =  df_topics['tweets'] / df_topics['tweets'].sum()
df_topics['retweets_prob'] =  df_topics['retweets'] / df_topics['retweets'].sum()

df_topics['increment'] =  df_topics['tweets_prob'] / df_topics['original_prob']
df_topics['rt_increment'] = df_topics['retweets_prob'] / df_topics['original_prob']
# take the topic name from the dictionary topic_labels keys should match the index of df_topics
topic_label = {int(k): v for k, v in topic_label.items()}
df_topics['topic_name'] =  df_topics.index.map(topic_label)


df_topics.sort_values('rt_increment', ascending=False)


Unnamed: 0_level_0,original,tweets,retweets,original_prob,tweets_prob,retweets_prob,increment,rt_increment,topic_name
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5.0,2570,15443,12873,0.023355,0.033959,0.037344,1.454067,1.59902,Climate Smart Agriculture and Food Security
8.0,1296,7064,5768,0.011777,0.015534,0.016733,1.31896,1.420782,Gender Equality & Women's Climate Justice
7.0,1905,10239,8334,0.017311,0.022515,0.024177,1.300614,1.39658,African Climate Change
9.0,1255,6542,5287,0.011405,0.014386,0.015337,1.2614,1.344847,Deforestation & Forests
4.0,2633,13700,11067,0.023927,0.030126,0.032105,1.259087,1.341795,Climate Change News
2.0,4912,24773,19861,0.044637,0.054476,0.057616,1.220411,1.290773,Climate Change Impacts & Solutions
6.0,2425,10922,8497,0.022037,0.024017,0.02465,1.089874,1.118565,Paris Agreement Ratification and Implementation
1.0,15038,66910,51872,0.136656,0.147134,0.15048,1.07668,1.101159,Climate Finance at COP22
0.0,15237,62055,46818,0.138464,0.136458,0.135818,0.985515,0.980891,COP22 Climate Action
-1.0,58735,232421,173686,0.533746,0.511092,0.50386,0.957556,0.944007,Climate Change Action


In [110]:
#keys as int


In [113]:
df_topics.sort_values('original', ascending=False)


Unnamed: 0_level_0,original,tweets,retweets,original_prob,tweets_prob,retweets_prob,increment,rt_increment,topic_name
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1.0,58735,232421,173686,0.533746,0.511092,0.50386,0.957556,0.944007,Climate Change Action
0.0,15237,62055,46818,0.138464,0.136458,0.135818,0.985515,0.980891,COP22 Climate Action
1.0,15038,66910,51872,0.136656,0.147134,0.15048,1.07668,1.101159,Climate Finance at COP22
2.0,4912,24773,19861,0.044637,0.054476,0.057616,1.220411,1.290773,Climate Change Impacts & Solutions
3.0,4037,4685,648,0.036686,0.010302,0.00188,0.280826,0.051242,Renewable Energy Transition
4.0,2633,13700,11067,0.023927,0.030126,0.032105,1.259087,1.341795,Climate Change News
5.0,2570,15443,12873,0.023355,0.033959,0.037344,1.454067,1.59902,Climate Smart Agriculture and Food Security
6.0,2425,10922,8497,0.022037,0.024017,0.02465,1.089874,1.118565,Paris Agreement Ratification and Implementation
7.0,1905,10239,8334,0.017311,0.022515,0.024177,1.300614,1.39658,African Climate Change
8.0,1296,7064,5768,0.011777,0.015534,0.016733,1.31896,1.420782,Gender Equality & Women's Climate Justice


# multinet

In [54]:
import uunet.multinet as ml
projected_path = '/Volumes/boot420/Users/data/climate_network/cop22/networks/cop22_retweets_ml.gml'

mln = ml.read(projected_path)


In [71]:
ml_summary = ml.summary(mln)

In [75]:
pd.DataFrame.from_dict(ml_summary)

Unnamed: 0,layer,n,m,dir,nc,slc,dens,cc,apl,dia
0,72.0,561,802,True,556,2,0.002553,0.136285,1.000000,1
1,14.0,2821,5304,True,2752,48,0.000667,0.268215,3.684397,9
2,55.0,699,1021,True,681,9,0.002093,0.323374,1.986111,4
3,37.0,992,1011,True,992,1,0.001028,0.225000,0.000000,0
4,13.0,643,669,True,641,2,0.001621,0.352273,1.000000,1
...,...,...,...,...,...,...,...,...,...,...
94,70.0,325,325,True,325,1,0.003086,0.000000,0.000000,0
95,68.0,488,471,True,482,3,0.001982,0.010169,1.333333,2
96,54.0,553,738,True,541,8,0.002418,0.031219,2.071429,3
97,87.0,127,325,True,127,1,0.020310,0.197647,0.000000,0


In [79]:
deg = ml.degree(mln)
act = ml.actors(mln)

degrees = [ [deg[i], act[i]] for i in range(len(deg)) ]
degrees.sort(reverse = True) 

top_actors = []
for el in degrees[0:10]: 
      top_actors.append(el[1])

layer_deg = dict()
layer_deg["actor"] = top_actors
for layer in ml.layers(mln):
    layer_deg[layer] = ml.degree(mln, actors = top_actors, layers = [layer] )
    
pd.DataFrame.from_dict(layer_deg)

Unnamed: 0,actor,72.0,14.0,55.0,37.0,13.0,93.0,53.0,3.0,69.0,...,66.0,58.0,4.0,74.0,1.0,70.0,68.0,54.0,87.0,94.0
0,UNFCCC,2.0,321.0,30.0,2.0,,64.0,1.0,432.0,277.0,...,,133.0,839.0,18.0,755.0,,,183.0,52.0,
1,UN,1.0,329.0,,3.0,4.0,,,18.0,1.0,...,3.0,1.0,326.0,,458.0,,,1.0,51.0,
2,COP22,76.0,127.0,4.0,,,2.0,6.0,152.0,,...,7.0,13.0,154.0,1.0,213.0,,4.0,,,
3,UNDP,,17.0,,,43.0,15.0,,1154.0,1.0,...,133.0,,931.0,,228.0,,,,,
4,pablorodas,8.0,97.0,11.0,478.0,351.0,,,86.0,,...,47.0,17.0,19.0,2.0,21.0,324.0,,3.0,3.0,
5,UNICEF,,,,,,,,,,...,1202.0,,,,,,,,,
6,UNEP,,3.0,33.0,,,,,,12.0,...,302.0,1.0,79.0,24.0,234.0,,,,1.0,
7,WorldBank,,20.0,113.0,,,6.0,,278.0,1.0,...,,3.0,2.0,,374.0,,,,,
8,LeoDiCaprio,,,,,,,1.0,,,...,,,,,2922.0,,,1.0,,
9,sustainable_4rt,,18.0,13.0,,22.0,,2.0,115.0,8.0,...,15.0,14.0,85.0,10.0,180.0,,,9.0,,


In [122]:
# user with most retweets
topic_label

{-1: 'Climate Change Action',
 0: 'COP22 Climate Action',
 1: 'Climate Finance at COP22',
 2: 'Climate Change Impacts & Solutions',
 3: 'Renewable Energy Transition',
 4: 'Climate Change News',
 5: 'Climate Smart Agriculture and Food Security',
 6: 'Paris Agreement Ratification and Implementation',
 7: 'African Climate Change',
 8: "Gender Equality & Women's Climate Justice",
 9: 'Deforestation & Forests',
 10: 'Water Security and Climate Change'}

In [123]:
df_original[df_original['topic'] == 5].sort_values('indegree', ascending=False).head(10)

Unnamed: 0,tweets,indegree,author,topic
87614,794266760744763392,374,UNFCCC,5.0
29690,798870486759407616,336,JohnKerry,5.0
81116,795611528997244928,265,UN,5.0
86994,794481733563482112,248,UNEP,5.0
85817,794641346267148288,245,UNFCCC,5.0
86801,794505196554620928,229,EU_Commission,5.0
12136,800155458300481536,182,nature_org,5.0
86614,794530920632090624,181,UNFCCC,5.0
96810,784107739819769859,173,ConservationOrg,5.0
83795,795324765095477248,148,EU_Commission,5.0


In [126]:
tweets['text']

['2,000-5000 liters of #water are needed to produce a person&#8217;s daily food #COP22&#160;&#8230; https://t.co/mdld9OjxXS https://t.co/vpWqHYytyI',
 '2,000-5000 liters of #water are needed to produce a person&#8217;s daily food #COP22&#160;&#8230; https://t.co/DrfrEuPMev https://t.co/28EYhLy4Ae',
 '2,000-5000 liters of #water are needed to produce a person&#8217;s daily food #COP22&#160;&#8230; https://t.co/YAs69AtLAE https://t.co/KVKamz5bCK',
 '#CLIMATEchange #p2 RT Top 10 HAPPY environmental stories of 2016. https://t.co/rbTENUUZI5 #COP22 #tcot #2A https://t.co/C4owMuusW5',
 '#CLIMATEchange #p2 RT Our best environment stories from this past&#160;year. https://t.co/f5lY9m4sE2 #COP22 #tcot #2A https://t.co/dFF8I5vESR',
 "My only wish for 2017: that Trump won't destroy the world. #p2 #Uniteblue #tcot #tlot #GOP #feelthebern #Obama #POTUS #climatechange #COP22",
 '#CLIMATEchange #p2 RT  How much fossil fuel has been used in your lifetime? https://t.co/3xOS9PxvHe #COP22 #tcot #2A https: