In [2]:
## Imports

import pandas as pd
import networkx as nx
import itertools
import numpy as np

### Load the data

In [303]:
## Title hyperlinks
title_links = pd.read_csv("Data/soc-redditHyperlinks-title.tsv",'\t')
## Body hyperlinks
body_links = pd.read_csv("Data/soc-redditHyperlinks-body.tsv",'\t')

### Take a look at the data

In [304]:
title_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,rddtgaming,rddtrust,1u4pzzs,2013-12-31 16:39:18,1,"25.0,23.0,0.76,0.0,0.44,0.12,0.12,4.0,4.0,0.0,..."
1,xboxone,battlefield_4,1u4tmfs,2013-12-31 17:59:11,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
2,ps4,battlefield_4,1u4tmos,2013-12-31 17:59:40,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
3,fitnesscirclejerk,leangains,1u50xfs,2013-12-31 19:01:56,1,"49.0,43.0,0.775510204082,0.0,0.265306122449,0...."
4,fitnesscirclejerk,lifeprotips,1u51nps,2013-12-31 21:02:28,1,"14.0,14.0,0.785714285714,0.0,0.428571428571,0...."


In [305]:
body_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


## the data is presented is as follows

- SOURCE_SUBREDDIT: the subreddit where the link originates
- TARGET_SUBREDDIT: the subreddit where the link ends
- POST_ID: the post in the source subreddit that starts the link
- TIMESTAMP: time time of the post
- POST_LABEL: label indicating if the source post is explicitly negative towards the target post. The value is -1 if the source is negative towards the target, and 1 if it is neutral or positive. The label is created using crowd-sourcing and training a text based classifier, and is better than simple sentiment analysis of the posts. Please see the reference paper for details.
- POST_PROPERTIES: a vector representing the text properties of the source post, listed as a list of comma separated numbers. The vector elements are the following:
    1. Number of characters
    2. Number of characters without counting white space
    3. Fraction of alphabetical characters
    4. Fraction of digits
    5. Fraction of uppercase characters
    6. Fraction of white spaces
    7. Fraction of special characters, such as comma, exclamation mark, etc.
    8. Number of words
    9. Number of unique works
    10. Number of long words (at least 6 characters)
    11. Average word length
    12. Number of unique stopwords
    13. Fraction of stopwords
    14. Number of sentences
    15. Number of long sentences (at least 10 words)
    16. Average number of characters per sentence
    17. Average number of words per sentence
    18. Automated readability index
    19. Positive sentiment calculated by VADER
    20. Negative sentiment calculated by VADER
    21. Compound sentiment calculated by VADER
    22. LIWC_Funct
    23. LIWC_Pronoun
    24. LIWC_Ppron
    25. LIWC_I
    26. LIWC_We
    27. LIWC_You
    28. LIWC_SheHe
    29. LIWC_They
    30. LIWC_Ipron
    31. LIWC_Article
    32. LIWC_Verbs
    33. LIWC_AuxVb
    34. LIWC_Past
    35. LIWC_Present
    36. LIWC_Future
    37. LIWC_Adverbs
    38. LIWC_Prep
    39. LIWC_Conj
    40. LIWC_Negate
    41. LIWC_Quant
    42. LIWC_Numbers
    43. LIWC_Swear
    44. LIWC_Social
    45. LIWC_Family
    46. LIWC_Friends
    47. LIWC_Humans
    48. LIWC_Affect
    49. LIWC_Posemo
    50. LIWC_Negemo
    51. LIWC_Anx
    52. LIWC_Anger
    53. LIWC_Sad
    54. LIWC_CogMech
    55. LIWC_Insight
    56. LIWC_Cause
    57. LIWC_Discrep
    58. LIWC_Tentat
    59. LIWC_Certain
    60. LIWC_Inhib
    61. LIWC_Incl
    62. LIWC_Excl
    63. LIWC_Percept
    64. LIWC_See
    65. LIWC_Hear
    66. LIWC_Feel
    67. LIWC_Bio
    68. LIWC_Body
    69. LIWC_Health
    70. LIWC_Sexual
    71. LIWC_Ingest
    72. LIWC_Relativ
    73. LIWC_Motion
    74. LIWC_Space
    75. LIWC_Time
    76. LIWC_Work
    77. LIWC_Achiev
    78. LIWC_Leisure
    79. LIWC_Home
    80. LIWC_Money
    81. LIWC_Relig
    82. LIWC_Death
    83. LIWC_Assent
    84. LIWC_Dissent
    85. LIWC_Nonflu
    86. LIWC_Filler    

###  Note that POST_PROPERTIES were constructed in order to assign a sign (-1 or 1) to the directed link betweet the SOURCE_SUBREDDIT and TARGET_SUBREDDIT.
### These signs are contained in POST_LABEL. Thus, we will basically need only those three features


In [306]:
## Drop unwanted features 
title_links = title_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT","TIMESTAMP"]]
body_links = body_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT","TIMESTAMP"]]


## Print some characteristics for Title links
print(20*'=')
print('Title links :')
title_source_subreddits = title_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the title links dataset:\t {len(title_source_subreddits)}')
title_source_set = set(title_source_subreddits)
print(f'The number of unique source subreddits: \t {len(title_source_set)}')
title_subreddits = set(title_links.SOURCE_SUBREDDIT.values.tolist() + title_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the title dataset:\t {len(title_subreddits)}')
print(20*'=')

## Print some characteristics for Body links
print(20*'=')
print("Body links :")
body_source_subreddits = body_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the body links dataset: \t {len(body_source_subreddits)}')
body_source_set = set(body_source_subreddits)
print(f'The number of unique source subreddits : \t {len(body_source_set)}')
body_subreddits = set(body_links.SOURCE_SUBREDDIT.values.tolist() + body_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the body dataset :\t {len(body_subreddits)}')
print(20*'=')

Title links :
The number of links in the title links dataset:	 571927
The number of unique source subreddits: 	 43695
the set of all subreddits in the title dataset:	 54075
Body links :
The number of links in the body links dataset: 	 286561
The number of unique source subreddits : 	 27863
the set of all subreddits in the body dataset :	 35776


In [308]:
## Here we concatenate the both dataframes to create a unique dataset of connexions between subreddits.
Data = pd.concat([title_links,body_links])
Data = Data.rename({"LINK_SENTIMENT":"Sign","TIMESTAMP":'Date'},axis=1)
print("All subreddit links :")
source_subreddits = Data.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in all data: \t {len(source_subreddits)}')
source_set = set(source_subreddits)
print(f'The number of unique source subreddits:  {len(source_set)}')
subreddits = set(Data.SOURCE_SUBREDDIT.values.tolist() + Data.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits :\t\t {len(subreddits)}')

All subreddit links :
The number of links in all data: 	 858488
The number of unique source subreddits:  55863
the set of all subreddits :		 67180


In [12]:
## Let's see the number and pourcentages of positive and negatives links
number_pos_links = Data.where(Data.Sign==1).count()[0]
number_neg_links = Data.where(Data.Sign==-1).count()[0]
print(f'There is {number_pos_links} positive links and {number_neg_links} negative links')

print(f'REDDIT : Pourcentage of (+)edges is {round(number_pos_links*100/len(Data),1)}% and pourcentage of (-)edges is {round(number_neg_links*100/len(Data),1)}%')

There is 776278 positive links and 82210 negative links
REDDIT : Pourcentage of (+)edges is 90.4% and pourcentage of (-)edges is 9.6%


### Comparing these pourcentages to the ones of Epinions or Slashdot datasets, We can see that Reddit has a lot less negatives links than the others. We can state (hypothetize ?) that the interactions betweet subreddits are mostly positive and that negative links are especially for conflicts rather than just negative opinion/vote and of course, conflicts are much less likely.

Now let's create a graph of subreddits links

In [311]:
complete_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = ["Sign","Date"],create_using=nx.DiGraph)

nbr_nodes = complete_graph.number_of_nodes()
nbr_edges = complete_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["Sign"]==1 else 0 for (_,_,w) in complete_graph.edges(data=True))
sum_of_neg = sum(1 if w["Sign"]==-1 else 0 for (_,_,w) in complete_graph.edges(data=True))
pourc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
pourc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos, neg= pourc_of_neg))

REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Pourcentage of (+)edges is 92.5% and pourcentage of (-)edges is 7.5%


We can see that this way of creating the graph led us to dropping an huge number of edges (from 858488 to 339643) while keeping the same number of nodes.

### Since our dataset contains a lot of duplicates edges (Some of those edges may be all positive or all negative or a combination of positive and negative signs). Creating our directed graph directly from those edges, will ommit these duplicates and will take into account only their last occurences. Thus , it will take only the last sign of the link betweet those two subreddits. This will alter our perspectives since in a case where all links but the LAST ONE , between subreddit A and subreddit B were negative, this generated graph will take only the positive link and we will be dropping very important information.

Let's try and generate a multiple directed graph (MultiDiGraph)

In [334]:
complete_multi_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = ["Sign",'Date'],create_using=nx.MultiDiGraph)

nbr_nodes = complete_multi_graph.number_of_nodes()
nbr_edges = complete_multi_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["Sign"]==1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
sum_of_neg = sum(1 if w["Sign"]==-1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
pourc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
pourc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos, neg= pourc_of_neg))

REDDIT:    Nodes = 67180  Edges = 858488
REDDIT : Pourcentage of (+)edges is 90.4% and pourcentage of (-)edges is 9.6%


### We can see now that this graph contains more informations about the data than the other one; It complete and have all edges and nodes

### Since one of our objectives is to compare the similarities and differences between the individuals datasets (Epinions, slashdot and wikipedia) and the communities dataset (Reddit) , apart from this methode of creating a multiple edges directed graph, we decides to generate another graph that , for each multiple signed edges between same two nodes, will create a single edge with the mean of all those signs as unique weight.

In [328]:
## Here we group by (source,target) but instead of keeping all weights, we sum them into one weight
groupedBySource_Target_mean = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).Sign.apply(lambda x: np.mean(x)).to_frame()
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe 
listed_source = [ elem[0] for elem in groupedBySource_Target_mean.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_mean.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_mean.values ]

In [329]:
## We groupby the data using pairs (source,target) and collect all link signs between same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).Date.apply(lambda x: list(x)).to_frame()
groupedBySource_Target_pair.Date = groupedBySource_Target_pair.Date.apply(lambda x: x[len(x)-1])

In [387]:

data_mean = pd.DataFrame({"source":listed_source,"target":listed_target,"Sign":listed_sign,'Date':groupedBySource_Target_pair.Date.apply(lambda x: x[:10].replace('-','/'))}).reset_index().drop(columns=["SOURCE_SUBREDDIT","TARGET_SUBREDDIT"])

graph_mean = nx.from_pandas_edgelist(data_mean,source='source', target="target", edge_attr = ["Sign",'Date'],create_using=nx.DiGraph)

In [29]:
print(f'This meaned graph contains {len(data_mean)} signed edges.')
print(f'The number of edges which mean is equal to 1 is {len(data_mean[data_mean.Sign==1])}')
print(f'The number of edges which mean is equal to -1 is {len(data_mean[data_mean.Sign==-1])}')

This meaned graph contains 339643 signed edges.
The number of edges which mean is equal to 1 is 298473
The number of edges which mean is equal to -1 is 18104


### We can see that the mean sign of the edges by majority 1 or -1. 
### From this, we decided to define "FRIENDS Communities" by the communities which only have **multiple** (ie : >=2) positive edges. and "ENEMIES communities" by the communities which only have **multiple** (ie : >=2) negative edges.

for this, we need to isolate the communities that have multiple edges between them : ie: we can't consider that two communities are enemies just by having only one negative link between them. so we just keep multiple linked communities

In [35]:
## We groupby the data using pairs (source,target) and collect all link signs between same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: list(x)).to_frame()
groupedBySource_Target_pair.LINK_SENTIMENT = groupedBySource_Target_pair.LINK_SENTIMENT.apply(lambda x: dict(zip(np.arange(len(x)), x)))
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe 
listed_source_dict = [ elem[0] for elem in groupedBySource_Target_pair.index]
listed_target_dict = [ elem[1] for elem in groupedBySource_Target_pair.index]
listed_sign_dict = [ elem[0] for elem in groupedBySource_Target_pair.values ]

data_dict_df = pd.DataFrame({"source":listed_source_dict,"target":listed_target_dict,"sign":listed_sign_dict})

In [489]:
indices =[]
for i,elem in zip(range(len(data_dict_df.sign.values)),data_dict_df.sign.values):
    if len(elem)>=4: indices.append(i)

In [490]:
data_multiple_edges = data_dict_df.loc[indices]

In [491]:
data_multiple_edges.re

Unnamed: 0,source,target,sign
22,100daysofketo,keto,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}"
25,100daysofrejection,30daystosortmylifeout,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}"
27,100daysofrejection,howtonotgiveafuck,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: ..."
30,100movies365days,movieaweek,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}"
31,100movies365days,movieclub,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}"
...,...,...,...
339586,zorceror44,writingprompts,"{0: 1, 1: 1, 2: 1, 3: 1}"
339596,zreviews,audiophile,"{0: 1, 1: 1, 2: 1, 3: 1}"
339634,zylooxwrites,writingprompts,"{0: 1, 1: 1, 2: 1, 3: 1}"
339635,zyramains,leagueoflegends,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: ..."


In [492]:
len(set(data_multiple_edges.source.unique()) or set(data_multiple_edges.target.unique()))

9760

In [494]:
multiple_edges_meaned_data = data_mean.loc[indices].copy()

In [509]:
multiple_edges_meaned_data

Unnamed: 0,source,target,Sign,Date
22,100daysofketo,keto,1.0,2016/05/07
25,100daysofrejection,30daystosortmylifeout,1.0,2015/12/01
27,100daysofrejection,howtonotgiveafuck,1.0,2015/11/14
30,100movies365days,movieaweek,1.0,2014/07/12
31,100movies365days,movieclub,1.0,2014/07/12
...,...,...,...,...
339586,zorceror44,writingprompts,1.0,2016/08/25
339596,zreviews,audiophile,1.0,2016/06/25
339634,zylooxwrites,writingprompts,1.0,2015/11/05
339635,zyramains,leagueoflegends,1.0,2016/11/16


In [428]:
enemies_df = multiple_edges_meaned_data[multiple_edges_meaned_data.Sign==-1].copy()
enemies_df

Unnamed: 0,source,target,Sign,Date
4929,againsthatesubreddits,4chan,-1.0,2017/04/05
5009,againsthatesubreddits,gender_critical,-1.0,2017/02/08
5150,againsthatesubreddits,undelete,-1.0,2015/09/11
5392,againstmensrights,atheism,-1.0,2016/12/06
5487,againstmensrights,publicfreakout,-1.0,2017/02/15
...,...,...,...,...
315074,twoxchromosomes,confession,-1.0,2015/06/29
321818,uvajerk,uva,-1.0,2017/01/01
327721,watchredditdie,news,-1.0,2016/12/02
329205,wehraboosinaction,historyporn,-1.0,2016/12/21


In [454]:
set(enemies_df.source.unique()) -  (set(Data[Data.Sign==1].SOURCE_SUBREDDIT.unique()) or set(Data[Data.Sign==1].TARGET_SUBREDDIT.unique()))

{'harpersrecession',
 'mongolianhatewatch',
 'postamericanhistory',
 'shittyokcupid2',
 'the_iowa'}

### We can see here that there is subreddits that initiates only NEGATIVE links to other subreddits

In [458]:
Data[Data.SOURCE_SUBREDDIT=="harpersrecession"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
312910,harpersrecession,canada,-1,2016-01-19 11:58:51
313125,harpersrecession,soylent,-1,2016-01-19 20:33:37
313622,harpersrecession,canada,-1,2016-01-20 18:21:02
314301,harpersrecession,canada,-1,2016-01-21 19:01:54


In [459]:
Data[Data.SOURCE_SUBREDDIT=="mongolianhatewatch"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
235198,mongolianhatewatch,european,-1,2015-08-30 07:52:35
238207,mongolianhatewatch,european,-1,2015-08-30 07:52:35
242467,mongolianhatewatch,european,-1,2015-08-30 07:52:35
273497,mongolianhatewatch,european,-1,2015-11-02 17:55:45
277230,mongolianhatewatch,topmindsofreddit,-1,2015-11-02 22:14:04
277237,mongolianhatewatch,againsthatesubreddits,-1,2015-11-02 22:14:04
279497,mongolianhatewatch,european,-1,2015-11-02 22:14:04


In [463]:
Data[Data.SOURCE_SUBREDDIT=="postamericanhistory"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
333089,postamericanhistory,economics,-1,2016-02-23 03:53:08
337940,postamericanhistory,economics,-1,2016-03-19 14:03:18
340739,postamericanhistory,economics,-1,2016-03-19 14:03:18


In [460]:
Data[Data.SOURCE_SUBREDDIT=="shittyokcupid2"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
467277,shittyokcupid2,okcupid,-1,2016-11-09 03:58:04
472275,shittyokcupid2,okcupid,-1,2016-11-17 08:19:39
475311,shittyokcupid2,okcupid,-1,2016-11-24 06:57:51


In [461]:
Data[Data.SOURCE_SUBREDDIT=="the_iowa"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
543601,the_iowa,iowa,-1,2017-03-18 06:16:24
544359,the_iowa,iowa,-1,2017-03-19 06:18:24
546131,the_iowa,iowa,-1,2017-03-22 09:52:00
547385,the_iowa,iowa,-1,2017-03-24 10:27:57


### It's generally a repetitive negative links in a maximum of one month period of time so we may say that it's a conflict

In [161]:
friends_df = multiple_edges_meaned_data[multiple_edges_meaned_data.sign==1].copy()
friends_df

Unnamed: 0,source,target,sign
1,07scape,osrstranscripts,1.0
6,0magick,occult,1.0
10,0x02,writingprompts,1.0
12,0x10c,techcompliant,1.0
22,100daysofketo,keto,1.0
...,...,...,...
339622,zurich,switzerland,1.0
339629,zxspectrum,retrogaming,1.0
339634,zylooxwrites,writingprompts,1.0
339635,zyramains,leagueoflegends,1.0


In [162]:
print(f'number of source subreddits from which originates the friendships : {len(friends_df.source.unique())}')
print(f'number of source subreddits from which originates the conflicts : {len(enemies_df.source.unique())}')

number of source subreddits from which originates the friendships : 18460
number of source subreddits from which originates the conflicts : 433


In [163]:
indices_source_enemies = []
for i,elem in zip(range(len(Data)),Data.SOURCE_SUBREDDIT.values):
    if elem in enemies_df.source.unique() : indices_source_enemies.append(i)
        
edges_originated_from_enemies_source = Data.loc[indices_source_enemies].copy()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """


In [164]:
edges_originated_from_enemies_source=edges_originated_from_enemies_source.dropna()
negative_edges_originated_from_enemies_source = edges_originated_from_enemies_source[edges_originated_from_enemies_source.LINK_SENTIMENT==-1]
negative_edges_originated_from_enemies_source

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,LINK_SENTIMENT
16,subredditdrama,nfl,-1.0
52,shitstatistssay,foodforthought,-1.0
53,casualiama,teenagers,-1.0
68,metaphotography,photography,-1.0
89,sjsucks,truechristian,-1.0
...,...,...,...
571866,badwomensanatomy,askreddit,-1.0
571891,peoplewhosayheck,sex,-1.0
571908,peoplewhosayheck,divorce,-1.0
571912,conspiracy,truereddit,-1.0


In [165]:
all_negative_signs = Data[Data.LINK_SENTIMENT==-1].copy()
all_negative_signs

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,LINK_SENTIMENT
16,subredditdrama,nfl,-1
20,libertyworldproblems,bitcoin,-1
52,shitstatistssay,foodforthought,-1
69,childfree,adviceanimals,-1
80,levantinewar,worldpolitics,-1
...,...,...,...
286475,badpolitics,bannedfromthe_donald,-1
286491,tipofmytongue,deathcore,-1
286501,soundcloud,procss,-1
286523,enoughtrumpspam,humansbeingbros,-1


In [487]:
len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) or set(all_negative_signs.TARGET_SUBREDDIT.unique()))

8698

#### Here we compute the pourcentage of negative edges that originated from the set of source subreddits that led to more than 2 conflicts comparing to all negative links.

In [166]:
100 * len(negative_edges_originated_from_enemies_source) / len(all_negative_signs)

57.623160199489114

#### Here we compute the pourcentage of source subreddits that led to more than 3 conflicts comparing to the set of all source subreddits from which originated a negative link

In [167]:
100 * len(enemies_df.source.unique()) / len(all_negative_signs.SOURCE_SUBREDDIT.unique())

4.978155897907565

In [168]:
indices = []
for i,elem in zip(range(len(all_negative_signs)),all_negative_signs.SOURCE_SUBREDDIT.values):
    if elem in set(enemies_df.source.unique()):
        indices.append(i)

In [169]:
negative_links_originated_from_enemies_df = all_negative_signs.loc[indices].copy()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


#### Here we compute the pourcentage of negative links originated from subreddits with more than 2 conflicts comparing to the set of all negative links

In [170]:
100*len(negative_links_originated_from_enemies_df)/len(all_negative_signs)

61.54725702469286

In [171]:
indices = []
for i,source,target in zip(range(len(Data)),Data.SOURCE_SUBREDDIT.values, Data.TARGET_SUBREDDIT.values):
    if (source in enemies_df.source.unique() or target in enemies_df.source.unique()):
        indices.append(i)

In [172]:
links_sourceORtarget_sourceConfilcts = Data.loc[indices].dropna()
links_sourceORtarget_sourceConfilcts

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,LINK_SENTIMENT
3,fitnesscirclejerk,leangains,1.0
3,nfl,cfb,1.0
4,fitnesscirclejerk,lifeprotips,1.0
4,playmygame,gamedev,1.0
7,bestoftldr,tifu,1.0
...,...,...,...
571914,subredditdrama,alpinism,1.0
571919,anarcho_capitalism,truereddit,-1.0
571920,peoplewhosayheck,dfsports,1.0
571923,peoplewhosayheck,spiderman,1.0


In [173]:
count = 0
for elem in set(all_negative_signs.SOURCE_SUBREDDIT.unique()):
    if elem in set(all_negative_signs.TARGET_SUBREDDIT.unique()):
        count+=1

In [174]:
100*len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) and set(all_negative_signs.TARGET_SUBREDDIT.unique())) / len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()))

76.2819038859508

### We cay say that if a subreddit initiated a negative link to another subreddit, it will be implicated at 76% in a conflict as a target.

In [175]:
100*len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) or set(all_negative_signs.TARGET_SUBREDDIT.unique())) / len(set(Data.SOURCE_SUBREDDIT.unique()) or set(Data.TARGET_SUBREDDIT.unique()))

15.570234323255105

### We can say that from 15.5% of the nodes originates all the conflicts between subreddit communities

## Now let's take a look at the triads in the two different graphs

In [176]:
def compute_count_wanted_triads(graph):
    triads = nx.triadic_census(graph)
    total_triads = triads['030T']+triads['030C']
    total_triads= total_triads+(2*(triads['120U']+triads['120D']+triads['120C']))
    total_triads=total_triads+(4*triads['210'])+(8*triads['300'])
    return total_triads

In [178]:
total_multigraph_triads = compute_count_wanted_triads(complete_multi_graph)
total_meanedgraph_triads = compute_count_wanted_triads(graph_mean)

In [181]:
print(20*'=')
print('MEANED WEIGHTS DIRECTED GRAPH')
nbr_nodes_mean = graph_mean.number_of_nodes()
nbr_edges_mean = graph_mean.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes_mean," Edges =",nbr_edges_mean)


sum_of_pos_mean = sum(1 if w["sign"]>0 else 0 for (_,_,w) in graph_mean.edges(data=True))
sum_of_neg_mean = sum(1 if w["sign"]<0 else 0 for (_,_,w) in graph_mean.edges(data=True))
pourc_of_pos_mean = round(100 *sum_of_pos_mean/nbr_edges_mean,1)
pourc_of_neg_mean = round(100 *sum_of_neg_mean/nbr_edges_mean,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos_mean, neg= pourc_of_neg_mean))

print(20*'=')
print('MULTI EDGED DIRECTED GRAPH')
nbr_nodes_multi_graph = complete_multi_graph.number_of_nodes()
nbr_edges_multi_graph = complete_multi_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes_multi_graph," Edges =",nbr_edges_multi_graph)


sum_of_pos_multi_graph = sum(1 if w["LINK_SENTIMENT"]>0 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
sum_of_neg_multi_graph = sum(1 if w["LINK_SENTIMENT"]<0 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
pourc_of_pos_multi_graph = round(100 *sum_of_pos_multi_graph/nbr_edges_multi_graph,1)
pourc_of_neg_multi_graph = round(100 *sum_of_neg_multi_graph/nbr_edges_multi_graph,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos_multi_graph, neg= pourc_of_neg_multi_graph))

MEANED WEIGHTS DIRECTED GRAPH
REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Pourcentage of (+)edges is 92.6% and pourcentage of (-)edges is 5.7%
MULTI EDGED DIRECTED GRAPH
REDDIT:    Nodes = 67180  Edges = 858488
REDDIT : Pourcentage of (+)edges is 90.4% and pourcentage of (-)edges is 9.6%


In [182]:
output_df = pd.DataFrame(columns=["MultiGraph","meaned_weights"])
output_df=output_df.append(pd.DataFrame([["{:,}".format(nbr_nodes_multi_graph),"{:,}".format(nbr_nodes_mean)]],columns=["MultiGraph","meaned_weights"],index= {"Nodes"}))
output_df=output_df.append(pd.DataFrame([["{:,}".format(nbr_edges_multi_graph),"{:,}".format(nbr_edges_mean)]],columns=["MultiGraph","meaned_weights"],index= {"Edges"}))
output_df=output_df.append(pd.DataFrame([[str(pourc_of_pos_multi_graph)+'%',str(pourc_of_pos_mean)+'%']],columns=["MultiGraph","meaned_weights"],index={"+ edges"}))
output_df=output_df.append(pd.DataFrame([[str(pourc_of_neg_multi_graph)+"%",str(pourc_of_neg_mean)+'%']],columns=["MultiGraph","meaned_weights"],index={"- edges"}))
output_df=output_df.append(pd.DataFrame([["{:,}".format(total_multigraph_triads),"{:,}".format(total_meanedgraph_triads)]],columns=["MultiGraph","meaned_weights"],index={"Triads"}))
output_df

Unnamed: 0,MultiGraph,meaned_weights
Nodes,67180,67180
Edges,858488,339643
+ edges,90.4%,92.6%
- edges,9.6%,5.7%
Triads,4077337,4077337


## Now we will generate both the predictions of status with respect to the generative and receptive surprise , and the predictions of structural balance for EPINIONS dataset and compare its results with the results of REDDIT dataset. We will also create visualizations for our datastory.

In [281]:
## Helper function used to parse wikipedia data

def custom_parsing(path):
    ## Initializing everything before parsing
    
    result_list = []

    to_node = None
    from_node = None
    sign = None

    ## Opening the file
    ## Chose encoding="iso8859_16" as simple "UTF-8" gave me errors
    with open(path, 'r', encoding="iso8859_16") as f:
        
        ## For each line ... 
        for line in f:
            ## Split the line by " "
            splitted = line.split()

            ## If empty line, continue
            if(len(splitted) == 0):
                continue

            ## If this is a "U" line ...
            elif(splitted[0] == 'U'):
                ## Take the id of the nominated user
                to_node = int(splitted[1])

            ## If this is a "V" line ...
            elif(splitted[0] == 'V'):
                ## Take the sign of the vote
                sign = int(splitted[1])
                
                ## Take the id of voter
                from_node = int(splitted[2])
                
                date = splitted[3]
                date = date.replace('-', '/')
                
                ## If the vote was neutral, don't take it (continue)
                ## Else store the line in the intermediary list
                if(sign == 0):
                    continue
                else:
                    result_list.append([from_node, to_node, sign, date])
            
            ## If this is any other kind of line, continue
            else:
                continue

    ## Converting the intermediary list into a dataframe and name columns correctly
    result_df = pd.DataFrame(result_list, columns=['FromNodeId', 'ToNodeId', 'Sign', 'Date'])
    
    return result_df

In [284]:
## Loading data and sorting

epinions_path = "Data/user_rating.txt"
# epinions_path = "data/soc-sign-epinions.txt"

epinions_df = pd.read_csv(epinions_path, sep="\t", header=None, 
                          comment="#", names=['FromNodeId', 'ToNodeId', 'Sign', 'Date'])
epinions_df = epinions_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

slashdot_df = pd.read_csv("data/soc-sign-Slashdot090221.txt", sep="\t", header=None, 
                          comment="#", names=['FromNodeId', 'ToNodeId', 'Sign'])
slashdot_df = slashdot_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

wikipedia_df = custom_parsing("data/wikiElec.ElecBs3.txt")
wikipedia_df = wikipedia_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

In [285]:
epinions_graph = nx.from_pandas_edgelist(epinions_df, source="FromNodeId", target="ToNodeId", 
                                         edge_attr=["Sign", "Date"], create_using=nx.DiGraph)

slashdot_graph = nx.from_pandas_edgelist(slashdot_df, source="FromNodeId", target="ToNodeId", 
                                         edge_attr="Sign", create_using=nx.DiGraph)

wikipedia_graph = nx.from_pandas_edgelist(wikipedia_df, source="FromNodeId", target="ToNodeId", 
                                          edge_attr=["Sign", "Date"], create_using=nx.DiGraph)

In [291]:
C_LINKS_TYPES = {
    tuple(sorted([('vw', 1), ('wu', 1)])) : 't1',
    tuple(sorted([('vw', 1), ('wu', -1)])) : 't2',
    tuple(sorted([('vw', 1), ('uw', 1)])) : 't3',
    tuple(sorted([('vw', 1), ('uw', -1)])) : 't4',
    tuple(sorted([('vw', -1), ('wu', 1)])) : 't5',
    tuple(sorted([('vw', -1), ('wu', -1)])) : 't6',
    tuple(sorted([('vw', -1), ('uw', 1)])) : 't7',
    tuple(sorted([('vw', -1), ('uw', -1)])) : 't8', 
    tuple(sorted([('wv', 1), ('wu', 1)])) : 't9',
    tuple(sorted([('wv', 1), ('wu', -1)])) : 't10',
    tuple(sorted([('wv', 1), ('uw', 1)])) : 't11',
    tuple(sorted([('wv', 1), ('uw', -1)])) : 't12',
    tuple(sorted([('wv', -1), ('wu', 1)])) : 't13',
    tuple(sorted([('wv', -1), ('wu', -1)])) : 't14',
    tuple(sorted([('wv', -1), ('uw', 1)])) : 't15',
    tuple(sorted([('wv', -1), ('uw', -1)])) : 't16',
}

In [292]:
def c_links_census(graph):
    census = {
        f't{i}': {
            '+': 0,
            '-': 0
    } for i in range(1, 17)}
    
    census_edges = {
        f't{i}' : [] for i in range(1, 17)
    }
    
    c_links_names = [f't{i}' for i in range(1, 17)]
    
    census = pd.DataFrame(0, index = pd.Index(c_links_names), columns = ['+', '-']) 
    
    for v in graph:
        vnbrs = set(graph.succ[v])
        
        for u in vnbrs:
            neighbors_all = {
                ('wv', 'wu'): (set(graph.pred[v]) & set(graph.pred[u])) - {u, v},
                ('wv', 'uw'): (set(graph.pred[v]) & set(graph.succ[u])) - {u, v},
                ('vw', 'wu'): (vnbrs & set(graph.pred[u])) - {u, v},
                ('vw', 'uw'): (vnbrs & set(graph.succ[u])) - {u, v}
            }
            
            for key, neighbors in neighbors_all.items():
                for w in neighbors:
                    mapping = {
                            'wv': (w, v),
                            'vw': (v, w),
                            'wu': (w, u),
                            'uw': (u, w)
                        }
                    
                    edge_1 = mapping[key[0]]
                    edge_2 = mapping[key[1]]
                    
                    if(graph[v][u]['Date'] > graph[edge_1[0]][edge_1[1]]['Date'] and 
                       graph[v][u]['Date'] > graph[edge_2[0]][edge_2[1]]['Date']):

                        c_link_index = []

                        sign_edge_1 = graph[edge_1[0]][edge_1[1]]['Sign']
                        sign_edge_2 = graph[edge_2[0]][edge_2[1]]['Sign']

                        c_link_index.append((key[0], sign_edge_1))
                        c_link_index.append((key[1], sign_edge_2))

                        c_link_type = C_LINKS_TYPES[tuple(sorted(c_link_index))]

                        vu_sign = graph[v][u]['Sign']

                        sign_idx = '+' if vu_sign == 1 else '-'
                        
#                         not_allowed.add(tuple(sorted([v, u, w])))

                        census.loc[c_link_type][sign_idx] += 1
                        census_edges[c_link_type].append((v, u))
                    
    return census, census_edges

In [293]:
def compute_baseline(graph, list_of_c_links, gen_or_rec):
    
    if(gen_or_rec != 'generative' and gen_or_rec != 'receptive'):
        raise ValueError('Impossible value for gen_or_rec argument !')
    
    sum_of_baselines = 0
    
    if(gen_or_rec == 'generative'):
        computed_baseline = {needed_node: None for (needed_node, useless_node) in list_of_c_links}
    else:
        computed_baseline = {needed_node: None for (useless_node, needed_node) in list_of_c_links}

    for c_link in list_of_c_links:
        
        baseline = 0
        
        if(gen_or_rec == 'generative'):
            v = c_link[0]
            succ = graph.succ[v]

            if(computed_baseline[v] is None):
                list_of_edges = [(v, successor) for successor in succ]
                list_of_positive_edges = [
                    (v, successor) for successor in succ if graph[v][successor]['Sign'] == 1
                ]
                
                total_edges = len(list_of_edges)
                total_positive_edges = len(list_of_positive_edges)
                
                baseline = total_positive_edges/total_edges
                
                computed_baseline[v] = baseline
            else:
                baseline = computed_baseline[v]
            
        else:
            u = c_link[1]
            pred = graph.pred[u]
            
            if(computed_baseline[u] is None):
                list_of_edges = [(predecessor, u) for predecessor in pred]
                list_of_positive_edges = [
                    (predecessor, u) for predecessor in pred if graph[predecessor][u]['Sign'] == 1
                ]
                
                total_edges = len(list_of_edges)
                total_positive_edges = len(list_of_positive_edges)
                
                baseline = total_positive_edges/total_edges
                
                computed_baseline[u] = baseline
            else:
                baseline = computed_baseline[u]
        
        sum_of_baselines += baseline
        
    return sum_of_baselines

In [294]:
PRED_B_g = [1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1]
PRED_B_r = [1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1]
PRED_S_g = [1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1]
PRED_S_r = [1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1]

def build_predictions(c_link_census, edges_per_c_link, graph):
    predictions = c_link_census
    predictions['count'] = predictions['+'] + predictions['-']
    predictions['p(+)'] = predictions['+'] / predictions['count']
    
    c_links_names = [key for key in edges_per_c_link.keys()]
    baselines = pd.DataFrame(index = pd.Index(c_links_names), columns = ['generative', 'receptive'])
    
    for c_link_type in edges_per_c_link.keys():    
        list_of_c_links = list(edges_per_c_link[c_link_type])

        print(80 * '=')
        print(c_link_type)
        print('Beginning generative baseline ...')

        baselines.loc[c_link_type]['generative'] = compute_baseline(graph, list_of_c_links, 'generative')

        print('Generative baseline finished !')
        print('Beginning receptive baseline ...')

        baselines.loc[c_link_type]['receptive'] = compute_baseline(graph, list_of_c_links, 'receptive')

        print('Receptive baseline finished !')
        print(80 * '=')
    
    baselines = baselines.astype(float)
    predictions = pd.concat([predictions, baselines], axis=1)
    
    predictions['s_g'] = predictions['count']*predictions['p(+)'] - predictions['generative']
    predictions['s_g'] = predictions['s_g'] / np.sqrt(predictions['generative'] * (1 - (predictions['generative']/predictions['count'])))

    predictions['s_r'] = predictions['count']*predictions['p(+)'] - predictions['receptive']
    predictions['s_r'] = predictions['s_r'] / np.sqrt(predictions['receptive'] * (1 - (predictions['receptive']/predictions['count'])))
    
    predictions['pred_B_g'] = PRED_B_g
    predictions['pred_B_r'] = PRED_B_r
    predictions['pred_S_g'] = PRED_S_g
    predictions['pred_S_r'] = PRED_S_r
    
    predictions['B_g'] = predictions['pred_B_g'] * predictions['s_g'] > 0 
    predictions['B_r'] = predictions['pred_B_r'] * predictions['s_r'] > 0
    predictions['S_g'] = predictions['pred_S_g'] * predictions['s_g'] > 0
    predictions['S_r'] = predictions['pred_S_r'] * predictions['s_r'] > 0
    
    final_df = predictions[['count', 'p(+)', 's_g', 's_r', 'B_g', 'B_r', 'S_g', 'S_r']]
    
    total_series = [final_df['count'].sum(), np.nan, np.nan, np.nan, 
                    final_df['B_g'].sum(), final_df['B_r'].sum(), final_df['S_g'].sum(), final_df['S_r'].sum()]
    total_series = pd.Series(total_series, name='total', index=pd.Index(final_df.columns))
    total_series = total_series.to_frame().transpose()
    
    final_df = pd.concat([final_df, total_series], axis=0)
    
    final_df['count'] = final_df['count'].astype(int)
    final_df['p(+)'] = round(final_df['p(+)'], 3)
    final_df['s_g'] = round(final_df['s_g'], 1)
    final_df['s_r'] = round(final_df['s_r'], 1)
    final_df['B_g'] = final_df['B_g'].astype(int)
    final_df['B_r'] = final_df['B_r'].astype(int)
    final_df['S_g'] = final_df['S_g'].astype(int)
    final_df['S_r'] = final_df['S_r'].astype(int)
    
    return final_df

In [295]:
c_link_census_epinions, edges_per_c_link_epinions = c_links_census(epinions_graph)

In [297]:
c_link_census_wikipedia, edges_per_c_link_wikipedia = c_links_census(wikipedia_graph)

In [298]:
predictions_epinions = build_predictions(c_link_census_epinions, edges_per_c_link_epinions, epinions_graph)
predictions_wikipedia = build_predictions(c_link_census_wikipedia, edges_per_c_link_wikipedia, wikipedia_graph)

t1
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t2
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t3
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t4
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t5
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t6
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t7
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t8
Beginning generative baseline ...
Generative baseline finished !
Beginning recep

Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t14
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t15
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t16
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !


In [300]:
predictions_epinions

Unnamed: 0,count,p(+),s_g,s_r,B_g,B_r,S_g,S_r
t1,1047479,0.95,316.7,58.6,1,1,1,1
t2,102593,0.492,-149.2,-56.1,1,1,1,0
t3,1120362,0.934,295.6,51.7,1,1,0,1
t4,64495,0.773,-22.0,-58.9,1,1,0,0
t5,125029,0.417,87.7,-555.8,0,1,1,1
t6,24099,0.401,16.6,-111.6,1,0,0,1
t7,90981,0.288,10.1,-473.9,0,1,0,1
t8,116592,0.692,189.0,-223.2,1,0,1,1
t9,1345595,0.887,416.3,-197.3,1,0,1,1
t10,80520,0.333,-134.1,-105.0,1,1,1,1


In [299]:
predictions_wikipedia

Unnamed: 0,count,p(+),s_g,s_r,B_g,B_r,S_g,S_r
t1,114678,0.898,55.6,32.1,1,1,1,1
t2,22582,0.684,-48.9,10.5,1,0,1,1
t3,244642,0.908,105.6,72.3,1,1,0,1
t4,21671,0.862,13.6,16.6,0,0,1,1
t5,15088,0.696,11.3,-40.3,0,1,1,1
t6,3718,0.53,-16.8,-14.3,0,0,1,1
t7,23621,0.706,15.3,-37.3,0,1,0,1
t8,17884,0.811,30.9,-3.1,1,0,1,1
t9,131026,0.878,68.2,17.2,1,1,1,0
t10,20185,0.651,-40.4,1.7,1,0,1,0


In [401]:
def c_links_census_meaned_reddit(graph):
    census = {
        f't{i}': {
            '+': 0,
            '-': 0
    } for i in range(1, 17)}
    
    census_edges = {
        f't{i}' : [] for i in range(1, 17)
    }
    
    c_links_names = [f't{i}' for i in range(1, 17)]
    
    census = pd.DataFrame(0, index = pd.Index(c_links_names), columns = ['+', '-']) 
    
    for v in graph:
        vnbrs = set(graph.succ[v])
        
        for u in vnbrs:
            neighbors_all = {
                ('wv', 'wu'): (set(graph.pred[v]) & set(graph.pred[u])) - {u, v},
                ('wv', 'uw'): (set(graph.pred[v]) & set(graph.succ[u])) - {u, v},
                ('vw', 'wu'): (vnbrs & set(graph.pred[u])) - {u, v},
                ('vw', 'uw'): (vnbrs & set(graph.succ[u])) - {u, v}
            }
            
            for key, neighbors in neighbors_all.items():
                for w in neighbors:
                    mapping = {
                            'wv': (w, v),
                            'vw': (v, w),
                            'wu': (w, u),
                            'uw': (u, w)
                        }
                    
                    edge_1 = mapping[key[0]]
                    edge_2 = mapping[key[1]]
                    
                    if(graph[v][u]['Date'] > graph[edge_1[0]][edge_1[1]]['Date'] and 
                       graph[v][u]['Date'] > graph[edge_2[0]][edge_2[1]]['Date']):

                        c_link_index = []

                        sign_edge_1 = graph[edge_1[0]][edge_1[1]]['Sign']
                        sign_edge_2 = graph[edge_2[0]][edge_2[1]]['Sign']

                    #    c_link_index.append((key[0], sign_edge_1))
                    #    c_link_index.append((key[1], sign_edge_2))
                        
                        if (sign_edge_1 >= 0) : 
                            c_link_index.append((key[0], 1))
                        else: c_link_index.append((key[0],-1))
                            
                        if (sign_edge_2 >= 0) : 
                            c_link_index.append((key[1], 1))
                        else: c_link_index.append((key[1],-1))    

                        c_link_type = C_LINKS_TYPES[tuple(sorted(c_link_index))]

                        vu_sign = graph[v][u]['Sign']

                        sign_idx = '+' if vu_sign >= 0 else '-'
                        
#                         not_allowed.add(tuple(sorted([v, u, w])))

                        census.loc[c_link_type][sign_idx] += 1
                        census_edges[c_link_type].append((v, u))
                    
    return census, census_edges

In [402]:
c_link_census_reddit_meaned, edges_per_c_link_reddit_meaned = c_links_census_meaned_reddit(graph_mean)

In [403]:
predictions_reddit = build_predictions(c_link_census_reddit_meaned, edges_per_c_link_reddit_meaned, graph_mean)

t1
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t2
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t3
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t4
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t5
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t6
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t7
Beginning generative baseline ...
Generative baseline finished !
Beginning receptive baseline ...
Receptive baseline finished !
t8
Beginning generative baseline ...
Generative baseline finished !
Beginning recep

In [404]:
predictions_reddit

Unnamed: 0,count,p(+),s_g,s_r,B_g,B_r,S_g,S_r
t1,1097742,0.954,442.0,330.6,1,1,1,1
t2,65239,0.931,114.5,80.2,0,0,0,1
t3,985168,0.945,384.2,306.4,1,1,0,1
t4,60321,0.929,107.1,88.5,0,0,1,1
t5,96831,0.903,170.5,64.9,0,0,1,0
t6,7770,0.882,48.1,18.7,1,1,0,0
t7,76253,0.883,136.4,57.0,0,0,0,0
t8,7227,0.873,43.1,23.2,1,1,1,0
t9,1004639,0.961,355.9,332.6,1,1,1,0
t10,69943,0.94,100.7,86.0,0,0,0,0


## Now we will try and create clusters/communities from the individuals datasets (Epinions, wikipedi, slashdot)

In [245]:
from networkx.algorithms.community import *

We tried several algorithms and techniques to produce a meaningful seperation of individuals and tronsform them to communities, but all the methods that we tried from networkx were very computational heavy and we couldn't obtain any result.
Note that we also tried to generate clusters (from which we could have treated as communities, but even those algorithms were infeasable). We disregarded the used methods but we can state that we tried "asyn_lpa_communities" that detects communities in Graph asynchronous label propagation, the Girvan–Newman method "girvan_newman(), k_clique_communities .. etc
We 


As a consequence we tried to reproduce an algorithm that we learn about in a paper : Community Detection in Weighted Networks:
Algorithms and Applications (link : https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6526730) 

In [None]:
def compute_sigma(G,Community,weight_sum_community):
    cut = cut_size(G,Community,set(G.nodes())-Community,'sign')
    return cut/weight_sum_community

def compute_sum_weights(G,Community,weight_c,selected_node):
    summed = weight_c
    for node in Community:
        if G.has_edge(selected_node,node):
            summed+=G[selected_node][node]['sign']
        if G.has_edge(node,selected_node):
            summed+=G[node][selected_node]['sign']
    return summed

def detection_algorithm(G):
    E = set(G.edges())
    C_set = set()
    
    pos_edges = [e for e in E if G[e[0]][e[1]]['sign'] == 1]
    neg_edges = [e for e in E if G[e[0]][e[1]]['sign'] == -1]
    weight_c = 0
    while len(E)!= 0:
        print(80*'=')
        print('len(E)=',len(E))
        print(80*'=')
        if (len(pos_edges)!=0) : e=pos_edges.pop(-1)
        else: e=neg_edges.pop(-1)
        C = set()
        C.add(e[0])
        C.add(e[1])
        weight_c+=G[e[0]][e[1]]['sign']
        Nc=set()
        for v in C:
            Nc=Nc | set(G.neighbors(v))
        print(len(Nc))
        while len(Nc)!=0:
            if len(Nc)%10 == 0 : print('len(Nc) =',len(Nc))
            nodes=[]
            values=[]
            for n in C:
                k_node=0
                neighbors = [node for node in G if G.has_edge(node,n) and node not in C]
                for elem in neighbors:
                    if G.has_edge(elem,n):
                        k_node+=G[elem][n]['sign']
                    else: #G.has_edge(n,elem): 
                        k_node+=G[n][elem]['sign']
                    nodes.append(elem)
                    values.append(compute_BwC(G,elem,C,k_node))
            selected_node = nodes[np.argmax(values)]
            Nc=Nc- {selected_node}
            C1 = C | {selected_node}
            sigma_C1 = compute_sigma(G,C1,compute_sum_weights(G,C1,weight_c,selected_node))
            sigma_C = compute_sigma(G,C,weight_c)
            if sigma_C1<sigma_C :
                C=C1
            else: 
                break
        E = E - set(G.subgraph(C).edges())
        C_set = C_set | C
    return C_set

Even this algorithm was very heavy to compute ie: O(|V|²) and thus we failed to accomplish the part of comparing communities of individuals from Epinions , to communities in Reddit (subreddits)

### This motivated us to work more on the presentations and since our data have informations about the timestamp of each created edge, we decided to work on interactive visualisations of graphs and nodes in time. 

In [472]:
from bokeh.io import output_file, show
from bokeh.models import (BoxZoomTool, Circle, HoverTool,
                          MultiLine, Plot, Range1d, ResetTool, Arc, Button, CustomJS,
                          TapTool, BoxSelectTool, EdgesAndLinkedNodes, NodesAndLinkedEdges)
from bokeh.palettes import Spectral4
from bokeh.plotting import from_networkx

In [473]:
import math

## Helper functions that sorts the nodes and creates a a grid layout
def grid_layout_with_ordering(G, nb_rows=4, ordering='GENERATIVE'):
    counts_df = pd.DataFrame(0, index = pd.Index([id_ for id_ in G.nodes()]), columns=['IN+', 'IN-', 'OUT+', 'OUT-'])
    
    result_dict = {
        id_ : None for id_ in G.nodes()
    }
    
    for edge in G.edges():
        sign = G[edge[0]][edge[1]]['Sign']
        
        sign_str = '+' if sign == 1 else '-'
        
        counts_df.loc[edge[0], 'OUT'+sign_str] += 1
        counts_df.loc[edge[1], 'IN'+sign_str] += 1

        
    counts_df['GENERATIVE_SCORE'] = (counts_df['OUT+'] - counts_df['OUT-']) / (counts_df['OUT+'] + counts_df['OUT-'])
    counts_df['RECEPTIVE_SCORE'] = (counts_df['IN+'] - counts_df['IN-']) / (counts_df['IN+'] + counts_df['IN-'])
    
    counts_df = counts_df.replace([np.inf, -np.inf], np.nan)
    counts_df = counts_df.replace([np.nan], 0)
    
    if(ordering == 'GENERATIVE'):
        counts_df.sort_values(by='GENERATIVE_SCORE', axis=0, ascending=False, inplace=True)
    else:
        counts_df.sort_values(by='RECEPTIVE_SCORE', axis=0, ascending=False, inplace=True)

    nb_nodes = counts_df.shape[0]
    
    nb_cols = math.ceil(nb_nodes/nb_rows)
    
    xs = np.linspace(-1, 1, nb_cols)
    X = xs.shape[0]
    
    ys = np.linspace(1, -1, nb_rows)
    Y = ys.shape[0]
    
    completed = False
          
    for idx_y, y in enumerate(ys):

        
        for idx_x, x in enumerate(xs):
            idx_all = idx_y * X + idx_x
            
            if(idx_all >= nb_nodes):
                completed = False
                break
            
            node = counts_df.index[idx_all]
            pos = np.array([x, y])
            
            result_dict[node] = pos
            
        if(completed):
            break
        
    return result_dict, counts_df

In [510]:
multiple_edges_meaned_data

Unnamed: 0,source,target,Sign,Date
22,100daysofketo,keto,1.0,2016/05/07
25,100daysofrejection,30daystosortmylifeout,1.0,2015/12/01
27,100daysofrejection,howtonotgiveafuck,1.0,2015/11/14
30,100movies365days,movieaweek,1.0,2014/07/12
31,100movies365days,movieclub,1.0,2014/07/12
...,...,...,...,...
339586,zorceror44,writingprompts,1.0,2016/08/25
339596,zreviews,audiophile,1.0,2016/06/25
339634,zylooxwrites,writingprompts,1.0,2015/11/05
339635,zyramains,leagueoflegends,1.0,2016/11/16


In [512]:
meaned_graph_more3links = nx.from_pandas_edgelist(multiple_edges_meaned_data,source='source', target="target", edge_attr = ["Sign","Date"],create_using=nx.DiGraph)
reddit_grid_pos, reddit_scores_df = grid_layout_with_ordering(meaned_graph_more3links, 40)

In [498]:
## create graph for visaulisation : (we discard the nodes implicated in less than 3 links)
meaned_graph_more3links = nx.from_pandas_edgelist(multiple_edges_meaned_data,source='source', target="target", edge_attr = ["Sign","Date"],create_using=nx.DiGraph)

wikipedia_grid_pos, wikipedia_scores_df = grid_layout_with_ordering(wikipedia_graph, 40)
reddit_grid_pos, reddit_scores_df = grid_layout_with_ordering(meaned_graph_more3links, 40)

In [475]:
## Create a dict { date : [nodes_active] } and dict { date : [edges_active]} for later use
## was done in javascript instead

def nodes_and_edges_per_date(G, dates, graph_renderer):
    froms = np.array(graph_renderer.edge_renderer.data_source.data['start'])
    tos = np.array(graph_renderer.edge_renderer.data_source.data['end'])
    Date = np.array(graph_renderer.edge_renderer.data_source.data['Date'])    
    nodes = np.array(graph_renderer.node_renderer.data_source.data['index'])
    
    edges_dict = {
        date : [] for date in dates
    }

    nodes_dict = {
        date : [] for date in dates
    }
    
    for date in dates:
        indices = np.where(Date == date)
        
        edges = indices

        edges_dict[date] = list(edges)

        nodes = set(froms[indices]) | set(tos[indices])

        nodes_dict[date] = list(nodes)
        
    return nodes_dict, edges_dict

In [476]:
## JS code for bokehJS

START_CODE = """ 
        if(dataset_title === 'wikipedia') {
        
        
        nb_clicks_wikipedia = nb_clicks_wikipedia + 1
        
        if(nb_clicks_wikipedia != 1) {
            nb_clicks_wikipedia = nb_clicks_wikipedia - 1
            return
        }
        
        simulation_in_progress_wikipedia = true

        var active_edges_idx = []
        var active_edges = {}
        var active_nodes = []
        
        var all_nodes = graph_renderer.node_renderer.data_source.data['index']
        
        if(isEmpty(nodes_dict_wikipedia) || isEmpty(edges_dict_wikipedia)) {
            dates_arr_wikipedia.forEach(function (date, index) {
                edges_dict_wikipedia[date] = backup_copy['Date'].reduce(function(a, e, i) {
                    if (e === date)
                        a.push(i);
                    return a;
                }, []); 

                nodes_dict_wikipedia[date] = backup_copy['Date'].reduce(function(a, e, i) {
                    if (e === date) {
                        a.add(all_nodes.indexOf(backup_copy['start'][i]));
                        a.add(all_nodes.indexOf(backup_copy['end'][i]))
                    }   
                    return a;
                }, new Set([])); 

            });
        }
        

        
        var all_edges = {
            'Sign': [],
            'Date': [],
            'edge_color_selection': [],
            'start': [],
            'end': []
        }
        
        graph_renderer.edge_renderer.data_source.data = all_edges
        
        function loop() {
            if(simulation_in_progress_wikipedia == false || date_idx_wikipedia == dates_arr_wikipedia.length)
            {
                nb_clicks_wikipedia = 0
                return
            }
            
            var date = dates_arr_wikipedia[date_idx_wikipedia]
            
            active_edges_idx = edges_dict_wikipedia[date]
            active_nodes = nodes_dict_wikipedia[date]
            
            active_edges = {
                'Sign': [],
                'Date': [],
                'edge_color_selection': [],
                'start': [],
                'end': []
            }
            
            var old_count = all_edges['Sign'].length
            
            active_edges_idx.forEach(function (edge_idx, index) {
                active_edges['Sign'].push(backup_copy['Sign'][edge_idx])
                active_edges['Date'].push(backup_copy['Date'][edge_idx])
                active_edges['edge_color_selection'].push(backup_copy['edge_color_selection'][edge_idx])
                active_edges['start'].push(backup_copy['start'][edge_idx])
                active_edges['end'].push(backup_copy['end'][edge_idx])
            });
        
            all_edges = {
                'Sign': all_edges['Sign'].concat(active_edges['Sign']) ,
                'Date': all_edges['Date'].concat(active_edges['Date']),
                'edge_color_selection': all_edges['edge_color_selection'].concat(active_edges['edge_color_selection']),
                'start': all_edges['start'].concat(active_edges['start']),
                'end': all_edges['end'].concat(active_edges['end'])
            }
            
            graph_renderer.edge_renderer.data_source.data = all_edges
            
            var indices = [...Array(all_edges['Sign'].length).keys()].map(i => i + old_count) ;

            graph_renderer.edge_renderer.data_source.selected.indices = indices
            graph_renderer.node_renderer.data_source.selected.indices = Array.from(active_nodes)


            date_idx_wikipedia = date_idx_wikipedia + 1
             
            setTimeout(loop, 700)
        
        }
        
        loop()
        
        nb_clicks_wikipedia = 0
        date_idx_wikipedia = 0
        
        
        } else {
        
        
        nb_clicks_reddit = nb_clicks_reddit + 1
        
        if(nb_clicks_reddit != 1) {
            nb_clicks_reddit = nb_clicks_reddit - 1
            return
        }
        
        simulation_in_progress_reddit = true

        var active_edges_idx = []
        var active_edges = {}
        var active_nodes = []
        
        var all_nodes = graph_renderer.node_renderer.data_source.data['index']
        
        if(isEmpty(nodes_dict_reddit) || isEmpty(edges_dict_reddit)) {
            dates_arr_reddit.forEach(function (date, index) {
                edges_dict_reddit[date] = backup_copy['Date'].reduce(function(a, e, i) {
                    if (e === date)
                        a.push(i);
                    return a;
                }, []); 

                nodes_dict_reddit[date] = backup_copy['Date'].reduce(function(a, e, i) {
                    if (e === date) {
                        a.add(all_nodes.indexOf(backup_copy['start'][i]));
                        a.add(all_nodes.indexOf(backup_copy['end'][i]))
                    }   
                    return a;
                }, new Set([])); 

            });
        }
        

        
        var all_edges = {
            'Sign': [],
            'Date': [],
            'edge_color_selection': [],
            'start': [],
            'end': []
        }
        
        graph_renderer.edge_renderer.data_source.data = all_edges
        
        function loop() {
            if(simulation_in_progress_reddit == false || date_idx_reddit == dates_arr_reddit.length)
            {
                nb_clicks_reddit = 0
                return
            }
            
            var date = dates_arr_reddit[date_idx_reddit]
            
            active_edges_idx = edges_dict_reddit[date]
            active_nodes = nodes_dict_reddit[date]
            
            active_edges = {
                'Sign': [],
                'Date': [],
                'edge_color_selection': [],
                'start': [],
                'end': []
            }
            
            var old_count = all_edges['Sign'].length
            
            active_edges_idx.forEach(function (edge_idx, index) {
                active_edges['Sign'].push(backup_copy['Sign'][edge_idx])
                active_edges['Date'].push(backup_copy['Date'][edge_idx])
                active_edges['edge_color_selection'].push(backup_copy['edge_color_selection'][edge_idx])
                active_edges['start'].push(backup_copy['start'][edge_idx])
                active_edges['end'].push(backup_copy['end'][edge_idx])
            });
        
            all_edges = {
                'Sign': all_edges['Sign'].concat(active_edges['Sign']) ,
                'Date': all_edges['Date'].concat(active_edges['Date']),
                'edge_color_selection': all_edges['edge_color_selection'].concat(active_edges['edge_color_selection']),
                'start': all_edges['start'].concat(active_edges['start']),
                'end': all_edges['end'].concat(active_edges['end'])
            }
            
            graph_renderer.edge_renderer.data_source.data = all_edges
            
            var indices = [...Array(all_edges['Sign'].length).keys()].map(i => i + old_count) ;

            graph_renderer.edge_renderer.data_source.selected.indices = indices
            graph_renderer.node_renderer.data_source.selected.indices = Array.from(active_nodes)


            date_idx_reddit = date_idx_reddit + 1
             
            setTimeout(loop, 700)
        
        }
        
        loop()
        
        nb_clicks_reddit = 0
        date_idx_reddit = 0
        
        
        }



    
    """

PAUSE_CODE = """ 
                if(dataset_title === 'wikipedia') {
        simulation_in_progress_wikipedia = false
        } else {
        simulation_in_progress_reddit = false
        }


        
    """

END_CODE = """ 
        if(dataset_title === 'wikipedia') {
            simulation_in_progress_wikipedia = false
            date_idx_wikipedia = 0
            graph_renderer.edge_renderer.data_source.selected.indices = []
            graph_renderer.node_renderer.data_source.selected.indices = []
        } else {
            simulation_in_progress_reddit = false
            date_idx_reddit = 0
            graph_renderer.edge_renderer.data_source.selected.indices = []
            graph_renderer.node_renderer.data_source.selected.indices = []
        
        
        }


    """

In [520]:
from bokeh.layouts import column, row
from bokeh.embed import components, json_item
import copy
import json

## Make everything and create the component for the visualization. (time simulation)
## dump it to json, and use it later in the html/js as a json got async
def create_viz_timing(G, pos_dict, dates, dataset_title, plot_width=1850, plot_height=1080):
    POSITIVE_COLOR_SELECTION, NEGATIVE_COLOR_SELECTION = "#27FF00", "#FF2A00"
    
    edge_attrs_selection = {}
    
    for start_node, end_node, _ in G.edges(data=True):
        edge_color_selection = POSITIVE_COLOR_SELECTION if G[start_node][end_node]['Sign'] > 0 else NEGATIVE_COLOR_SELECTION

        edge_attrs_selection[(start_node, end_node)] = edge_color_selection
        
    nx.set_edge_attributes(G, edge_attrs_selection, "edge_color_selection")
    
    plot = Plot(plot_width=plot_width, plot_height=plot_height,
            x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
    
    graph_renderer = from_networkx(G, pos_dict)
    
#     nodes_dict, edges_dict = nodes_and_edges_per_date(G, dates, graph_renderer)
    
    graph_renderer.node_renderer.glyph = Circle(size=5, fill_color="blue")
    graph_renderer.node_renderer.selection_glyph = Circle(size=5, fill_color="red")

    graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
    graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color="edge_color_selection", line_width=2.5)
    
    plot.renderers.append(graph_renderer)
    
    button_start = Button(label='Start', button_type="success")
    button_pause = Button(label='Pause', button_type="warning")
    button_end = Button(label='End', button_type="danger")
    
    backup_edges_data = copy.deepcopy(graph_renderer.edge_renderer.data_source.data)
    
    start_callback = CustomJS(args= dict(
        dataset_title = dataset_title,
        graph_renderer = graph_renderer,
        backup_copy = backup_edges_data
    ), code = START_CODE)

    button_start.js_on_click(start_callback)
    
    pause_callback = CustomJS(args= dict(
        dataset_title = dataset_title
    ), code = PAUSE_CODE)
    
    button_pause.js_on_click(pause_callback)
    
    end_callback = CustomJS(args= dict(
        dataset_title = dataset_title,
        graph_renderer = graph_renderer
    ),code = END_CODE)

    button_end.js_on_click(end_callback)
    
#     output_file("interactive_graphs.html")

    button_start.css_classes.append('my-button')
    button_pause.css_classes.append('my-button')
    button_end.css_classes.append('my-button')

    layout_row = row(button_start, button_pause, button_end)

    layout_row.css_classes.append('button-row')

    layout_column = column(layout_row, plot)

    layout_column.css_classes.append('bk-container')

#     show(column(layout_column))  
    
    json_ = json_item(layout_column, "wikipedia-plot")
    
    with open('embedabble_'+ dataset_title + '_demo.json', 'w') as outfile:
        json.dump(json_, outfile)
        
    with open(dataset_title + '_dates.json', 'w') as outfile:
        json.dump(dates, outfile)
    
def extract_dates(df):
    dates = set(df['Date'].values.tolist())
    dates = sorted(list(dates))
    
    return dates

In [521]:
## JS code for bokehJS

OUT_CODE = """ 

console.log(graph_renderer)
        if(dataset_title === 'wikipedia') {
        
        $(this)
                                        var buttons = $('.wikipedia-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('outgoing'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
        
        
        ingoing_wikipedia = false
        outgoing_wikipedia = true
                
        graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_outgoing_wikipedia
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_outgoing_wikipedia.concat(selected_prime_node_wikipedia)
        
        } else {
        
                                                var buttons = $('.reddit-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('outgoing'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
        
        ingoing_reddit = false
        outgoing_reddit = true
                
        graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_outgoing_reddit
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_outgoing_reddit.concat(selected_prime_node_reddit)
        
        }

    """

IN_CODE = """ 

        console.log(graph_renderer)
        
        if(dataset_title === 'wikipedia') {
        
                                        var buttons = $('.wikipedia-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('ingoing'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
        
        ingoing_wikipedia = true
        outgoing_wikipedia = false
        
        graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_ingoing_wikipedia
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_ingoing_wikipedia.concat(selected_prime_node_wikipedia)
        
        
        
        } else {
        
            var buttons = $('.reddit-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('ingoing'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
        
        ingoing_reddit = true
        outgoing_reddit = false
        
        graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_ingoing_reddit
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_ingoing_reddit.concat(selected_prime_node_reddit)
        
        
        
        }
        
        
    """

BOTH_CODE = """ 

        console.log(graph_renderer)
        

        
        if(dataset_title === 'wikipedia') {
                
        var buttons = $('.wikipedia-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('both'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
                
                
                ingoing_wikipedia = true
        outgoing_wikipedia = true
        
                var all_edges = selected_edges_ingoing_wikipedia.concat(selected_edges_outgoing_wikipedia)
        all_edges.sort()
        var all_nodes = selected_non_prime_ingoing_wikipedia.concat(selected_non_prime_outgoing_wikipedia, selected_prime_node_wikipedia)
        all_nodes.sort()
        
        graph_renderer.edge_renderer.data_source.selected.indices = all_edges
        graph_renderer.node_renderer.data_source.selected.indices = all_nodes
        
        } else {
        
                var buttons = $('.reddit-plot .my-button')

        buttons.each(function (index, item) {
            if($(this).hasClass('both'))
                $(this).addClass('active')
            else
                $(this).removeClass('active')
        })
        
                var all_edges = selected_edges_ingoing_reddit.concat(selected_edges_outgoing_reddit)
        all_edges.sort()
        var all_nodes = selected_non_prime_ingoing_reddit.concat(selected_non_prime_outgoing_reddit, selected_prime_node_reddit)
        all_nodes.sort()
        graph_renderer.edge_renderer.data_source.selected.indices = all_edges
        graph_renderer.node_renderer.data_source.selected.indices = all_nodes
        
        
        }
        

    """

NONE_CODE = """ 
        if(dataset_title === 'wikipedia') {
        
            var buttons = $('.wikipedia-plot .my-button')

            buttons.each(function (index) {
                if($(this).hasClass('none'))
                    $(this).addClass('active')
                else
                    $(this).removeClass('active')
            })
                
                ingoing_wikipedia = false
        outgoing_wikipedia = false
        
        graph_renderer.edge_renderer.data_source.selected.indices = []
        graph_renderer.node_renderer.data_source.selected.indices = selected_prime_node_wikipedia
        
        } else {
                        var buttons = $('.reddit-plot .my-button')

        buttons.each(function (index, item) {
            if(item.hasClass('none'))
                item.addClass('active')
            else
                item.removeClass('active')
        })
        
                ingoing_reddit = false
        outgoing_reddit = false
        
        graph_renderer.edge_renderer.data_source.selected.indices = []
        graph_renderer.node_renderer.data_source.selected.indices = selected_prime_node_reddit
        
        }
        

    """

CLICKED_CODE = """ 
    console.log('first')
    console.log(graph_renderer)
    console.log(ingoing_wikipedia)
    console.log(outgoing_wikipedia)
    
    console.log('second')
    
    var selected_nodes = graph_renderer.node_renderer.data_source.selected.indices
    
    if(selected_nodes === undefined || selected_nodes.length === 0)
        return
    
    if(dataset_title === 'wikipedia') {
        var clicked_node_idx = graph_renderer.node_renderer.data_source.selected.indices[0]
        var clicked_node = graph_renderer.node_renderer.data_source.data['index'][clicked_node_idx]
    
        console.log(clicked_node)
    
        selected_prime_node_wikipedia = [clicked_node_idx]
    
    
        var tab_out = findOutgoingEdgesAndNodes(clicked_node, graph_renderer)
        var tab_in = findIngoingEdgesAndNodes(clicked_node, graph_renderer)

        selected_edges_outgoing_wikipedia = tab_out[0]
        selected_non_prime_outgoing_wikipedia = Array.from(tab_out[1])
        selected_edges_ingoing_wikipedia = tab_in[0]
        selected_non_prime_ingoing_wikipedia = Array.from(tab_in[1])

        console.log(selected_edges_outgoing_wikipedia, selected_non_prime_outgoing_wikipedia)
        console.log(selected_edges_ingoing_wikipedia, selected_non_prime_ingoing_wikipedia)

        if(ingoing_wikipedia && !outgoing_wikipedia) {
            graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_ingoing_wikipedia
            graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_ingoing_wikipedia.concat(selected_prime_node_wikipedia)
        }
        else if(!ingoing_wikipedia && outgoing_wikipedia) {
            console.log('tay1')
            graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_outgoing_wikipedia
            graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_outgoing_wikipedia.concat(selected_prime_node_wikipedia)
        } else if(ingoing_wikipedia && outgoing_wikipedia){
            console.log('tay2')
            var all_edges = selected_edges_ingoing_wikipedia.concat(selected_edges_outgoing_wikipedia)
            all_edges.sort()
            var all_nodes = selected_non_prime_ingoing_wikipedia.concat(selected_non_prime_outgoing_wikipedia, selected_prime_node_wikipedia)
            all_nodes.sort()

            graph_renderer.edge_renderer.data_source.selected.indices = all_edges
            graph_renderer.node_renderer.data_source.selected.indices = all_nodes
        } else {
            console.log('tay3')
            graph_renderer.edge_renderer.data_source.selected.indices = []
            graph_renderer.node_renderer.data_source.selected.indices = selected_prime_node_wikipedia
        }
        
        } else {
            var clicked_node_idx = graph_renderer.node_renderer.data_source.selected.indices[0]
    var clicked_node = graph_renderer.node_renderer.data_source.data['index'][clicked_node_idx]
    
    selected_prime_node_reddit = [clicked_node_idx]
    
        var tab_out = findOutgoingEdgesAndNodes(clicked_node, graph_renderer)
    var tab_in = findIngoingEdgesAndNodes(clicked_node, graph_renderer)

    selected_edges_outgoing_reddit = tab_out[0]
    selected_non_prime_outgoing_reddit = Array.from(tab_out[1])
    selected_edges_ingoing_reddit = tab_in[0]
    selected_non_prime_ingoing_reddit = Array.from(tab_in[1])

    if(ingoing_reddit && !outgoing_reddit) {
        graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_ingoing_reddit
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_ingoing_reddit.concat(selected_prime_node_reddit)
    }
    else if(!ingoing_reddit && outgoing_reddit) {
graph_renderer.edge_renderer.data_source.selected.indices = selected_edges_outgoing_reddit
        graph_renderer.node_renderer.data_source.selected.indices = selected_non_prime_outgoing_reddit.concat(selected_prime_node_reddit)
    
    } else if(ingoing_reddit && outgoing_reddit){
        var all_edges = selected_edges_ingoing_reddit.concat(selected_edges_outgoing_reddit)
        all_edges.sort()
        var all_nodes = selected_non_prime_ingoing_reddit.concat(selected_non_prime_outgoing_reddit, selected_prime_node_reddit)
        all_nodes.sort()
        graph_renderer.edge_renderer.data_source.selected.indices = all_edges
        graph_renderer.node_renderer.data_source.selected.indices = all_nodes
    } else {
        graph_renderer.edge_renderer.data_source.selected.indices = []
        graph_renderer.node_renderer.data_source.selected.indices = selected_prime_node_reddit
    }
        
        
        }

    """

In [522]:
from bokeh.layouts import column, row
from bokeh.embed import components, json_item
import copy
from bokeh.events import Tap
import json
from bokeh.models import WheelZoomTool

## Make everything and create the component for the visualization. (picker simulation)

def create_viz_picker(G, pos_dict, dataset_title, plot_width=1850, plot_height=1080):
    POSITIVE_COLOR_SELECTION, NEGATIVE_COLOR_SELECTION = "#27FF00", "#FF2A00"
    
    edge_attrs_selection = {}
    
    for start_node, end_node, _ in G.edges(data=True):
        edge_color_selection = POSITIVE_COLOR_SELECTION if G[start_node][end_node]['Sign'] > 0 else NEGATIVE_COLOR_SELECTION

        edge_attrs_selection[(start_node, end_node)] = edge_color_selection
        
    nx.set_edge_attributes(G, edge_attrs_selection, "edge_color_selection")
    
    plot = Plot(plot_width=plot_width, plot_height=plot_height,
            x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
    
    plot.add_tools(TapTool(), WheelZoomTool())
    
    graph_renderer = from_networkx(G, pos_dict)
    
    graph_renderer.node_renderer.glyph = Circle(size=5, fill_color="blue")
    graph_renderer.node_renderer.selection_glyph = Circle(size=5, fill_color="red")


    graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
    graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color="edge_color_selection", line_width=2.5)
    
    plot.renderers.append(graph_renderer)
    
    button_both = Button(label='Both', button_type="success")
    button_ingoing = Button(label='Ingoing edges only', button_type="warning")
    button_outgoing = Button(label='Outgoing edges only', button_type="warning")
    button_none = Button(label='None', button_type="danger")
    
    both_callback = CustomJS(args= dict(
        graph_renderer = graph_renderer,
        dataset_title = dataset_title
    ), code = BOTH_CODE)

    button_both.js_on_click(both_callback)
    
    out_callback = CustomJS(args= dict(
        graph_renderer = graph_renderer,
        dataset_title = dataset_title
    ), code = OUT_CODE)

    button_outgoing.js_on_click(out_callback)
    
    in_callback = CustomJS(args= dict(
        graph_renderer = graph_renderer,
        dataset_title = dataset_title
    ), code = IN_CODE)

    button_ingoing.js_on_click(in_callback)
    
    none_callback = CustomJS(args= dict(
        graph_renderer = graph_renderer,
        dataset_title = dataset_title
    ), code = NONE_CODE)

    button_none.js_on_click(none_callback)
    
    click_callback = CustomJS(args=dict(
        graph_renderer = graph_renderer,
        dataset_title = dataset_title
    ), code = CLICKED_CODE)
    
    plot.js_on_event(Tap, click_callback)

    button_both.css_classes.append('my-button')
    button_both.css_classes.append('both')
    button_outgoing.css_classes.append('my-button')
    button_outgoing.css_classes.append('outgoing')
    button_ingoing.css_classes.append('my-button')
    button_ingoing.css_classes.append('ingoing')
    button_none.css_classes.append('my-button')
    button_none.css_classes.append('none')

    layout_row = row(button_both, button_outgoing, button_ingoing, button_none)

    layout_row.css_classes.append('button-row')

    layout_column = column(layout_row, plot)

    layout_column.css_classes.append('bk-container')
    
    json_ = json_item(layout_column, "wikipedia-plot")
    
    with open('embedabble_'+ dataset_title + '_picker.json', 'w') as outfile:
        json.dump(json_, outfile)

In [519]:
create_viz_timing(wikipedia_graph, wikipedia_grid_pos, extract_dates(wikipedia_df), 'wikipedia')
create_viz_picker(wikipedia_graph, wikipedia_grid_pos, 'wikipedia')

In [523]:
create_viz_timing(meaned_graph_more3links, reddit_grid_pos, extract_dates(multiple_edges_meaned_data), 'reddit')
create_viz_picker(meaned_graph_more3links, reddit_grid_pos, 'reddit')