# Exercise 3: Community detection on the network of Computational Social Scientists.
## Week 6, Exercise 4

- Consider the network you built in Week 4.

In [1]:
import json
import networkx as nx
import netwulf as nw
from netwulf import visualize

f = open('data/graph.json')
data = json.load(f)


In [2]:
G = nx.node_link_graph(data) 
print(f"The number of nodes before the GCC has been found: {len(list(G.nodes))}")
largest_cc = max(nx.connected_components(G), key=len)
# update graph to only include the larget connected component. 
G = G.subgraph(largest_cc)
print(f"The number of nodes after the GCC has been found: {len(list(G.nodes))}")

The number of nodes before the GCC has been found: 2162
The number of nodes after the GCC has been found: 1271


In [3]:
# draw with netwulf
visualize(G)


(None, None)

- Use the Python Louvain-algorithm implementation to find communities. How many communities do you find? What are their sizes? Report the value of modularity found by the algorithm. Is the modularity significantly different than 0?
> The modularity is 0.899, which is significantly different than 0. Modularity measures <> and is found within the range [-1/2, 1]. Thus, this partition is well done. 

In [5]:
import community


# Find all communities in the graph
partition = community.best_partition(G)  # Maps author id to community id

# Show the communities
print(f"The number of communities found: {len(set(partition.values()))}")
print("The communities are:")

# Get sizes of each community


sizes = []
for i in set(partition.values()):
    # print their sizes
    print(f"Community {i}: {len([k for k, v in partition.items() if v == i])}")
    # print(f"Community {i}: {[k for k, v in partition.items() if v == i]}")

# get value of modularity
modularity = community.modularity(partition, G)
print(f"The modularity of the graph is: {modularity}")

The number of communities found: 30
The communities are:
Community 0: 27
Community 1: 27
Community 2: 61
Community 3: 28
Community 4: 62
Community 5: 52
Community 6: 46
Community 7: 65
Community 8: 80
Community 9: 22
Community 10: 29
Community 11: 17
Community 12: 42
Community 13: 120
Community 14: 86
Community 15: 39
Community 16: 43
Community 17: 30
Community 18: 22
Community 19: 65
Community 20: 76
Community 21: 18
Community 22: 17
Community 23: 5
Community 24: 31
Community 25: 41
Community 26: 37
Community 27: 18
Community 28: 36
Community 29: 29
The modularity of the graph is: 0.8985507540065247


- If you are curious, you can also try the Infomap algorithm. Go to [this page]. (https://mapequation.github.io/infomap/python/). It's harder to install, but a better community detection algorithm. You can read about it in advanced topics 9B.

- Visualize the network, using netwulf (see Week 5). This time assign each node a different color based on their community. Describe the structure you observe.
> The structure looks fine as hell. 


In [6]:
# partition[G.nodes[0]]
# Assign each node their communiy with different colors

nx.set_node_attributes(G, partition, name="group")  # group controls color
G.nodes(data=True)
# for node in G.nodes:
#     print(partition[node])
#     G.nodes[node]['community'] = partition[node]




NodeDataView({'2101037': {'field': 'Computer Science', 'name': 'Florian Lemmerich', 'citation_count': 12.0, 'num_ccs_papers': 6, 'first_css_paper_year': 2017.0, 'group': 0}, '3001795': {'field': 'Computer Science', 'name': 'B Bruce Ferwerda', 'citation_count': 13.0, 'num_ccs_papers': 1, 'first_css_paper_year': 2017.0, 'group': 0}, '2080155085': {'field': 'Business', 'name': 'Ahmed Ali Moustafa Soliman', 'citation_count': 29.0, 'num_ccs_papers': 1, 'first_css_paper_year': 2019.0, 'group': 0}, '33570565': {'field': 'Psychology', 'name': 'Mohsen Jadidi', 'citation_count': 9.0, 'num_ccs_papers': 3, 'first_css_paper_year': 2015.0, 'group': 0}, '66118125': {'field': 'Computer Science', 'name': 'Gesis', 'citation_count': 0.5, 'num_ccs_papers': 2, 'first_css_paper_year': 2015.0, 'group': 0}, '153503994': {'field': 'Sociology', 'name': 'John J. Hafer', 'citation_count': 29.0, 'num_ccs_papers': 1, 'first_css_paper_year': 2019.0, 'group': 0}, '48265513': {'field': 'Economics', 'name': 'Maria Zens

- Make sure you save the assignment of authors to communities.

In [8]:
# get degree of each node
degree = dict(G.degree())
# G.nodes(data=True)
degree

{'2101037': 10,
 '3001795': 2,
 '2080155085': 2,
 '33570565': 4,
 '66118125': 3,
 '153503994': 2,
 '48265513': 6,
 '46417808': 4,
 '2538277': 1,
 '1729484': 2,
 '98808950': 4,
 '51040873': 19,
 '145059556': 12,
 '1389955131': 1,
 '119151277': 5,
 '96893747': 5,
 '30421024': 9,
 '2871138': 3,
 '2020262166': 5,
 '2349638': 4,
 '151377924': 5,
 '151463421': 8,
 '2088358': 12,
 '6673771': 10,
 '97678793': 3,
 '1395385575': 5,
 '26435264': 3,
 '145212340': 9,
 '32216871': 2,
 '49305855': 13,
 '47205422': 5,
 '51435753': 4,
 '2557324': 6,
 '52229695': 3,
 '151467651': 10,
 '1743706': 3,
 '2255094': 20,
 '3145906': 6,
 '1682773': 27,
 '4462678': 7,
 '2254522': 2,
 '1715670': 5,
 '2202643918': 6,
 '1684412': 7,
 '2698329': 10,
 '34893194': 2,
 '1769960': 7,
 '3118802': 1,
 '143653472': 9,
 '2061878581': 1,
 '1403820241': 4,
 '2617057': 3,
 '3329961': 29,
 '144684950': 3,
 '145190501': 2,
 '2091576379': 4,
 '2381124': 2,
 '2080197097': 4,
 '2592694': 19,
 '144626725': 4,
 '34616778': 2,
 '20885

## Exercise 4: TF-IDF and the Computational Social Science communities.
The goal for this exercise is to find the words charachterizing each of the communities of Computational Social Scientists.

### 4.2
Now, we want to find out which words are important for each community, so we're going to create several *large documents, one for each community*. Each document includes all the tokens of abstracts written by members of a given community.

- Consider a community c
- Find all the abstracts of papers written by a (ALL) member(S) of community c.
- Create a long array that stores all the abstract tokens
- Repeat for all the communities.

In [9]:
# Tokenizer code written in week7

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

urls = '\S+www\S+\w'    # remove urls by searching for www
symbols = '[^\w\s]'  # remove punctuation
numbers = '\d+' # remove numbers
stop_words = stopwords.words('english')

def tokenize(text):
    if text is None:
        return None
    text = text.lower()
    text = re.sub(fr'{symbols}|{urls}|{numbers}','',text)
    text = [word for word in text.split() if word not in stop_words]
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import pickle

# abstracDataSet contains paperId and their abstracts
# paperDataSet contains paperId	title	year	externalId.DOI	citationCount	fields	authorIds	author_field

with open('data/paperAbstractDataSet.pkl', 'rb') as f:
    abstractDataSet = pickle.load(f)

abstractDataSet=abstractDataSet.drop_duplicates(subset=['papersId'])

with open('data/ccs_papers.pkl', 'rb') as f:
    paperDataSet = pickle.load(f)




First things first: We need access to the papers written by the authors in the graph. We have a paperDataSet that has up to a million entrees. This is cut down by filtering out papers where none of the contributors exists in the graph. Then using the IDs from filtered papers, cut down the abstracts. 

In [11]:
# Cut down the paperDataSet to only include authors in graph
tempDict=dict(G.nodes)
valid = paperDataSet['authorIds'].apply(lambda x: any(elem in tempDict for elem in x))
papers = paperDataSet[valid]
papers = papers.explode('authorIds') # explode the authorIds column such that each row has one authorId.
                                    # but papers appear multiple times

In [12]:
# ----- create the dataframe
#first add each node's degree as attribute. 
import pandas as pd
degree = dict(G.degree())
nx.set_node_attributes(G, degree, name="degree") 
df=pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')
df = df.reset_index(names="authorID")
#G.nodes["3001795"]
df = df[['authorID','group', 'degree']]
df

Unnamed: 0,authorID,group,degree
0,2101037,0,10
1,3001795,0,2
2,2080155085,0,2
3,33570565,0,4
4,66118125,0,3
...,...,...,...
1266,10852593,8,2
1267,1734917,7,1
1268,9486542,14,2
1269,144188281,27,1


In [39]:
# Create a dictionary that maps communities to a collection of abstracts
communityPaperIDs = {i : set() for i in range(len(set(partition.values())))}

author = "98808950"
partition[author]



0

In [None]:
communityPaperIDs = {i : set() for i in range(len(set(partition.values())))}
for node in G.nodes(data=True):
    author = node[0]
    writtenPapers=papers[papers["authorIds"].isin([author])]["paperId"]
    community = partition[author]
    communityPaperIDs[community].update(writtenPapers)
    
    #print(abstracts)
# communityPaperIDs

In [78]:
# For each community extract the abstracts
communityTokens = {i : [] for i in range(len(communityPaperIDs))}
# g'r med tom array of append
for community, paperIDs in communityPaperIDs.items():
    abstracts=abstractDataSet[abstractDataSet["papersId"].isin(paperIDs)]["papersAbstract"]

    abstracts = abstracts.dropna()    # Drop all rows with None values
    abstracts.apply(lambda x: communityTokens[community].extend(tokenize(x)))
len(communityTokens)
# 1 minute

30

In [79]:
communityTokens

{0: ['paper',
  'introduces',
  'hoprank',
  'algorithm',
  'modeling',
  'human',
  'navigation',
  'semantic',
  'networks',
  'hoprank',
  'leverages',
  'assumption',
  'users',
  'know',
  'see',
  'whole',
  'structure',
  'network',
  'therefore',
  'besides',
  'following',
  'links',
  'also',
  'follow',
  'nodes',
  'certain',
  'distances',
  'ie',
  'khop',
  'neighborhoods',
  'random',
  'suggested',
  'pagerank',
  'assumes',
  'links',
  'known',
  'visible',
  'observe',
  'preference',
  'towards',
  'khop',
  'neighborhoods',
  'bioportal',
  'one',
  'leading',
  'repositories',
  'biomedical',
  'ontologies',
  'web',
  'general',
  'users',
  'navigate',
  'within',
  'vicinity',
  'concept',
  'also',
  'jump',
  'distant',
  'concepts',
  'less',
  'frequently',
  'fit',
  'model',
  'ontologies',
  'using',
  'transition',
  'matrix',
  'clickstreams',
  'show',
  'semantic',
  'structure',
  'influence',
  'teleportation',
  'pagerank',
  'suggests',
  'users

In [221]:
Extract all abstracts that with paperIDs in abstractDataSet
# CUT_abstractDataSet = abstractDataSet[abstractDataSet["papersId"].isin(paperIDs)]
# test=abstractDataSet.groupby()
for node in G.nodes:
    print(node)
    break

2101037


In [287]:
# community_abstracts = [0] * 30
# def kek(x):
#     return x
# kek = df.groupby('group').aggregate(kek)
# kek = df.groupby('group').agg({'authorID': sum})
df

Unnamed: 0,authorID,group,degree
0,2101037,0,10
1,3001795,0,2
2,2080155085,0,2
3,33570565,0,4
4,66118125,0,3
...,...,...,...
1266,10852593,8,2
1267,1734917,7,1
1268,9486542,11,2
1269,144188281,28,1


In [159]:
# Iterate over community
# create dataframe from graph



AttributeError: 'Graph' object has no attribute 'head'