In [None]:
import pandas as pd
import re
import networkx as nx

>* We are going back to week 5, when we analyzed social media data to build edges between users. We will use the data from week 3, which was collected from politicians' Twitter and Facebook accounts. We will use the data to build a social network graph and analyze the network.

In [None]:
# Load the week 3 politicians data set. The encoding is passed explicitly;
# presumably the file is not valid UTF-8 -- TODO confirm.
data=pd.read_csv('../week3/Political-media-DFE.csv', encoding='latin1')

In [None]:
data.columns

>* Let's subset the DataFrame to have `label`, `source`, and `text` columns

In [None]:
# .copy() gives df its own data, so the column assignments in the following
# cells modify df itself and do not trigger pandas' SettingWithCopyWarning.
df = data[['label', 'source', 'text']].copy()

>* With this DataFrame, we want to extract the name of the user from the `label` column and create a new column called `username` in the DataFrame.

In [None]:
# Capture everything that precedes the first "(" (minus trailing whitespace)
# once the "From: " prefix has been removed from the label.
pattern = re.compile(r'(.+?)\s*\(')
labels_without_prefix = df['label'].str.replace('From: ', '')
name_matches = labels_without_prefix.str.findall(pattern)
# Keep only the first match of each label.
df['username'] = name_matches.str[0]

>* Let's lowercase the `username` column

In [None]:
# Usernames are lowercased so that they compare equal to the lowercased
# names used when the edges are built later in the notebook.
df['username']=df['username'].str.lower()

> * Let's extract the mentions from the `text` column and create a new column `mention` with the mentions.

In [None]:
# Twitter handles may contain underscores, so "_" is part of the character
# class; without it "@john_doe" would be cut short to "@john".
pattern = re.compile(r'@[a-zA-Z0-9_]+')
# Lowercase while extracting so every mention is stored in one canonical form.
# Non-string cells (e.g. a missing text) produce an empty list instead of
# raising inside findall.
df['mention'] = df['text'].apply(
    lambda text: [handle.lower() for handle in pattern.findall(text)]
    if isinstance(text, str) else []
)

>* We also don't want to forget to convert the text into lowercase.

In [None]:
# Lowercase the post text in place. The mentions were already extracted (and
# lowercased) from the original text in the previous cell.
df['text']=df['text'].str.lower()

In [None]:
df.head(5)

> * Let's build edges between each user and the accounts they mention.
> * To do so, we will use the `mention` column and iterate over the rows to create an edge from the row's `username` to every account in its `mention` list.

>* The `label` column also tells us whether the political leader is a Representative or a Senator.
>* Let's use RegEx to extract that and create a dictionary called `title`.

In [None]:
# Match the opening parenthesis together with the letters that follow it.
pattern = re.compile(r'\([A-Za-z]*')
first_paren_token = df['label'].str.findall(pattern).str[0]
# Drop the "(" so that only the word itself is stored.
df['title'] = first_paren_token.str.strip('(')

In [None]:
# username -> title; when a username occurs in several rows, the last row wins.
title = dict(zip(df['username'], df['title']))

In [None]:
# Marker per username: 'o' for a Representative, 'x' for any other title.
title_shape = {}
for username, role in title.items():
    if role == 'Representative':
        title_shape[username] = 'o'
    else:
        title_shape[username] = 'x'

>* The `label` column also tells us which state the political leader is from.
>* Let's use RegEx to extract that and create a dictionary called `state`.
>* In the cell above, we mapped Representative to 'o' and every other title (e.g. Senator) to 'x' in `title_shape`, so that the values can be used as node shapes.

In [None]:
# Take everything from the first lowercase "from " to the end of the label,
# then remove the word "from", the closing parenthesis and outer whitespace.
pattern = re.compile(r'from\s.*')
# regex=False: both replacements are literal text. A bare ")" is not a valid
# regular expression, so this must not depend on the pandas default for `regex`.
df['state'] = (
    df['label'].str.findall(pattern).str[0]
    .str.replace('from', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.strip()
)

In [None]:
# username -> state; when a username occurs in several rows, the last row wins.
state = dict(zip(df['username'], df['state']))

In [None]:
import random
import seaborn as sns
# Get the unique values from the state dictionary.
# They are sorted so that every state receives the same color on every run:
# the iteration order of a set of strings can change between kernel sessions.
# key=str keeps the sort working even if a non-string (e.g. NaN) state is present.
unique_states = sorted(set(state.values()), key=str)

# Generate a color palette using seaborn (one color per unique state)
color_palette = sns.color_palette("hls", len(unique_states))

# Create a dictionary to map each unique state to a color
state_colors_dict = dict(zip(unique_states, color_palette))

In [None]:
df[df['username']=='trey radel']

In [None]:
from collections import defaultdict
# username -> color of that user's state. A state without an entry in
# state_colors_dict is kept unchanged as the value, as before.
color_mapped = {
    username: state_colors_dict.get(home_state, home_state)
    for username, home_state in state.items()
}

In [None]:
color_palette

In [None]:
color_mapped['trey radel']

In [None]:
#every row contributes one (username, mention) tuple per mention;
#rows whose mention list is empty contribute nothing
#lower() converts both ends of the edge to lowercase
#strip('@') removes the @ from the mention
edges = [
    (author.lower(), handle.strip('@').lower())
    for author, handles in zip(df['username'], df['mention'])
    for handle in handles
]

In [None]:
edges[:10]

In [None]:
len(edges)

>* We will generate an empty graph object `G` and populate the graph with the edges.

In [None]:
# nx.Graph is undirected: the edge (a, b) is the same edge as (b, a).
G = nx.Graph()

In [None]:
# Nodes are created as needed; a (user, mention) pair that occurs several
# times in `edges` is stored as a single edge.
G.add_edges_from(edges)

>* When you add edges, the graph object will add the nodes automatically.

In [None]:
len(list(G.nodes))

>* Let's add node attributes.
>* We will add the `title` attributes to the node.

In [None]:
df['username']

In [None]:
df['title']

>* We want to try iterating over the `G` object to assign the title as the value of the key `title`.
>* But this will return you an error with the message: `index 0 is out of bounds for axis 0 with size 0`
>* Let's think why this is happening
>* Q. Why is this happening?

In [None]:
#This lookup fails for some nodes. The error is caught and printed so that
#the rest of the notebook can still run with "Run All".
#Can you think why this error is happening?
try:
    for node in G.nodes():
        G.nodes[node]['title']=df[df['username']==node]['title'].unique()[0]
except IndexError as error:
    print(f'IndexError at node {node!r}: {error}')

>* It is because the graph contains nodes that are not in `df['username']`.
>* The nodes include accounts that were mentioned but never appear in `df['username']`, so we need to handle those nodes separately.

In [None]:
# Build the username -> title lookup once instead of scanning and filtering
# the whole DataFrame for every node. Keeping the first row per username
# gives the same value as the earlier `.unique()[0]`.
first_title_by_user = (
    df.drop_duplicates(subset='username')
    .set_index('username')['title']
    .to_dict()
)
for node in G.nodes():
    # if the node is not in the username column, we assign the title attribute as Unknown
    G.nodes[node]['title'] = first_title_by_user.get(node, 'Unknown')

In [None]:
G.nodes['nowthisnews']

>* We will add the `state` attributes to the node.
>* Similarly, we will add `Unknown` to the nodes that are not in the `df['username']`.

In [None]:
# Build the username -> state lookup once instead of scanning and filtering
# the whole DataFrame for every node. Keeping the first row per username
# gives the same value as the earlier `.unique()[0]`.
first_state_by_user = (
    df.drop_duplicates(subset='username')
    .set_index('username')['state']
    .to_dict()
)
for node in G.nodes():
    # Mentioned accounts that are not in the username column get 'Unknown'.
    G.nodes[node]['state'] = first_state_by_user.get(node, 'Unknown')

In [None]:
G.nodes['nowthisnews']

In [None]:
G.nodes['trey radel']

>* Now that we have a dictionary, `color_mapped`, that has usernames as keys and RGB tuples as values, we can use this dictionary to add the color to the nodes.

In [None]:
color_mapped['trey radel']

In [None]:
# color_mapped has one key per username in df, so a dictionary lookup with a
# default replaces the scan of df['username'] for every node.
# Nodes that are not a username (accounts that were only mentioned) are black.
for node in G.nodes():
    G.nodes[node]['color'] = color_mapped.get(node, (0,0,0))

>* Thus far, we calculated the degree of the nodes by hardcoding the values. But we can use the built-in function of networkx to calculate various centrality measures.

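>* Degree centrality: The number of edges that are connected to the node. `nx.degree_centrality` reports it as a fraction, dividing by the maximum possible degree (n − 1).
>* Betweenness centrality: The number of times the node acts as a bridge along the shortest path between two other nodes.
>* Closeness centrality: The reciprocal of the average length of the shortest path between the node and all other nodes it can reach, so a higher value means the node is closer to everyone else.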

>* Degree centrality: Node connectivity, local influence
>* Betweenness centrality: Bridging roles, broker
>* Closeness centrality: Proximity to other nodes, efficient communication

In [None]:
nx.degree_centrality(G)['trey radel'] #degree centrality

In [None]:
nx.betweenness_centrality(G)['trey radel'] #betweenness centrality
#e-06 means 10^-6 or 0.000001

In [None]:
nx.closeness_centrality(G)['trey radel'] #closeness centrality

>* Q. Who are the top 5 usernames who have the highest degree centrality?

In [None]:
sorted(nx.degree_centrality(G).items(), key=lambda x:x[1], reverse=True)[:5]

>* Let's put the degree centrality score as a node attribute called `degree`.
>* To do so, we use the `nx.set_node_attributes` method.

In [None]:
# Compute the scores first, then store them on the nodes under 'degree'.
degree_scores = nx.degree_centrality(G)
nx.set_node_attributes(G, degree_scores, 'degree')

>* Q. Who are the top 5 usernames who have the highest betweenness centrality?

In [None]:
sorted(nx.betweenness_centrality(G).items(), key=lambda x:x[1], reverse=True)[:5]

>* Let's put the betweenness centrality score as a node attribute called `betweenness`.
>* To do so, we use the `nx.set_node_attributes` method.

In [None]:
# Compute the scores first, then store them on the nodes under 'betweenness'.
betweenness_scores = nx.betweenness_centrality(G)
nx.set_node_attributes(G, betweenness_scores, 'betweenness')

>* Q. Who are the top 5 usernames who have the highest closeness centrality?

In [None]:
sorted(nx.closeness_centrality(G).items(), key=lambda x:x[1], reverse=True)[:5]

>* Let's put the closeness centrality score as a node attribute called `closeness`.
>* To do so, we use the `nx.set_node_attributes` method.

In [None]:
# Compute the scores first, then store them on the nodes under 'closeness'.
closeness_scores = nx.closeness_centrality(G)
nx.set_node_attributes(G, closeness_scores, 'closeness')

In [None]:
G.nodes['trey radel']

>* Homophily is the tendency of individuals to associate and bond with similar others.
>* We can measure homophily by comparing the number of edges between nodes of the same type to the number of edges between nodes of different types.
>* One popular way to measure node-level homophily is the E-I index proposed by Krackhardt and Stern (1988).
>* https://doi.org/10.2307/2786835

<img src="../week8/ei-index.png" width=500px height=500px />

>* Unfortunately, networkx does not have a built-in function to calculate the E-I index.
>* We will make a function to calculate the E-I index.

>* Let's create `G_eiindex` for the E-I index calculation.

In [None]:
# Small toy network used to illustrate the E-I index.
ei_edges = [
    ('mishra', 'park'),
    ('singh', 'xiao'),
    ('xiao', 'mishra'),
    ('xiao', 'park'),
    ('simon', 'park'),
    ('simon', 'xiao'),
    ('simon', 'mishra'),
    ('simon', 'singh'),
]
G_eiindex = nx.Graph()
G_eiindex.add_edges_from(ei_edges)

In [None]:
# One table of per-node attributes instead of ten separate assignments.
# ('Senator' was previously misspelled as 'Sentor'.)
ei_node_attributes = {
    'mishra': {'title': 'Representative', 'color': 'blue'},
    'singh': {'title': 'Representative', 'color': 'blue'},
    'xiao': {'title': 'Representative', 'color': 'blue'},
    'park': {'title': 'Senator', 'color': 'red'},
    'simon': {'title': 'Senator', 'color': 'red'},
}
# With a dict of dicts and no attribute name, each inner dict updates the
# attributes of the corresponding node.
nx.set_node_attributes(G_eiindex, ei_node_attributes)

In [None]:
# Collect the colors in node order so they line up with the drawn nodes.
node_colors = [attrs['color'] for _, attrs in G_eiindex.nodes(data=True)]
nx.draw(G_eiindex, with_labels=True, node_color=node_colors)

In [None]:
def ego_EI_idx(graph_object, attribute='title'):
    """Compute the ego-level E-I index of every node and store it on the graph.

    For each node, ties to neighbours whose ``attribute`` value differs from
    the node's own value count as external (E), and ties to neighbours with
    the same value count as internal (I). The index is ``(E - I) / (E + I)``:
    -1 when all ties are internal, +1 when all ties are external. A node
    without any ties gets 0.

    Parameters
    ----------
    graph_object : networkx graph
        Graph whose nodes carry ``attribute``.
    attribute : str, default 'title'
        Name of the node attribute that defines group membership.

    Returns
    -------
    dict
        Mapping of node -> E-I index. The same value is also written to every
        node as the ``ei_idx`` attribute.

    Raises
    ------
    KeyError
        If a node that has ties, or one of its neighbours, lacks ``attribute``.
    """
    EI_dic = {}
    for node in graph_object.nodes:
        external = 0
        internal = 0
        # edges(node) yields (node, neighbour) pairs; the second item is the neighbour.
        for _, neighbour in graph_object.edges(node):
            if graph_object.nodes[neighbour][attribute] == graph_object.nodes[node][attribute]:
                internal += 1
            else:
                external += 1
        ties = external + internal
        # Guard against division by zero for nodes without ties.
        EI_dic[node] = (external - internal) / ties if ties else 0
    for node, score in EI_dic.items():
        graph_object.nodes[node]['ei_idx'] = score
    return EI_dic

In [None]:
ego_EI_idx(G_eiindex)

In [None]:
G_eiindex.nodes['mishra']

>* In order to calculate the graph-level homophily, there is another method called `assortativity coefficient`.
>* The assortativity coefficient is a measure used to quantify the degree to which nodes in a network tend to be connected to other nodes that are similar or dissimilar.
> * 1: Perfect assortative
> * -1: Perfect disassortative

In [None]:
nx.attribute_assortativity_coefficient(G_eiindex, 'title')

>* If we want to measure the level of clustering in the network, we can use (1) transitivity and (2) clustering coefficient.
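>* `Transitivity` is the fraction of connected triplets (two edges that share a node) that are closed into triangles: 3 × (number of triangles) / (number of triplets).
>* `Clustering coefficient` is the node-level version: the fraction of pairs of a node's neighbors that are themselves connected to each other.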
>* `Average clustering` is the average clustering coefficient of all the nodes in the network (Graph-level clustering).

> * Why do we look at `triangles` in the network?
>* https://faculty.ucr.edu/~hanneman/nettext/C8_Embedding.html
>* https://bryangraham.github.io/econometrics/downloads/working_papers/DynamicNetworks/Homophily_and_Transitivity_April2016.pdf

<img src="../week8/transitivity.png" width=3500px height=80px />

<img src="../week8/transitivity-figure.png" width=800px height=300px />

<img src="../week8/transitivity-1.png" width=700px height=70px />

<img src="../week8/transitivity-2.png" width=700px height=120px />

<img src="../week8/transitivity-3.png" width=700px height=120px />

<img src="../week8/triadic_closure.png" width=300px height=700px />

In [None]:
nx.transitivity(G_eiindex) #transitivity 

<img src="../week8/transitivity-metric.png" width=300px height=100px />

In [None]:
nx.clustering(G_eiindex) #clustering coefficient

<img src="../week8/clustering-metric.png" width=500px height=100px />

In [None]:
nx.average_clustering(G_eiindex) #average clustering coefficient

>* Louvain community detection algorithm is a method to detect the communities in the network.
>* First, it iteratively optimizes the modularity score of the network by moving nodes between communities.
>* Modularity: a measure of the structure of networks or graphs which measures the strength of division of a network into modules (also called groups, clusters or communities).
>* Second, it stops when the modularity score cannot be increased further.
>* Third, it returns the communities as the output.

<img src="../week8/louvain_community.png" width=700px height=300px />

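>* `pip install python-louvain` (this is the package that provides the `community` module imported below)
>* Do not run `pip install community`: that is a different PyPI package with the same import name, and it conflicts with `python-louvain`.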

In [None]:
import community.community_louvain
# The call below uses NetworkX's own Louvain implementation (nx.community),
# not the module imported above.
# Louvain is randomized, so a fixed seed keeps the number of communities
# the same on every run.
len(nx.community.louvain_communities(G, seed=42)) #community detection

#### Practice

In [None]:
# Practice data set. Note that this rebinds `data`, which earlier in the
# notebook held the politicians data set.
data=pd.read_csv('../week5/subset-2021-01-11-voter_fraud.csv')

>* Let's see what it has for column names.

In [None]:
#YOUR CODE HERE

>* Let's subset the data to have only `bodywithurls`, `username`, `followers`, and `following` columns.

In [None]:
#YOUR CODE HERE

>* Q. Print the first 5 rows of the DataFrame.

In [None]:
#YOUR CODE HERE

>* Make the `username` column lowercase.

In [None]:
#YOUR CODE HERE

> * Looks like there are duplicates in the dataset. Let's remove the duplicates.

In [None]:
#YOUR CODE HERE

> * Let's extract the mentions from the `bodywithurls` column and create a new column `mentions` with the mentions.

In [None]:
#YOUR CODE HERE

> * Let's build edges between the users who have mentioned.
> * To do so, we will use the `mentions` column and iterate over the rows to create an edge from each row's `username` to every account in its `mentions` list.

In [None]:
#YOUR CODE HERE

>* Create an empty graph object `P` 

In [None]:
#YOUR CODE HERE

>* Populate the graph `P` with the edges (the list of tuples that represent the edges)
>* You may have to use the `add_edges_from` method to add the edges to the graph.

In [None]:
#YOUR CODE HERE

>* How many unique nodes are there in the graph `P`?
>* Use `.nodes()` method to get the unique nodes.

In [None]:
#YOUR CODE HERE

>* In the DataFrame, there is a column called `followers` and `following`.
>* Let's add the `followers` and `following` as the node attributes to the graph `P`.
>* Remember how to deal with the nodes that are not in `username` column. If the node is not in the `username` column, add 0 for `followers` and `following` attributes.

In [None]:
#YOUR CODE HERE

>* Calculate the degree centrality of the graph `P` and assign the value of the degree centrality to the node as the node attribute.
>* Use the `nx.degree_centrality` method to calculate the degree centrality.
>* The name of attribute should be `degree_centrality`.

In [None]:
#YOUR CODE HERE