## Social Network Analysis

## NetworkX - Build Network from Twitter Data

Note that it may be faster to run this on a GPU.

Import relevant libraries:

In [0]:
import networkx as nx
import json
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt

Define global parameters:

In [0]:
# Path of the JSON file holding the Twitter follower records; a relative path
# is resolved against the notebook's working directory when the file is opened.
twitter_followers_path = r'twitter_followers.json'

Define relevant functions:

In [0]:
#Get set of user_ids
def get_unique_ids(followers_dict_list):
    """Collect every distinct screen name mentioned in the followers data.

    Args:
        followers_dict_list: iterable of dicts. Only the first key/value pair
            of each dict is read: the key is a user's screen name and the
            value is an iterable of screen names linked to that user.

    Returns:
        list: every screen name (users and the names listed under them)
        exactly once, in arbitrary order.
    """
    unique_screen_names = set()
    for follower in followers_dict_list:
        if not follower:
            # An empty record names nobody; skip it rather than fail on it.
            continue
        screen_name, followers_list = next(iter(follower.items()))
        # Growing a set in place avoids copying an ever-longer list on every
        # iteration, which made the previous list concatenation quadratic.
        unique_screen_names.update(followers_list)
        unique_screen_names.add(screen_name)
    return list(unique_screen_names)

#Get unique edges
def clean_edge_list(edge_list):
    """Drop duplicate, reversed-duplicate and self-loop edges.

    Args:
        edge_list: iterable of (a, b) pairs whose endpoints are hashable.

    Returns:
        list: a new list with the first occurrence of each undirected edge,
        in input order and with the orientation it was first seen in. Pairs
        whose two endpoints are equal are left out.
    """
    seen = set()
    unique_edges = []
    for (a, b) in edge_list:
        if a == b:
            # Self-loop: not a connection between two different users.
            continue
        # Set membership is O(1); the earlier list scan made this quadratic.
        if (a, b) in seen or (b, a) in seen:
            continue
        seen.add((a, b))
        unique_edges.append((a, b))
    return unique_edges

#Get edge list
def get_edges(followers_dict_list):
    """Build the undirected edge list from the followers data.

    Args:
        followers_dict_list: iterable of dicts. Only the first key/value pair
            of each dict is read: the key is a user's screen name and the
            value is an iterable of screen names linked to that user.

    Returns:
        list: (screen_name, friend) tuples as returned by clean_edge_list,
        i.e. without duplicates, reversed duplicates or self-loops.
    """
    edge_list = []
    for relationship in followers_dict_list:
        if not relationship:
            # An empty record contributes no edges.
            continue
        # Look the source user up once per record, not once per friend.
        screen_name, friends = next(iter(relationship.items()))
        edge_list.extend((screen_name, friend) for friend in friends)
    # dict.fromkeys removes exact duplicates while keeping first-seen order,
    # so the resulting edge order and orientation are the same on every run
    # (going through a set made them depend on hash ordering).
    edge_list = list(dict.fromkeys(edge_list))
    return clean_edge_list(edge_list)

#build graph
def build_ego_graph(unique_screen_names, edge_list_out):
    """Create an undirected NetworkX graph from node names and edge pairs.

    Args:
        unique_screen_names: iterable of node identifiers to add.
        edge_list_out: iterable of (a, b) pairs to add as edges.

    Returns:
        nx.Graph: the populated graph.
    """
    graph = nx.Graph()
    # Nodes are added before edges, so a name without any edge is still a
    # node of the graph.
    graph.add_nodes_from(unique_screen_names)
    graph.add_edges_from(edge_list_out)
    return graph

Let's import our twitter followers list and print the first few items:

In [7]:
# Decode the file explicitly as UTF-8 (the standard JSON encoding) instead of
# relying on the platform's default text encoding.
with open(twitter_followers_path, 'r', encoding='utf-8') as twitter_file:
    followers_list = json.load(twitter_file)

# Show a small sample of the records and the total record count.
print(followers_list[:10])
print(len(followers_list))

[{'Komarova220899': ['Saelky1', 'namedmeklass', 'esia96', 'Arsenteva_katy']}, {'kmUloUTTY30jlMP': ['Komarova220899', 'vitalikcalm', '_brkv_', 'bokkarev1', 'Malyshe2002', '__ELIZAVETA__', 'moskalevaal01', 'hellofriends92', 'marina_alek', 'l0224m', 'shidagis04033', 'Alina_Kata_', 'Polina_Ivchenko', 'imlerachka', 'ninka_ushastyy', 'dashadasha25', 'Koshka23082012', '_Tomilina_']}, {'svLja3KwvMDcrGi': []}, {'sshhfq': ['87FvuoW2GbufOPU', 'k_chernenko', 'moskalevaal01', 'sofika_999', 'alexandr_25_17', '2QfpBoA43t1tzcL', '_Karimova_15', 'v_malenda', 'mokretsova_38', 'davidovaeliz02', 'vlasevskaya2016', 'TSerega_138RUS', '666Sofya', 'anastasssss_', 'shidagis04033', 'bezenkova_777']}, {'v71xk': []}, {'v_malenda': ['_wicked19_', 'sshhfq7', 'ti7s0d7v1tOh3K4', '9XbxzD6XfV73duz', 'alexandr_25_17', 'crybabycry050', 'rusaliiinaaa', '_vorotilova_', 'sshhfq', 'Petstore_ru']}, {'Km9873185647': ['_vorotilova_', 'mokretsova_38']}, {'neoeroakauraaka': ['dkkcp', 'yokopvawjbwm', 'sefurematuri', 'iKPE8GV55', '

Now let's clean up the followers list, extract the useful information, and build our graph object (note that some of these processes may take a while...):

In [0]:
#Get unique user ids: every screen name that appears in the followers data,
#either as a record's key or inside a record's list
unique_screen_names = get_unique_ids(followers_list)

#Get connections (edges): one (screen name, listed name) pair per connection,
#cleaned by get_edges so each undirected pair appears once
edge_list_out = get_edges(followers_list)

# New Section

In [13]:
# The f-string formats the integer itself, so no explicit str() is needed.
f'Total Unique users: {len(unique_screen_names)}'

'Total Unique users: 51974'

In [14]:
# The f-string formats the integer itself, so no explicit str() is needed.
f'Total Graph Edges: {len(edge_list_out)}'

'Total Graph Edges: 56553'

In [0]:
#Instantiate graph object: an undirected graph with one node per screen name
#and one edge per cleaned connection
G = build_ego_graph(unique_screen_names, edge_list_out)

Now, let's prune the graph to remove screen names with only one connection to aid in visualization:

In [0]:
#Calculate degree centralities
degree_centralities = nx.degree_centrality(G)

#Get nodes of one degree. The degree is read from the graph itself: the
#smallest centrality value only means "one connection" when the graph has no
#isolated nodes, and comparing floats for equality is fragile.
one_degree_nodes = [node for node, degree in G.degree() if degree == 1]

#Create copy of the graph so that G keeps every node
H = G.copy()

#Remove nodes with only one connection
H.remove_nodes_from(one_degree_nodes)

In [18]:
# len() of a graph is its number of nodes.
f'Total nodes before pruning: {len(G)}'

'Total nodes before pruning: 51974'

In [19]:
# len() of a graph is its number of nodes.
f'Total nodes after pruning: {len(H)}'

'Total nodes after pruning: 2805'

Now let's use the betweenness centralities to find the 10 most influential nodes (this may take a while...):

In [20]:
# Order a list of tuples from largest to smallest first element
def Sort_Tuple(tup):
    """Sort a list of tuples in place, descending by first element.

    Args:
        tup: list of indexable items; it is modified in place.

    Returns:
        The same list object, now ordered by item[0] from largest to
        smallest.
    """
    def first_item(entry):
        return entry[0]

    # Stable ascending sort followed by a full reversal; this also reverses
    # the relative order of entries that share the same first element.
    tup.sort(key=first_item)
    tup.reverse()
    return tup

#Calculate betweenness centralities on the pruned graph H
betweenness_centralities = nx.betweenness_centrality(H)

#Flip each (node, score) entry to (score, node) so that Sort_Tuple, which
#orders by the first tuple element, ranks the entries by score (highest first)
inverse = [(value, key) for key, value in betweenness_centralities.items()]
inverse_sort = Sort_Tuple(inverse)
#Display the 10 entries with the highest betweenness centrality
inverse_sort[:10]

[(0.6703929635387542, 'theenkay'),
 (0.21028671517808692, 'IolantaGoethe'),
 (0.14802096285965868, 'Arsenteva_katy'),
 (0.05052115671338203, 'sshhfq7'),
 (0.047777069978608096, 'Koshka23082012'),
 (0.03345665130848052, 'kristiesse'),
 (0.023529963238279127, 'babskii_rai'),
 (0.022892402121216574, 'MehribanYsupova'),
 (0.018154638271734656, 'malaiys'),
 (0.01791300586477669, 'lonely_sweet_')]