# Create graph of users

- **Input:** List of users, List of followers of those users, List of friends of those users
- **Output:** Graph of users

- TODO: Paint color of node according to label

In [None]:
import pandas as pd
import pickle
import networkx as nx
import tweepy

# Input
# Excel workbook, and the sheet inside it, that lists the sampled users.
USERS_FILENAME = "data/firstvoters-samples116-info-with-labels.xlsx"
USERS_SHEET = "samples"

# Pickle files holding the followers / friends data of the sampled users.
FOLLOWERS_FILENAME_PKL = "data/samples116-followers.pkl"
FRIENDS_FILENAME_PKL = "data/samples116-friends.pkl"

# Output
# Placeholder name only: it is reassigned with a concrete value in the cell
# that writes the top-PageRank list.
TOP_PAGERANK_FILENAME = "data/samples116-pagerank-topXXXX.txt"

## Get sampled users

In [None]:
# Columns pulled from the users sheet; every other column is ignored.
USER_INFO_COLUMNS = [
    "screen_name",
    "friends_count",
    "followers_count",
    "statuses_count",
    "created_at",
    "protected",
    "verified",
]

# Fields copied verbatim from a sheet row into the per-user record.
_COPIED_FIELDS = (
    "protected",
    "verified",
    "followers_count",
    "friends_count",
    "statuses_count",
)

# users: screen_name -> dict of profile attributes for each sampled (seed) user.
users = {}
df = pd.read_excel(USERS_FILENAME, sheet_name=USERS_SHEET, usecols=USER_INFO_COLUMNS)
for _, row in df.iterrows():
    info = {field: row[field] for field in _COPIED_FIELDS}
    # Stored as an ISO-8601 string rather than a timestamp object.
    info["created_at"] = row["created_at"].isoformat()
    # Every user read from the sheet is a seed node of the graph.
    info["seed_node"] = True
    users[row["screen_name"]] = info
del df  # the DataFrame is no longer needed once `users` is built

print(f"Filename (sheetname): {USERS_FILENAME} ({USERS_SHEET})")
print(f"Users: {len(users)}")

## Dict containing info about followers of the sampled users
Read the dict from a pickled file

In [None]:
# Start from an empty mapping, then replace it with the unpickled data.
# NOTE(review): later cells index this by seed screen name and iterate
# objects exposing `.screen_name` etc. — confirm the pickle's exact schema.
followers = {}
with open(FOLLOWERS_FILENAME_PKL, "rb") as pkl_in:
    followers = pickle.load(pkl_in)

print(f"Filename: {FOLLOWERS_FILENAME_PKL}")
print(f"Followers: {len(followers)}")

## Dict containing info about friends ("following") of the sampled users
Read the dict from a pickled file

(friends = users that the sampled users are following)

In [None]:
# Start from an empty mapping, then replace it with the unpickled data.
# NOTE(review): later cells index this by seed screen name and iterate
# objects exposing `.screen_name` etc. — confirm the pickle's exact schema.
friends = {}
with open(FRIENDS_FILENAME_PKL, "rb") as pkl_in:
    friends = pickle.load(pkl_in)

print(f"Filename: {FRIENDS_FILENAME_PKL}")
print(f"Friends: {len(friends)}")

## Make sure all dictionaries (followers/followings) have the same keys

Some data may be missing. If that is the case, discard that user.

In [None]:
# Screen names to keep (currently every loaded user).
test_users = list(users.keys())
# Build the lookup set once: membership tests against the list would be
# O(len(test_users)) for every key in each of the three filters below.
_keep = set(test_users)

# Restrict all three dictionaries to the kept screen names.
users = {k: v for k, v in users.items() if k in _keep}
followers = {k: v for k, v in followers.items() if k in _keep}
friends = {k: v for k, v in friends.items() if k in _keep}

# Report, without removing anything yet, which users lack follower/friend data.
print("Users that do not have info about their followers:")
print(set(users.keys()).difference(set(followers.keys())))

print("Users that do not have info about their friends:")
print(set(users.keys()).difference(set(friends.keys())))

## Remove users that do not have followers or friends info

In [None]:
# Trim the three dictionaries in place so they end up with the same keys.
# Each key difference is materialised as a new set before the loop starts,
# so deleting entries while looping is safe.

# Users with follower data but no friend data.
for orphan in followers.keys() - friends.keys():
    del followers[orphan]

# Users with friend data but no follower data.
for orphan in friends.keys() - followers.keys():
    del friends[orphan]

# Users missing from what remains of `friends`.
for orphan in users.keys() - friends.keys():
    del users[orphan]

print(f"Remaining users: {len(users)}")

## Create a graph (Approach 1)

Choose either this one OR the one below.

In [None]:
# Directed graph of accounts; an edge A -> B means "A follows B".
g = nx.DiGraph()


def _profile_attrs(account):
    """Return the node attributes for a non-seed follower/friend object.

    `account` must expose the attributes read below (screen_name, protected,
    verified, followers_count, friends_count, statuses_count, created_at).
    """
    created_at = account.created_at
    # Seed users store created_at as an ISO string; do the same here.
    # NOTE(review): assumes a non-string created_at is datetime-like — confirm.
    if hasattr(created_at, "isoformat"):
        created_at = created_at.isoformat()
    return {
        "label": account.screen_name,
        "screen_name": account.screen_name,
        "protected": account.protected,
        "verified": account.verified,
        "followers_count": account.followers_count,
        "friends_count": account.friends_count,
        "statuses_count": account.statuses_count,
        "created_at": created_at,
        "seed_node": False,
    }


# Seed users. Attributes are passed as keyword arguments: NetworkX (2.x+)
# stores each keyword as a node attribute, so `attr_dict={...}` would create a
# single attribute literally named "attr_dict" instead of the intended ones.
for name, info in users.items():
    g.add_node(
        name,
        label=name,
        screen_name=name,
        protected=info["protected"],
        verified=info["verified"],
        followers_count=info["followers_count"],
        friends_count=info["friends_count"],
        statuses_count=info["statuses_count"],
        created_at=info["created_at"],
        seed_node=True,
    )

for name, accounts in followers.items():
    for account in accounts:
        # add_node() on an existing node updates its attributes, so skip seed
        # users here to avoid resetting their seed_node flag to False.
        if account.screen_name not in users:
            g.add_node(account.screen_name, **_profile_attrs(account))
        g.add_edge(account.screen_name, name)  # 1 follows 2

for name, accounts in friends.items():
    for account in accounts:
        if account.screen_name not in users:
            g.add_node(account.screen_name, **_profile_attrs(account))
        g.add_edge(name, account.screen_name)  # 1 follows 2

print(f"Nodes: {g.number_of_nodes():,}")
print(f"Edges: {g.number_of_edges():,}")

## Create a graph (Approach 2)

Choose either this one OR the one above.

In [None]:
# Directed graph of accounts with a reduced attribute set and a display colour;
# an edge A -> B means "A follows B".
g = nx.DiGraph()

# Seed users are coloured red.
for name, info in users.items():
    g.add_node(
        name,
        label=name,
        followers_count=info["followers_count"],
        friends_count=info["friends_count"],
        seed_node=True,
        color="red",
    )

for name, accounts in followers.items():
    for account in accounts:
        # add_node() on an existing node updates its attributes, so skip seed
        # users here; otherwise a seed who follows another seed would be
        # flipped to seed_node=False / color="blue".
        if account.screen_name not in users:
            g.add_node(
                account.screen_name,
                label=account.screen_name,
                followers_count=account.followers_count,
                friends_count=account.friends_count,
                seed_node=False,
                color="blue",
            )
        g.add_edge(account.screen_name, name)  # 1 follows 2

for name, accounts in friends.items():
    for account in accounts:
        if account.screen_name not in users:
            g.add_node(
                account.screen_name,
                label=account.screen_name,
                followers_count=account.followers_count,
                friends_count=account.friends_count,
                seed_node=False,
                color="blue",
            )
        g.add_edge(name, account.screen_name)  # 1 follows 2

print(f"Nodes: {g.number_of_nodes():,}")
print(f"Edges: {g.number_of_edges():,}")

## Calculate PageRank

In [None]:
import operator

# PageRank score of every node in g, as returned by networkx.
pr = nx.pagerank(g)

# (node, score) pairs ordered from highest to lowest score.
sorted_x = list(pr.items())
sorted_x.sort(key=operator.itemgetter(1), reverse=True)

# Notebook display: the ten best-ranked nodes.
sorted_x[:10]

In [None]:
# Number of best-ranked nodes to export.
TOP_N = 1200

# The file name is derived from TOP_N so that it cannot drift from the number
# of names requested below.
TOP_PAGERANK_FILENAME = f"data/samples116-pagerank-top{TOP_N}.txt"

# sorted_x holds (name, score) pairs, best first; keep only the names.
top_pagerank = [name for name, _ in sorted_x[:TOP_N]]  # Get Top N

# One name per line, with no trailing newline.
with open(TOP_PAGERANK_FILENAME, "w") as file:
    file.write("\n".join(top_pagerank))

## Save graph as Pickle and GraphML

In [None]:
import pickle

# Output
GRAPH_FILENAME_PKL = "data/samples116-graph.pkl"
GRAPH_FILENAME_GRAPHML = "data/samples116-graph.graphml"

# 1) Binary pickle of the graph object.
with open(GRAPH_FILENAME_PKL, "wb") as pkl_out:
    pickle.dump(g, pkl_out)

# 2) GraphML export of the same graph.
nx.write_graphml_lxml(g, GRAPH_FILENAME_GRAPHML)

In [None]:
# Load the graph pickle

#import pickle
#import networkx as nx
#
#GRAPH_FILENAME_PKL = "data/samples116-graph.pkl"
#
#with open(GRAPH_FILENAME_PKL, "rb") as file:
#    g = pickle.load(file)
#
#len(g)

## Try drawing the graph (can be very slow)

In [None]:
%matplotlib inline
nx.draw(g, node_shape="o", alpha=0.5, linewidths=4, width=2, edge_color="grey")