In [None]:
# I have been researching Neo4j and Spark GraphX

# For now, see whether this dataset is small enough for networkx

# Also work on normalizing to N2 form

# I am beginning with a simple retweet graph (nodes = users, edges = tweet propagation count)

# Path of the SQLite database file opened with sqlite3.connect further down
db_path = 'data/tweets_GILLUM/gillum_tweets_N1.db'
# Name of the table the SELECT queries in this notebook read from
table_name = 'tweets'


In [None]:
import networkx as nx
import numpy as np
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from timeit import default_timer as timer
import string

In [None]:
# HELPER FUNCTIONS

def is_retweet(status):
    """Return True when ``status`` looks like a retweet of the form "RT @user: ...".

    The check requires the literal prefix ``"RT @"`` so that ordinary tweets
    which merely begin with the letters "RT" (e.g. "RTFM ...") are not
    counted as retweets.

    Parameters
    ----------
    status : str or None
        Raw tweet text. ``None`` (e.g. a NULL database value) and the empty
        string are treated as "not a retweet" instead of raising.

    Returns
    -------
    bool
    """
    if not status:
        # None or empty text can never be a retweet
        return False
    return status.startswith('RT @')  # todo do a more thorough check on this
    
def parse_retweet(status):
    """Split a retweet of the form ``"RT @user: text"`` into its text and source user.

    Everything between the first ``"RT"`` marker and the first following
    colon is taken as the source user (surrounding whitespace removed, the
    leading ``@`` kept). Everything after that colon is returned verbatim as
    the tweet text, so later colons (URLs, times) and later occurrences of
    "RT" inside the text are preserved.

    Parameters
    ----------
    status : str
        Raw tweet text containing an ``"RT"`` marker.

    Returns
    -------
    tuple of (str, str)
        ``(status_clean, source_user)``. When there is no colon after the
        marker, ``status_clean`` is ``""`` and ``source_user`` is the whole
        remainder of the text.

    Raises
    ------
    ValueError
        If ``status`` contains no ``"RT"`` marker at all.
    """
    before_marker, marker, remainder = status.partition('RT')
    if not marker:
        raise ValueError('no "RT" marker found in status: {!r}'.format(status))

    # partition splits on the FIRST colon only, leaving the rest of the text intact
    header, colon, status_clean = remainder.partition(':')
    source_user = header.strip()  # drop the surrounding spaces (keep the @)

    return (status_clean, source_user)
    
# todo parse hashtags, parse urls, parse mentions

In [None]:
# open the SQLite database at db_path and keep a cursor for the queries below
db = sqlite3.connect(db_path)
cursor = db.cursor()  # cursor method
print('database connected')

In [None]:
# find the list of users
# NOTE: this cell is disabled - its body is wrapped in a triple-quoted string,
# so it is evaluated as a plain string expression and none of it runs.
# If re-enabled it would select the distinct SCREEN_NAME values from the
# table, prefix each with '@', collect them in screen_names and print the
# count and the elapsed time.
'''
start = timer()

cursor.execute('SELECT DISTINCT SCREEN_NAME FROM {}'.format(table_name))
screen_names = []
for item in cursor:
    screen_names.append('@' + item[0]) # add the @ for clarity

print(len(screen_names))

end = timer()
print("time elapsed: ")
print(end - start)
'''

In [None]:
# build a networkx graph
# NOTE: this cell is disabled - its body is wrapped in a triple-quoted string,
# so it is evaluated as a plain string expression and none of it runs.
# If re-enabled it would create a DiGraph and add one node per entry of
# screen_names (which is only defined by the other disabled cell above).
'''
start = timer()

G_retweet = nx.DiGraph() # todo directed
for screen_name in screen_names:
    #print(screen_name)
    G_retweet.add_node(screen_name) # todo add metadata (friend count, etc) as properties

end = timer()
print("time elapsed: ")
print(end - start)
'''

In [None]:
# loop through tweets and add edges where retweets are found
# Each edge points from the retweeted (source) user to the retweeting user;
# its "w" attribute counts how many such retweets were seen.

G_retweet = nx.DiGraph()
WEIGHT = 1  # contribution of a single retweet to the edge weight

start = timer()

# table_name is a notebook constant; identifiers cannot be bound as SQL parameters
cursor.execute('SELECT status_text,screen_name FROM {}'.format(table_name))
for status, raw_screen_name in cursor:
    if not status or not raw_screen_name:
        # NULL / empty columns cannot describe a retweet
        continue

    screen_name = '@' + raw_screen_name # add the @ character for clarity
    if is_retweet(status):
        (clean_tweet, source_user) = parse_retweet(status)

        # add an edge to this user from the source_user, or bump its weight.
        # The weight lives in the edge attribute dict, so it must be updated
        # through the "w" key; applying += to G[u][v] itself always raises.
        if G_retweet.has_edge(source_user, screen_name):
            G_retweet[source_user][screen_name]["w"] += WEIGHT
        else:
            # keyword form of the attribute is accepted by both old and new networkx
            G_retweet.add_edge(source_user, screen_name, w=WEIGHT)

end = timer()
print("time elapsed: ")
print(end - start)


In [None]:
# sanity check - look at the degree distribution

# Wrapping in dict(...) works whether in_degree/out_degree return a plain
# dict (older networkx) or an iterable degree view of (node, degree) pairs
# (newer networkx), which has no .values() method.
in_degree_list = list(dict(G_retweet.in_degree(weight='w')).values())
out_degree_list = list(dict(G_retweet.out_degree(weight='w')).values())
print( str(max(out_degree_list)) + " is the maximum out degree in the network")

plt.figure()
plt.hist(in_degree_list)
plt.title('in degree')
plt.xlabel('weighted in degree')
plt.ylabel('number of nodes')
plt.show()

plt.figure() # todo plot on a log axis
plt.hist(out_degree_list)
plt.title('out degree')
plt.xlabel('weighted out degree')
plt.ylabel('number of nodes')
plt.show()


In [None]:
# isolate the highest out-degree nodes (influencers)
THRESH_out = 99.75  # percentile cut-off applied to the weighted out-degrees
thresh_out = np.percentile(out_degree_list, THRESH_out)
print(thresh_out)

# keep every node whose weighted out-degree lies strictly above the cut-off
influencers = [
    candidate
    for candidate in G_retweet.nodes()
    if G_retweet.out_degree(candidate, weight='w') > thresh_out
]
print(len(influencers))

THRESH_in = 99  # percentile cut-off applied to the weighted in-degrees
thresh_in = np.percentile(in_degree_list, THRESH_in)
# todo isolate the highest in-degree nodes (spammers) and plot in relation to influencers

In [None]:
# for writable method: https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.subgraph.html 
# Drawing-only options. The edge-weight key is NOT a drawing option: it is
# handed to the layout below, because draw_spring never forwards it to
# spring_layout (so it was ignored before).
options = {'node_color': 'black','node_size': 3,'width': 0.25, 'alpha': 0.5}


start = timer()

# Create a subgraph SG containing only the influencers and the edges among them
SG = nx.DiGraph()
SG.add_nodes_from(influencers)

#print(influencers[1:10])
for pre in influencers: # since this is small network just brute force it
    for post in influencers:
        if G_retweet.has_edge(pre,post):
            # copy the retweet count; keyword form works on old and new networkx
            SG.add_edge(pre, post, w=G_retweet[pre][post]["w"])
            #print('added edge')

end = timer()
print("time elapsed: ")
print(end - start)

plt.figure(figsize=(8,8))
#nx.draw_random(G_retweet_loudest)
# spring layout driven by the "w" edge attribute, then draw with that layout
pos = nx.spring_layout(SG, weight='w')
nx.draw(SG, pos, **options)
plt.show()



In [None]:
# draw as eigenvector centrality, for each tweet