In [1]:
import json, math
import matplotlib.pyplot as plt
import networkx as nx
from pprint import pprint

In [2]:
# Directory holding every input file read by this notebook.
DATA_DIR = 'reddit data'


def _read_json(name):
    """Parse and return the JSON document stored in DATA_DIR/name.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(DATA_DIR + '/' + name, encoding='utf-8') as f:
        return json.load(f)


def _read_source_list(name):
    """Return the non-blank lines of DATA_DIR/name as a set.

    Each line is stripped of surrounding whitespace and lower-cased.
    """
    with open(DATA_DIR + '/' + name, encoding='utf-8') as f:
        return {line.strip().lower() for line in f if line.strip() != ''}


# Iterated below as user -> subs containers (see the inversion cell).
left = _read_json('left.json')
right = _read_json('right.json')

left_subs = set(_read_json('left_subs.json'))
right_subs = set(_read_json('right_subs.json'))
left_users = set(_read_json('left_users.json'))
right_users = set(_read_json('right_users.json'))

left_sources = _read_source_list('left_sources.txt')
right_sources = _read_source_list('right_sources.txt')

all_subs = _read_json('all_subs.json')

In [3]:
# Invert the two user -> subs mappings into one sub -> {users} mapping.
# Every sub listed in all_subs starts with an empty member set; a sub that
# appears for a user but not in all_subs raises KeyError.
sub_users = {sub: set() for sub in all_subs}
for user_to_subs in (left, right):
    for user in user_to_subs:
        for sub in user_to_subs[user]:
            sub_users[sub].add(user)

In [4]:
# Keep only subs with at least 1500 known members, then collect every user
# that belongs to at least one of those subs.
counts = {sub: len(members) for sub, members in sub_users.items()}
large_subs = {sub: members for sub, members in sub_users.items() if len(members) >= 1500}
print(len(large_subs))
large_sub_users = set().union(*large_subs.values())
print(len(large_sub_users))


59
38795


In [5]:
# Count, for every political source sub `s` that is also a large sub, how
# many of its members also belong to each other large sub `s2`.
#
# Pairs are visited in sorted order and the overlap is taken with a set
# intersection, so the dict's insertion order no longer follows the iteration
# order of a set of user names (which is not stable between interpreter
# runs). Node order in the graph built from this dict is therefore
# reproducible. The resulting keys and counts are unchanged.
political_sub_edges = dict()
sources_set = set(left_sources).union(right_sources)
sorted_subs = sorted(large_subs)

for s in sorted(sources_set):
    # NOTE(review): source names are lower-cased on load; this lookup only
    # matches if large_subs keys use the same casing -- confirm.
    if s not in large_subs:
        continue
    members = large_subs[s]
    for s2 in sorted_subs:
        if s2 == s:
            continue
        shared = len(members & large_subs[s2])
        # Pairs with no common member are left out, as before.
        if shared:
            political_sub_edges[(s, s2)] = shared

In [6]:
MAX_SIZE = 50
MAX_WEIGHT = 30

# Rescale raw overlap counts so the heaviest edge gets weight MAX_WEIGHT.
max_weight = max(political_sub_edges.values())
G = nx.Graph()
for (sub_a, sub_b), shared in political_sub_edges.items():
    G.add_edge(sub_a, sub_b, weight=shared / max_weight * MAX_WEIGHT)
   
def score_sub(sub):
    """Return the partisan lean of `sub`, offset by the left/right user imbalance.

    Every member of ``large_subs[sub]`` contributes -1 when it is in
    ``left_users`` and +1 when it is in ``right_users`` (a member of both
    nets to 0). The mean contribution is then shifted by
    ``(len(left_users) - len(right_users)) / (len(left_users) + len(right_users))``.
    Negative values lean left, positive values lean right.
    """
    members = large_subs[sub]
    net = sum((u in right_users) - (u in left_users) for u in members)
    n_left = len(left_users)
    n_right = len(right_users)
    imbalance = (n_left - n_right) / (n_left + n_right)
    return net / len(members) + imbalance

def calculate_color(s, score=None):
    """Return a ``#rr00bb`` hex colour for sub `s`, blue for low scores and red for high.

    Parameters
    ----------
    s : sub name; only used to look up the score when `score` is not given.
    score : optional precomputed score. When None, ``score_sub(s)`` is used.

    The score is mapped with ``(score + 1) / 2``; red is ``255 * mapped`` and
    blue is ``255 * (1.1 - mapped)``. Both channels are clamped to 0..255 and
    written as two zero-padded hex digits, so the result is always a
    well-formed 7-character colour string. Without this, a channel below 16
    gave one digit and a blue channel above 255 gave three.
    """
    if score is None:
        score = score_sub(s)
    sub_score = (score + 1) / 2

    def channel(fraction):
        # Scale to 0..255 and clamp; scores can fall outside [-1, 1].
        return min(255, max(0, math.floor(255 * fraction)))

    r = channel(sub_score)
    b = channel(1.1 - sub_score)
    return '#%02x00%02x' % (r, b)
    
# Per-node fill colour derived from each sub's partisan score.
colors = dict((sub, calculate_color(sub)) for sub in G.nodes)
left_sources_set = set(left_sources)
right_sources_set = set(right_sources)

# Node size is proportional to membership; the largest sub gets MAX_SIZE.
max_size = max(map(len, (large_subs[sub] for sub in G.nodes)))
node_sizes = dict((sub, len(large_subs[sub]) / max_size * MAX_SIZE) for sub in G.nodes)

In [7]:
from bokeh.io import show, output_file, output_notebook, reset_output
from bokeh.plotting import figure
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, BoxZoomTool, WheelZoomTool, PanTool, TapTool
from bokeh.models import GraphRenderer, StaticLayoutProvider, Oval
from bokeh.palettes import Spectral4
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.layouts import row

In [8]:
X_SPACING = 1000
def x(sub):
    """Horizontal plot coordinate of `sub`: its ``score_sub`` value scaled by X_SPACING."""
    horizontal = X_SPACING * score_sub(sub)
    return horizontal
    
# Next vertical slot per lane. These are kept in a private dict rather than
# in globals named `left` / `right`, because those names already hold the
# user -> subs data loaded at the top of the notebook and must not be
# overwritten by plain integers.
_lane_next_slot = {'left': 0, 'center': 0, 'right': 0}
# Distance between successive slots on the same side of the axis.
_LANE_STEP = {'left': 2, 'center': 1, 'right': 2}

def y(sub):
    """Return the next free vertical slot for `sub` in its lane.

    The lane is 'left' when `sub` is in ``left_sources_set``, 'right' when it
    is in ``right_sources_set``, otherwise 'center'. Successive calls for the
    same lane alternate around zero: 0, -step, +step, -2*step, +2*step, ...
    with step 2 for the left/right lanes and 1 for the center lane.

    The function is stateful: each call advances its lane, so results depend
    on call order.
    """
    if sub in left_sources_set:
        lane = 'left'
    elif sub in right_sources_set:
        lane = 'right'
    else:
        lane = 'center'

    result = _lane_next_slot[lane]
    upcoming = result
    # Move one step outward only when leaving the non-negative side ...
    if upcoming >= 0:
        upcoming += _LANE_STEP[lane]
    # ... and flip sides on every call.
    _lane_next_slot[lane] = -upcoming
    return result

# Draw the political-source network with x = partisan score and y = lane slot.
FIGURE_SIZE = 1500
plot = figure(x_range=(-FIGURE_SIZE, FIGURE_SIZE), y_range=(-FIGURE_SIZE, FIGURE_SIZE),
              tools='')

# from_networkx needs a layout function; the spring layout computed here is
# replaced by the static layout assigned further down.
graph = from_networkx(G, nx.spring_layout, scale=2, center=(0,0))

### start of layout code

hover = HoverTool(tooltips=[("sub name", "@name"), ("score", "@score")])
hover.show_arrow = False
plot.add_tools(hover, BoxZoomTool(), PanTool(), WheelZoomTool(), TapTool())

node_source = graph.node_renderer.data_source
edge_source = graph.edge_renderer.data_source

# Columns are registered with ColumnDataSource.add() instead of being written
# straight into `.data`.
# NOTE(review): direct `.data[...]` assignment produced E-1001
# (BAD_COLUMN_NAME) validation errors for these columns in the recorded run;
# confirm add() clears them on the installed Bokeh version.
node_names = list(G.nodes)
node_source.add(node_names, 'name')
node_source.add([score_sub(s) for s in node_names], 'score')

# y() is stateful, so it is called exactly once per node, in node order.
graph_layout = {node: (x(node), y(node) * MAX_SIZE) for node in G.nodes}
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

edge_source.add([G.get_edge_data(a, b)['weight'] for a, b in G.edges()], 'line_width')
node_source.add([colors[n] for n in node_names], 'node_color')
node_source.add([node_sizes[n] for n in node_names], 'node_size')
graph.node_renderer.glyph = Circle(size='node_size', fill_color={'field': 'node_color'})

# The edge glyph is replaced outright, so there is no need to set line_width
# on the default glyph first.
graph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width={'field': 'line_width'})
graph.edge_renderer.selection_glyph = MultiLine(line_color='#000000', line_width={'field': 'line_width'})
graph.selection_policy = NodesAndLinkedEdges()

plot.renderers.append(graph)
reset_output()
output_notebook()
show(plot)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_width [renderer: GlyphRenderer(id='4fa7f250-fae4-4d35-849f-5d33aa925375', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: node_color, node_size [renderer: GlyphRenderer(id='a5bf9b40-3169-4808-8523-d5cfd2a9d60c', ...)]


In [9]:
def make_edges(edges, large_subs, large_sub_users, sorted_subs):
    """Accumulate shared-member counts between every pair of subs into `edges`.

    Parameters
    ----------
    edges : dict mapping ``(sub_a, sub_b)`` to a count; updated in place.
        Existing counts are added to, not replaced.
    large_subs : mapping of sub name to the collection of its members.
    large_sub_users : iterable of users to count. Only these users contribute,
        and a user listed twice is counted twice.
    sorted_subs : sequence of sub names. Pairs are formed as
        ``(sorted_subs[i], sorted_subs[j])`` with ``i < j``.

    Names in `sorted_subs` that are missing from `large_subs` are skipped on
    both sides of a pair (previously only the first side was guarded and a
    missing second side raised KeyError). Pairs with no shared user are not
    added. Keys are inserted in pair order, independent of the iteration
    order of `large_sub_users`.
    """
    # Materialise once so a one-shot iterable works for every sub.
    users = list(large_sub_users)
    present = [s for s in sorted_subs if s in large_subs]
    # Users (from the counted population) belonging to each sub.
    members = {s: [u for u in users if u in large_subs[s]] for s in present}

    for i, s in enumerate(present):
        for s2 in present[i + 1:]:
            shared = sum(1 for u in members[s] if u in large_subs[s2])
            if shared:
                edges[(s, s2)] = edges.get((s, s2), 0) + shared

# Overlap counts between every pair of large subs (not just political sources).
all_sub_edges = {}
make_edges(all_sub_edges, large_subs, large_sub_users, sorted_subs=sorted(large_subs))

In [10]:
MAX_SIZE = 50
MAX_WEIGHT = 30

# Rescale raw overlap counts so the heaviest edge gets weight MAX_WEIGHT.
max_weight = max(all_sub_edges.values())
G = nx.Graph()
for (sub_a, sub_b), shared in all_sub_edges.items():
    G.add_edge(sub_a, sub_b, weight=shared / max_weight * MAX_WEIGHT)

# Uniform white nodes; size is proportional to membership, largest = MAX_SIZE.
colors = dict.fromkeys(G.nodes, 'white')
max_size = max(map(len, (large_subs[sub] for sub in G.nodes)))
node_sizes = dict((sub, len(large_subs[sub]) / max_size * MAX_SIZE) for sub in G.nodes)

In [11]:
def plot_plain_network(G):
    """Build a Bokeh figure showing `G` with a spring layout.

    Edge widths come from each edge's 'weight' attribute. Node fill colour and
    size are read from the module-level ``colors`` and ``node_sizes`` dicts,
    which must contain an entry for every node of `G` at call time.

    Side effects: calls ``reset_output()`` and ``output_notebook()`` before
    returning.

    Returns the figure; the caller is responsible for ``show()``.
    """
    FIGURE_SIZE = 2
    plot = figure(x_range=(-FIGURE_SIZE, FIGURE_SIZE), y_range=(-FIGURE_SIZE, FIGURE_SIZE),
                  tools='')

    graph = from_networkx(G, nx.spring_layout, scale=2, center=(0,0))

    ### start of layout code

    hover = HoverTool(tooltips=[("sub name", "@name")])
    hover.show_arrow = False
    plot.add_tools(hover, BoxZoomTool(), PanTool(), WheelZoomTool(), TapTool())

    node_source = graph.node_renderer.data_source
    edge_source = graph.edge_renderer.data_source

    # Columns are registered with ColumnDataSource.add() instead of being
    # written straight into `.data`.
    # NOTE(review): direct `.data[...]` assignment produced E-1001
    # (BAD_COLUMN_NAME) validation errors for these columns in the recorded
    # run; confirm add() clears them on the installed Bokeh version.
    node_names = list(G.nodes)
    node_source.add(node_names, 'name')
    edge_source.add([G.get_edge_data(a, b)['weight'] for a, b in G.edges()], 'line_width')
    node_source.add([colors[n] for n in node_names], 'node_color')
    node_source.add([node_sizes[n] for n in node_names], 'node_size')
    graph.node_renderer.glyph = Circle(size='node_size', fill_color={'field': 'node_color'})

    # The edge glyph is replaced outright, so there is no need to set
    # line_width on the default glyph first.
    graph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width={'field': 'line_width'})
    graph.edge_renderer.selection_glyph = MultiLine(line_color='#000000', line_width={'field': 'line_width'})
    graph.selection_policy = NodesAndLinkedEdges()

    plot.renderers.append(graph)
    reset_output()
    output_notebook()
    return plot

# Render the network for the graph currently bound to G; the function reads
# the module-level `colors` and `node_sizes` set alongside it.
original_plot = plot_plain_network(G)
show(original_plot)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_width [renderer: GlyphRenderer(id='a5cb9100-6467-4984-82ee-01e36a921353', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: node_color, node_size [renderer: GlyphRenderer(id='ab9a37e9-58ab-4214-a8a5-f147021af6fd', ...)]


In [12]:
# Load the held-out sample. Files are opened as UTF-8 explicitly so parsing
# does not depend on the platform's default text encoding.
# `test` is indexed by user and iterated per user below, i.e. it is used as a
# user -> subs mapping despite the file name.
with open('reddit data/test_subs.json', encoding='utf-8') as f:
    test = json.load(f)
with open('reddit data/test_users.json', encoding='utf-8') as f:
    test_users = json.load(f)

In [13]:
# Invert the test user -> subs mapping into sub -> {users}.
# The set of known subs is gathered from the users listed in test_users,
# while membership is filled from every key of `test`.
# NOTE(review): a user present in `test` but absent from test_users whose
# subs are otherwise unseen would raise KeyError here -- confirm both cover
# the same users.
test_subs = set()
for user in test_users:
    test_subs.update(test[user])
test_sub_users = {sub: set() for sub in test_subs}
for user, subs_of_user in test.items():
    for sub in subs_of_user:
        test_sub_users[sub].add(user)
print(len(test_sub_users))
# Same 1500-member threshold as used for the left/right sample.
test_large_subs = {sub: members for sub, members in test_sub_users.items() if len(members) >= 1500}
print(len(test_large_subs))

16245
45


In [14]:
# Overlap counts between every pair of large subs in the test sample.
test_sub_edges = {}
make_edges(test_sub_edges, test_large_subs, test_users, sorted(test_large_subs))

In [15]:
MAX_SIZE = 50
MAX_WEIGHT = 30

# Rescale raw overlap counts so the heaviest edge gets weight MAX_WEIGHT.
max_weight = max(test_sub_edges.values())
G = nx.Graph()
for (sub_a, sub_b), shared in test_sub_edges.items():
    G.add_edge(sub_a, sub_b, weight=shared / max_weight * MAX_WEIGHT)

# Uniform white nodes; size is proportional to membership, largest = MAX_SIZE.
colors = dict.fromkeys(G.nodes, 'white')
max_size = max(map(len, (test_large_subs[sub] for sub in G.nodes)))
node_sizes = dict((sub, len(test_large_subs[sub]) / max_size * MAX_SIZE) for sub in G.nodes)

In [16]:
# Render the network for the test-sample graph now bound to G, using the
# `colors` and `node_sizes` rebuilt for it in the previous cell.
new_plot = plot_plain_network(G)
show(new_plot)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_width [renderer: GlyphRenderer(id='53fb5f21-6766-4bd4-85fa-172b06602cfb', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: node_color, node_size [renderer: GlyphRenderer(id='c0e61a28-9486-4e1a-947c-8223fb9cdacf', ...)]
