# Exploring Reddit Data

## TODO

Identify subreddits with known inauthentic activity, assign unique color on map 

In [1]:
import adjustText
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.sparse as ss
from os.path import isfile
from hdbscan import HDBSCAN
from sklearn.utils import check_array
from sklearn.base import BaseEstimator
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD

import bokeh
from bokeh.palettes import plasma
from bokeh.models.mappers import LinearColorMapper
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource, value, CustomJS, DataRange1d
from collections import OrderedDict
from matplotlib.lines import Line2D
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.gridspec import GridSpec
from IPython.display import clear_output

sns.set_context('poster')
sns.set_style('white')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Subreddit Mapping

In [2]:
raw_data = pd.read_csv("data/subreddit_overlaps_BQ.csv")

In [3]:
raw_data

Unnamed: 0,t1_subreddit,t2_subreddit,NumOverlaps
0,RimWorld,archeage,84
1,unitedkingdom,pcmasterrace,3282
2,Trumpgret,dankchristianmemes,322
3,humor,childfree,54
4,customhearthstone,Tinder,411
...,...,...,...
46304397,Marijuana,justicedemocrats,18
46304398,MaliciousCompliance,dankruto,18
46304399,C25K,ProjectFi,18
46304400,Embroidery,casualChildAbuse,18


In [4]:
raw_data.describe()

Unnamed: 0,NumOverlaps
count,46304400.0
mean,12.53839
std,102.3249
min,1.0
25%,1.0
50%,2.0
75%,6.0
max,41650.0


In [5]:
print("Number of pairwise commenter overlaps: {}".format(len(raw_data)))
print("t1_subreddit unique subreddits: {}".format(len(raw_data["t1_subreddit"].unique())))
print("t2_subreddit unique subreddits: {}".format(len(raw_data["t2_subreddit"].unique())))

Number of pairwise commenter overlaps: 46304402
t1_subreddit unique subreddits: 2029
t2_subreddit unique subreddits: 131271


Rank the subreddits so that they are indexed in order of popularity. Popularity is defined by the total number of unique commenters in each subreddit. 

In [6]:
subreddit_popularity = raw_data.groupby('t2_subreddit')['NumOverlaps'].sum()
subreddits = np.array(subreddit_popularity.sort_values(ascending=False).index)

In [7]:
subreddits.tolist()[0:5]

['AskReddit', 'pics', 'funny', 'todayilearned', 'gaming']

Pivot the data into a matrix such that rows and columns are both indexed by subreddits, and the entry at position (i,j) is the number of overlaps bwteen the ith and jth subreddits

Create subreddit-to-integer-index map to convert the subreddit names in the table into numeric row and column indexes.

In [8]:
index_map = dict(np.vstack([subreddits, np.arange(subreddits.shape[0])]).T)

In [9]:
values = raw_data.NumOverlaps
row_indices = raw_data.t2_subreddit.map(index_map)
col_indices = raw_data.t1_subreddit.map(index_map)

Create a sparse matrix. This format requires us to specify triples of row, column, and value for each non-zero entry in the matrix. The COO matrix constructor accepts this as a triple of arrays: the first array is the values, the second and third are arrays of row and column indices.

In [10]:
count_matrix = ss.coo_matrix((values, (row_indices,col_indices)),
                              shape=(subreddits.shape[0], subreddits.shape[0]),
                              dtype=np.float64)

In [11]:
count_matrix.shape

(131271, 131271)

In [12]:
conditional_prob_matrix = count_matrix.tocsr()
conditional_prob_matrix = normalize(conditional_prob_matrix, norm='l1', copy=False)

## Converting subreddit vectors into a map

### Linear dimensionality reduction down to 500 dimensions

In [13]:
reduced_vectors = TruncatedSVD(n_components=500, random_state=1).fit_transform(conditional_prob_matrix)
reduced_vectors = normalize(reduced_vectors, norm='l2', copy=False)

In [14]:
reduced_vectors.shape

(131271, 500)

### Nonlinear dimensionality reduction down to 2 dimensions

In [15]:
class LargeVis(BaseEstimator):
    
    def __init__(self, n_components=2, perplexity=30.0, gamma=5,
                 layout_samples=None, n_neighbors=None, negative_samples=5,
                 alpha=1.0, n_cores=4, knn_prop=3, trees=50):
        self.n_components = n_components
        self.perplexity = perplexity
        self.layout_samples = layout_samples
        self.alpha = alpha
        self.n_cores = n_cores
        self.knn_prop = knn_prop
        self.negative_samples = negative_samples
        self.n_neighbors = n_neighbors
        self.gamma = gamma
        self.trees = trees
        if self.n_neighbors is None:
            self.n_neighbors = int(self.perplexity * 3)


    def fit_transform(self, X, y=None):
        
        if self.layout_samples is None:
            layout_samples = X.shape[0] / 100.0
        else:
            layout_samples = self.layout_samples
            
        X = check_array(X, dtype=np.float64)
        np.savetxt('/tmp/largevis_input', 
                   X, header='{} {}'.format(*X.shape), 
                   comments='')
        subprocess.check_call(['/Users/cameronlaedtke/LargeVis-python3/Linux/LargeVis',
                               '-input',  '/tmp/largevis_input',
                               '-output', '/tmp/largevis_output',
                               '-outdim',  str(self.n_components),
                               '-perp',    str(self.perplexity),
                               '-samples', str(layout_samples),
                               '-gamma',   str(self.gamma),
                               '-prop',    str(self.knn_prop),
                               '-trees',   str(self.trees),
                               '-neigh',   str(self.n_neighbors),
                               '-alpha',   str(self.alpha),
                               '-neg',     str(self.negative_samples),
                               '-threads', str(self.n_cores)])
        self.embedding_ = np.loadtxt('/tmp/largevis_output', skiprows=1)
        return self.embedding_
    
    
    def fit(self, X, y=None):
        self.fit_transform(X)
        return self

In [29]:
# embed_file = 'tsne_data/largevis_subreddit_map_low_perplexity.npy'
# plot_file = 'viz/subreddit_interactive_map_low_perplexity.html'

embed_file = 'tsne_data/largevis_subreddit_map.npy'
plot_file = 'viz/subreddit_interactive_map.html'

if isfile(embed_file):
    subreddit_map = np.load(embed_file)
else:
    largevis = LargeVis(perplexity=20, n_cores=12)
    subreddit_map = largevis.fit_transform(reduced_vectors[:10000])
    np.save(embed_file, subreddit_map)

In [30]:
subreddit_map_df = pd.DataFrame(subreddit_map, columns=('x', 'y'))
subreddit_map_df['subreddit'] = subreddits[:10000]
subreddit_map_df.head()

Unnamed: 0,x,y,subreddit
0,-6.50877,-0.510874,AskReddit
1,-6.329569,-0.26777,pics
2,-6.479487,-0.431,funny
3,-6.256063,-0.035419,todayilearned
4,-5.290978,14.978483,gaming


### Clustering the map

In [31]:
clusterer = HDBSCAN(min_samples=5, min_cluster_size=20).fit(subreddit_map)
cluster_ids = clusterer.labels_

In [32]:
subreddit_map_df['cluster'] = cluster_ids
subreddit_map_df.head()

Unnamed: 0,x,y,subreddit,cluster
0,-6.50877,-0.510874,AskReddit,-1
1,-6.329569,-0.26777,pics,136
2,-6.479487,-0.431,funny,136
3,-6.256063,-0.035419,todayilearned,136
4,-5.290978,14.978483,gaming,36


In [33]:
n_cluster_points = len(subreddit_map_df[subreddit_map_df.cluster != -1])
n_clusters = subreddit_map_df["cluster"].max()
score = silhouette_score(subreddit_map, cluster_ids)
print("points assigned to a cluster: {}".format(n_cluster_points))
print("number of clusters: {}".format(n_clusters))
print("silhouette score: {:.4f}".format(score))

points assigned to a cluster: 8493
number of clusters: 145
silhouette score: 0.3514


### Visualization

In [34]:
def big_palette(size, palette_func):
    if size < 256:
        return palette_func(size)
    p = palette_func(256)
    out = []
    for i in range(size):
        idx = int(i * 256.0 / size)
        out.append(p[idx])
    return out

In [27]:
# output_notebook()

In [28]:
# Construct a color palette and map clusters to colors
if cluster_ids.max() > 255:
    palette = ['#777777'] + big_palette(size=cluster_ids.max(), palette_func=plasma)
else:
    palette = ['#777777'] + list(plasma(cluster_ids.max()))

colormap = LinearColorMapper(palette=palette, low=-1, high=cluster_ids.max())
color_dict = {'field': 'cluster', 'transform': colormap}

# Set fill alpha globally
subreddit_map_df['fill_alpha'] = np.exp((subreddit_map.min() - subreddit_map.max()) / 5.0) + 0.05

# Build a column data source
plot_data = ColumnDataSource(data=subreddit_map_df)

# Create the figure and add tools
fig = figure(
    title='A Map of Subreddits',
    plot_width = 1150,
    plot_height = 1150,
    tools= ('pan, wheel_zoom, box_zoom,''box_select, reset'),
    active_scroll=u'wheel_zoom',
)

fig.add_tools(HoverTool(tooltips = OrderedDict([('subreddit', '@subreddit'), 
                                                ('cluster', '@cluster')])))

# draw the subreddits as circles on the plot
fig.circle(
    u'x', u'y', 
    source = plot_data,
    fill_color = color_dict, 
    line_color = None, 
    fill_alpha = 'fill_alpha',
    size = 10, 
    hover_line_color = u'black'
)

# Custom callback for alpha adjustment
jscode="""
    var data = source.data;
    var start = cb_obj.start;
    var end = cb_obj.end;
    var alpha = data['fill_alpha'];
    var i = 0;
    for (i = 0; i < alpha.length; i++) {
         alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
    }
    source.change.emit();
"""

callback = CustomJS(args=dict(source=plot_data), code=jscode)

fig.x_range.js_on_change("start", callback)
fig.x_range.js_on_change("end", callback)

# configure visual elements of the plot
fig.title.text_font_size = value('18pt')
fig.title.align = 'center'
fig.xaxis.visible = False
fig.yaxis.visible = False
fig.grid.grid_line_color = None
fig.outline_line_color = '#222222'

# display the figure
output_file(plot_file)
show(fig);

### Exploring clusters

In [None]:
def cluster_bounds(dataframe, subreddit):
    # Find the cluster the subreddit belongs to
    cluster = dataframe.cluster[dataframe.subreddit == subreddit].values[0]
    if cluster == -1:
        print('This subreddit was lost as noise and not in any cluster')
        
    # Extract the dubset of the dataframe that is the cluster
    sub_dataframe = dataframe[dataframe.cluster == cluster]
    
    x_min = sub_dataframe.x.min()
    x_max = sub_dataframe.x.max()
    x_padding = (x_max - x_min) * 0.5
    x_min -= x_padding
    x_max += x_padding
    
    y_min = sub_dataframe.y.min()
    y_max = sub_dataframe.y.max()
    y_padding = (y_max - y_min) * 0.5
    y_min -= y_padding
    y_max += y_padding

    return x_min, x_max, y_min, y_max


def data_in_bounds(dataframe, bounds):
    return dataframe[
        (dataframe.x > bounds[0]) &
        (dataframe.x < bounds[1]) &
        (dataframe.y > bounds[2]) &
        (dataframe.y < bounds[3])
    ]


def plot_cluster(dataframe, subreddit, n_labels=50, fontsize=9, dpi=100):
    # Build a color map to match the Bokeh plot
    colormap = dict(zip(
        np.unique(dataframe.cluster),
        ['#777777'] + sns.color_palette('plasma', dataframe.cluster.max() + 1).as_hex()
    ))
    subregion_defined = True
    
    # Figure and gridspec to layout axes
    fig = plt.figure(figsize=(16,10), dpi=dpi)
    gs = GridSpec(3, 3)
    
    # First axes, spanning most of the figure
    # Contains just the points in a region 
    # around the points in the cluster
    ax1 = plt.subplot(gs[:,:2])
    try:
        bounds = cluster_bounds(dataframe, subreddit)
    except IndexError:
        ax1.text(0.5, 0.5, 'Subreddit {} not found!'.format(subreddit), 
                 horizontalalignment='center', verticalalignment='center',
                 transform=ax1.transAxes, fontsize=18)
        subregion_defined = False
    
    if subregion_defined:
        to_plot = data_in_bounds(dataframe, bounds)
        ax1.scatter(to_plot.x, to_plot.y, c=to_plot.cluster.map(colormap), s=30, alpha=0.5)
    
        # We want to add text labels. We subsample up to 50 labels
        # And then use adjustText to get them non-overlapping
        text_elements = []
        for row in to_plot.sample(n=min(len(to_plot), n_labels), random_state=0).values:
            if row[2] != subreddit:
                text_elements.append(ax1.text(row[0], row[1], row[2], alpha=0.5, fontsize=fontsize))
        row = to_plot[to_plot.subreddit == subreddit].values[0]
        text_elements.append(ax1.text(row[0], row[1], row[2], 
                                      color='g',
                                      alpha=0.5, fontsize=11))
        adjustText.adjust_text(text_elements, ax=ax1, lim=100,
                               force_text=0.1, force_points=0.1,
                               arrowprops=dict(arrowstyle="-", color='k', lw=0.5))
    
    ax1.xaxis.set_ticklabels([])
    ax1.yaxis.set_ticklabels([])

    # Second axes, center right of the figure
    # Plots all the data and a rectangle
    # Showing the area selected out
    ax2 = plt.subplot(gs[1,2])
    ax2.scatter(dataframe.x, dataframe.y, s=20,
                c=dataframe.cluster.map(colormap), alpha=0.05)
    
    if subregion_defined:
        ax2.add_patch(Rectangle(xy=(bounds[0], bounds[2]),
                                    width=(bounds[1] - bounds[0]),
                                    height=(bounds[3] - bounds[2]),
                                    edgecolor='k', facecolor='none', lw=1))
    ax2.xaxis.set_ticklabels([])
    ax2.yaxis.set_ticklabels([])
    plt.tight_layout()

    if subregion_defined:
        # Now we make use of the power of matplotlib transforms
        # to draw line from the subselected rectangle in axes2
        # all the way to the bounds of axes1
        trans_figure = fig.transFigure.inverted()

        ax1_coord = trans_figure.transform(ax1.transAxes.transform((1,0)))
        ax2_coord = trans_figure.transform(ax2.transData.transform((bounds[1],bounds[2])))
        connector1 = Line2D((ax1_coord[0],ax2_coord[0]),(ax1_coord[1],ax2_coord[1]),
                              transform=fig.transFigure, lw=1, color='k')
        ax1_coord = trans_figure.transform(ax1.transAxes.transform((1,1)))
        ax2_coord = trans_figure.transform(ax2.transData.transform((bounds[1],bounds[3])))
        connector2 = Line2D((ax1_coord[0],ax2_coord[0]),(ax1_coord[1],ax2_coord[1]),
                              transform=fig.transFigure, lw=1, color='k')

        fig.lines = [connector1, connector2]

In [None]:
plot_cluster(subreddit_map_df, 'StopVoterSuppression')

In [None]:
plot_cluster(subreddit_map_df, 'marxism_101', n_labels=100, fontsize=7, dpi=200)

In [None]:
plot_cluster(subreddit_map_df, 'scottishpolitics', n_labels=100, fontsize=7, dpi=200)

In [None]:
plot_cluster(subreddit_map_df, 'USHistory')

In [None]:
plot_cluster(subreddit_map_df, 'Menaregood', n_labels=100, fontsize=7, dpi=200)

In [None]:
plot_cluster(subreddit_map_df, 'tuesday', n_labels=150, fontsize=7, dpi=200)

In [None]:
# plot_cluster(subreddit_map_df, 'Le_Pen')

In [None]:
plot_cluster(subreddit_map_df, 'bidenbro')

In [None]:
plot_cluster(subreddit_map_df, 'exmuslim')

In [None]:
plot_cluster(subreddit_map_df, 'MuslimLounge')

In [None]:
plot_cluster(subreddit_map_df, 'geopolitics')

In [None]:
plot_cluster(subreddit_map_df, 'france', n_labels=200, fontsize=7)

In [None]:
plot_cluster(subreddit_map_df, 'China')

In [None]:
plot_cluster(subreddit_map_df, 'taiwan')

In [None]:
# plot_cluster(subreddit_map_df, 'cybersecurity')

In [None]:
plot_cluster(subreddit_map_df, 'ipad')

In [None]:
plot_cluster(subreddit_map_df, 'JoeRogan')

In [None]:
plot_cluster(subreddit_map_df, 'minnesota')

In [None]:
# plot_cluster(subreddit_map_df, 'vexillology')

In [None]:
plot_cluster(subreddit_map_df, 'environment')

In [None]:
plot_cluster(subreddit_map_df, 'elonmusk')

In [None]:
plot_cluster(subreddit_map_df, 'trees')

In [None]:
from ipywidgets import interact_manual, fixed, Text

In [None]:
interact_manual(plot_cluster, 
                dataframe=fixed(subreddit_map_df), 
                subreddit=Text())

In [None]:
coherent_clusters = np.argsort(clusterer.cluster_persistence_)[-10:][::-1]
coherence = np.sort(clusterer.cluster_persistence_)[-10:][::-1]

plt.figure(figsize=(15,5))

plt.bar(np.arange(10), coherence)
plt.gca().set_xticks(np.arange(10))
plt.gca().set_xticklabels(coherent_clusters)
plt.xlabel("Cluster")
plt.ylabel("Coherence")
plt.show()

In [None]:
def plot_cluster_by_id(dataframe, cluster_id):
    subreddits_in_cluster = np.array(dataframe.subreddit[cluster_ids == cluster_id])
    plot_cluster(dataframe, subreddits_in_cluster[0])
    plt.gcf().text(0.5, 0.98, 'Cluster {}'.format(cluster_id), ha='center')

In [None]:
plot_cluster_by_id(subreddit_map_df, 0)

In [None]:
plot_cluster_by_id(subreddit_map_df, 10)

In [None]:
plot_cluster_by_id(subreddit_map_df, 4)

In [None]:
plot_cluster_by_id(subreddit_map_df, 8)

In [None]:
plot_cluster_by_id(subreddit_map_df, 15)

In [None]:
plot_cluster_by_id(subreddit_map_df, 5)

In [None]:
plot_cluster_by_id(subreddit_map_df, 12)

In [None]:
plot_cluster_by_id(subreddit_map_df, 14)

In [None]:
plot_cluster_by_id(subreddit_map_df, 3)

In [None]:
plot_cluster_by_id(subreddit_map_df, 37)