In [1]:
import glob
import json
import os.path as op
import os
from collections import defaultdict

import implicit
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

import matplotlib.pyplot as plt
%matplotlib inline

os.environ['OPENBLAS_NUM_THREADS'] = '1'

## Load dataset with top subreddits by subs count

In [9]:
! wget https://www.dropbox.com/s/hie780betvmuu79/reddit.csv

In [3]:
top_subreddits_df = pd.read_csv('reddit.csv')
subreddit2log_size = {
    subreddit: np.log(size)
    for subreddit, size in top_subreddits_df[['real_name', 'subs']].values
}

## Load users' comments dataset 

In [202]:
! wget https://www.dropbox.com/s/rlc61tbs4yo56uv/subreddit_users.json

In [195]:
with open('subreddit_users.json') as f:
    subreddit_users = json.load(f)

id2user = list({user for users in subreddit_users.values() for user in users})
user2id = {user: i for i, user in enumerate(id2user)}

id2subreddit = list(subreddit_users)
subreddit2id = {subreddit: i for i, subreddit in enumerate(id2subreddit)}

## Calculate IDFs, TFs and mean users' comments count for BM25 metric

In [198]:
subreddit2df = defaultdict(int)

user2subreddit2tf = defaultdict(dict)
for sub, users in subreddit_users.items():
    for u, tf in users.items():
        subreddit2df[sub] += 1
        user2subreddit2tf[u][sub] = tf
        
subreddit2idf = {s: np.log((len(id2user) - df + 0.5) / (df + 0.5)) for s, df in subreddit2df.items()}

mean_user_length = np.mean(
    [sum(subreddit2tf.values()) for subreddit2tf in user2subreddit2tf.values()]
)

## Construct sparse subreddit-user matrix

In [152]:
xs = []
ys = []
data = []

K1 = 1.2
B = 0.5

def bm25(tf, idf, user_length):
    length_norm = 1 - B + B * user_length / mean_user_length
    return (
        idf * (tf * (K1 + 1)) / (tf + K1 * length_norm)
    )

for subreddit, users in subreddit_users.items():
    subreddit_id = subreddit2id[subreddit]
    for user, _ in users.items():
        user_id = user2id[user]
        tf = user2subreddit2tf[user][subreddit]
        idf = subreddit2idf[subreddit]

        user_subreddits_length = sum(user2subreddit2tf[u].values())
        xs.append(subreddit_id)
        ys.append(user_id)
        data.append(bm25(tf, idf, user_subreddits_length))

matrix = coo_matrix((data, (xs, ys)), shape=(len(id2subreddit), len(id2user))).tocsr()

## Factorize matrix, it may take a long time

In [153]:
model = implicit.als.AlternatingLeastSquares(
    factors=60, num_threads=7,
    iterations=20, calculate_training_loss=True,
    validate_proportion=0.05
)
model.fit(matrix, show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




## Dump subreddits vectors and subreddits list to files

In [181]:
np.save('subreddits.npy', model.item_factors)

with open('subreddits.json', 'w') as f:
    json.dump(id2subreddit, f)

## Similar subreddits recommender using cosine similarity

In [10]:
item_vecs = model.item_factors / np.linalg.norm(model.item_factors, axis=1, keepdims=True)

def find_nearest_subreddits(name):
    sub_id = subreddit2id[name]
    sims = item_vecs @ item_vecs[sub_id]
    for i in np.argsort(np.squeeze(sims))[-20:-1][::-1]:
        yield (id2subreddit[i], sims[i])

In [211]:
for sub, score in find_nearest_subreddits('datascience'):
    print(f'https://reddit.com/r/{sub} -> {score}')

https://reddit.com/r/learnmachinelearning -> 0.9642617702484131
https://reddit.com/r/OMSCS -> 0.959938108921051
https://reddit.com/r/dataengineering -> 0.9495332837104797
https://reddit.com/r/ExperiencedDevs -> 0.9486438632011414
https://reddit.com/r/MachineLearning -> 0.9478331208229065
https://reddit.com/r/algorithms -> 0.9413017630577087
https://reddit.com/r/LanguageTechnology -> 0.9407913684844971
https://reddit.com/r/SoftwareEngineering -> 0.9393213987350464
https://reddit.com/r/Rlanguage -> 0.937468945980072
https://reddit.com/r/cscareerquestions -> 0.9353570342063904
https://reddit.com/r/AskComputerScience -> 0.9329497218132019
https://reddit.com/r/statistics -> 0.9264026284217834
https://reddit.com/r/datasets -> 0.9260255694389343
https://reddit.com/r/csMajors -> 0.9235038757324219
https://reddit.com/r/computerscience -> 0.9200653433799744
https://reddit.com/r/compsci -> 0.9196723699569702
https://reddit.com/r/learnprogramming -> 0.9169032573699951
https://reddit.com/r/django -

## Project given subreddits vectors onto 2d surface using UMAP

In [11]:
import umap

embedding = umap.UMAP(
    n_neighbors=10, min_dist=0.001, metric='cosine', random_state=1
).fit_transform(model.item_factors)

## Interactive plotly visualization

In [12]:
from plotly.offline import plot

plot({
        'data': [{
            "x": embedding[:, 0],
            "y": embedding[:, 1],
            'type': 'scattergl',
            'marker': {
                'color': 'rgb(231, 231, 221)',
                'size': [subreddit2log_size[i] / 6 for i in id2subreddit],
                'line': {'width':0}
            },
            'text': [f'{s}' for s in id2subreddit],
            'opacity': 0.6,
            'hovertemplate': '<b>r/%{text}</b>',
            'mode': 'markers',
            'name': ''
        }],
        'layout': {
            'xaxis': {
                'showgrid': False,
                'zeroline': False,
                'showline': False,
                'ticks': '',
                'showticklabels': False
            },
            'yaxis': {
                'showgrid': False,
                'zeroline': False,
                'showline': False,
                'ticks': '',
                'showticklabels': False
            },
            'plot_bgcolor': 'rgb(24, 24, 34)',
            'hovermode': 'closest',
            'showlegend': False,
            'title': 'Subreddits space',
            'hoverlabel': {'bgcolor': '#FFF'},
        },
    },
    filename = 'subreddits_space.html',
    config={
        'responsive': True,
        'modeBarButtonsToRemove': [
            'pan2d', 'lasso2d','hoverCompareCartesian', 'resetViews',
            'toggleSpikelines', 'hoverClosestCartesian', 'select2d',
            'resetViewMapbox', 'resetScale2d', 'autoScale2d']
    },
    auto_open=True)

'subreddits_space.html'

## Matplotlib visualization for [r/dataisbeautiful](https://reddit.com/r/dataisbeautiful) and wallpapers

In [217]:
plt.figure(figsize=(70 * 16 / 9, 70))
ax = plt.axes()
ax.set_facecolor(np.array([24, 24, 34]) / 255)

plt.scatter(
    embedding[:, 0], embedding[:, 1],
    s=[subreddit2log_size[i] ** 1.2 / 6 for i in id2subreddit],
    color=np.array([231, 231, 221]) / 255,
    alpha=0.6
)