In [179]:
import implicit
import numpy as np
from tqdm import tqdm_notebook
import pandas as pd
import csv 
import scipy
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from implicit.nearest_neighbours import bm25_weight
from implicit import alternating_least_squares
import umap

In [180]:
# Each space-delimited row carries four fields: subreddit, user, comment count
# and a trailing field that is ignored. Rows are kept as [user, subreddit, comments].
with open('interactions_30_ch_no_bots') as csvfile:
    data = [
        [user, subreddit, int(comments)]
        for subreddit, user, comments, _ in csv.reader(csvfile, delimiter=' ')
    ]

In [181]:
# Turn the list of [user, subreddit, comments] records into a DataFrame
# (the columns are still positional here; they are named in a later cell).
data = pd.DataFrame.from_records(data)

In [182]:
# Label the columns in the order the loader appended them: user, subreddit, comment count.
data.columns = ['user', 'subreddit', 'comments']

In [183]:
# Categorical dtype gives every user and subreddit an integer code (`.cat.codes`),
# which later serves as the column / row index of the sparse matrix.
data = data.astype({'user': 'category', 'subreddit': 'category'})

In [184]:
# Sparse subreddit x user matrix of comment counts, built from
# (subreddit code, user code, comments) triples.
comment_counts = data['comments'].astype(float)
subreddit_codes = data['subreddit'].cat.codes
user_codes = data['user'].cat.codes
comments = coo_matrix((comment_counts, (subreddit_codes, user_codes)))

### Latent Semantic Analysis

In [185]:
# Set to False to recompute the ALS factors in the next cell instead of
# loading them from the saved .npy files.
read_als_factors_from_file = True

In [186]:
if not read_als_factors_from_file:
    # Recompute: ALS with 20 factors on the BM25-weighted comment matrix.
    subreddit_factors, user_factors = alternating_least_squares(bm25_weight(comments), 20)
else:
    # Load previously stored factors from the .npy files.
    subreddit_factors = np.load('subreddit_factors_als.npy')
    user_factors = np.load('user_factors_als.npy')

In [65]:
# NOTE(review): this repeats the recompute branch of the previous cell and runs
# unconditionally, so on a top-to-bottom run it replaces any factors that were
# loaded from file. Its execution count (65) is lower than the surrounding
# cells' -- it looks like a leftover; confirm whether it should be kept.
subreddit_factors, user_factors = alternating_least_squares(bm25_weight(comments), 20)

100%|██████████| 15.0/15 [00:56<00:00,  3.66s/it]


In [187]:
class TopRelated(object):
    """Cosine-similarity lookup over subreddit latent factors.

    Factor rows are L2-normalised once at construction so that a dot product
    between two rows is their cosine similarity.

    Parameters
    ----------
    subreddit_factors : 2-D array, one row of latent factors per subreddit.
    subreddits : optional 1-D array of subreddit names aligned with the rows of
        ``subreddit_factors``. When omitted, the names are taken from the
        module-level ``data['subreddit']`` categories (the original behaviour).
    """

    def __init__(self, subreddit_factors, subreddits=None):
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        # An all-zero factor row has zero norm; dividing by it would produce
        # NaN scores that end up among the "best" results. Divide by 1 instead
        # so such rows simply stay zero.
        norms[norms == 0] = 1.0
        self.factors = subreddit_factors / norms[:, np.newaxis]
        if subreddits is None:
            subreddits = data['subreddit'].cat.categories.array.to_numpy()
        self.subreddits = np.asarray(subreddits)

    def get_related(self, subreddit, N=10):
        """Return up to ``N`` ``(name, score)`` pairs, most similar first.

        The queried subreddit itself is part of the ranking. ``N`` is capped
        at the number of known subreddits; a non-positive ``N`` yields ``[]``.

        Raises
        ------
        ValueError
            If ``subreddit`` is not among the known subreddit names.
        """
        matches = np.flatnonzero(self.subreddits == subreddit)
        if matches.size == 0:
            raise ValueError("Subreddit doesn't exist in the dataset.")
        subredditid = matches[0]
        scores = self.factors.dot(self.factors[subredditid])
        # argpartition rejects a kth outside the array, so cap N first.
        N = min(N, len(scores))
        if N <= 0:
            return []
        # argpartition only isolates the top N; the final sort orders them.
        best = np.argpartition(scores, -N)[-N:]
        best_ = [self.subreddits[i] for i in best]
        return sorted(zip(best_, scores[best]), key=lambda x: -x[1])

In [188]:
# Build the similarity lookup over the ALS subreddit factors.
top_related = TopRelated(subreddit_factors)

In [189]:
# Ten most similar subreddits to 'OnePiece' (N defaults to 10); the query itself ranks first.
top_related.get_related('OnePiece')

[('OnePiece', 0.99999994),
 ('Naruto', 0.99061096),
 ('bleach', 0.98533773),
 ('OnePunchMan', 0.98216236),
 ('gamingadvice', 0.9787289),
 ('KillLaKill', 0.9785078),
 ('animebazaar', 0.97804713),
 ('StardustCrusaders', 0.9744566),
 ('snowleopards', 0.97381645),
 ('RolledTheDice', 0.97334623)]

In [190]:
# (number of subreddits, number of latent factors)
subreddit_factors.shape

(41143, 20)

In [191]:
# Project the subreddit factor vectors to 2-D with UMAP's default settings.
reducer = umap.UMAP()
subreddits_embedded = reducer.fit_transform(subreddit_factors)
subreddits_embedded.shape

(41143, 2)

In [192]:
# Inspect the embedded coordinates (one row per subreddit).
subreddits_embedded

array([[ 3.6010728,  0.0774124],
       [-3.897131 , -4.700751 ],
       [-1.2897592, -2.1058679],
       ...,
       [-3.5628572, -6.5976143],
       [ 9.227859 ,  1.489325 ],
       [-1.9831607, -2.7639296]], dtype=float32)

In [193]:
# Subreddit names indexed by category code, i.e. by row of the `comments` matrix.
subreddits = data['subreddit'].cat.categories.array.to_numpy()

In [194]:
import random

# Plot only a random subset so the scatter stays readable. The size is capped at
# the number of subreddits because random.sample raises ValueError when asked
# for more items than the population holds.
# NOTE(review): no seed is set, so the sample differs between runs.
n_sampled = min(1000, len(subreddits))
indices = random.sample(range(len(subreddits)), n_sampled)

In [195]:
# Names and 2-D coordinates of the sampled subset, selected with the same index list.
# NOTE(review): assumes the rows of `subreddits_embedded` follow the same order as
# `subreddits` -- confirm for factors loaded from file.
sampled_subreddits = subreddits[indices]
sampled_subreddits_embedded = subreddits_embedded[indices]

In [196]:
import os

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

# Credentials are read from the environment so that no API key is stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this
# cell; a missing variable fails fast with a KeyError.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

# x / y coordinates of the sampled subreddits in the 2-D embedding
xs = sampled_subreddits_embedded[:, 0]
ys = sampled_subreddits_embedded[:, 1]

# One marker per sampled subreddit, labelled with its name
trace = go.Scatter(
    x = xs,
    y = ys,
    mode='markers+text',
    text=sampled_subreddits
)

data_ = [trace]

# Plot and embed in ipython notebook!
py.iplot(data_, filename='basic-scatter')

# or plot with: plot_url = py.plot(data_, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~abkds/0 or inside your plot.ly account where it is named 'basic-scatter'


### Bayesian Personalized Ranking

In [198]:
from implicit.bpr import BayesianPersonalizedRanking

# Keyword arguments forwarded to the BayesianPersonalizedRanking constructor.
params = dict(factors=63)

In [215]:
# Peek at the first ten (subreddit code, user code, comment count) triples
# stored in the COO matrix.
for i, j, v in zip(comments.row[:10], comments.col[:10], comments.data[:10]):
    print( "(%d, %d), %s" % (i,j,v) )

(27148, 1113188), 86.0
(27148, 824475), 30.0
(27148, 747152), 9.0
(27148, 726613), 1.0
(27148, 2314515), 84.0
(27148, 1640350), 52.0
(27148, 1206063), 17.0
(27148, 1491311), 71.0
(27148, 1003607), 165.0
(27148, 2089596), 15.0


In [124]:
import logging
import tqdm
import time
import codecs

In [328]:
# Instantiate the BPR model with the keyword arguments held in `params`.
model = BayesianPersonalizedRanking(**params)

In [350]:
# `output_filename` is the file the per-user recommendations are written to.
# NOTE(review): `model_name` is not referenced anywhere else in the visible notebook.
model_name = 'bpr'
output_filename = 'subreddits_recs_bpr'

In [332]:
# Train on the subreddit x user comment-count matrix.
model.fit(comments)

100%|██████████| 100/100 [03:37<00:00,  2.21s/it, correct=95.94%, skipped=7.16%]


In [338]:
def bpr_related_subreddits(subreddit):
    """Return the BPR model's similar items for ``subreddit`` as ``(name, score)`` pairs.

    The name is resolved to its index in the module-level ``subreddits`` array,
    and the indices returned by ``model.similar_items`` are mapped back to names.

    Raises
    ------
    ValueError
        If ``subreddit`` does not occur in ``subreddits``.
    """
    matches = np.flatnonzero(subreddits == subreddit)
    if matches.size == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    similar = model.similar_items(matches[0])
    return [(subreddits[item_id], score) for item_id, score in similar]

In [349]:
# Subreddits the BPR model considers most similar to 'dogs'.
bpr_related_subreddits('dogs')

[('dogs', 2.9264455),
 ('Dogtraining', 2.752574),
 ('puppy101', 2.6112063),
 ('Pets', 2.550935),
 ('AskVet', 2.4905205),
 ('WiggleButts', 2.447608),
 ('DoggyDNA', 2.421627),
 ('rescuedogs', 2.4086597),
 ('mutt', 2.3754249),
 ('Veterinary', 2.3596926)]

In [410]:
# User names indexed by category code, i.e. by column of the `comments` matrix.
users = data['user'].cat.categories.array.to_numpy()

In [412]:
# Write the model's recommendations for every user to `output_filename`,
# one tab-separated line per (username, recommended subreddit, score).
# `model.recommend` is given the user x subreddit view of the data, in CSR form.
user_comments = comments.T.tocsr()

with tqdm.tqdm_notebook(total=len(users)) as progress, \
        codecs.open(output_filename, "w", "utf8") as out_file:
    for userid, username in enumerate(users):
        recommendations = model.recommend(userid, user_comments)
        for subredditid, score in recommendations:
            line = "%s\t%s\t%s\n" % (username, subreddits[subredditid], score)
            out_file.write(line)
        progress.update(1)

HBox(children=(IntProgress(value=0, max=2323019), HTML(value='')))

### Sample user recommendations

We went through the list of subreddits in which the user 'xkcd_transcriber' has commented. Looking at the kinds of subreddits this user interacts with, the predictions look good. This is just one sample; we are saving the recommendations for all users to a file and will also write an AUC scoring function to evaluate the generated recommendations quantitatively.

In [413]:
def recommend_for_user(username):
    """Return the model's recommendations for ``username`` as ``(subreddit, score)`` pairs.

    The user name is resolved to its index in the module-level ``users`` array
    and that index is passed to ``model.recommend`` together with
    ``user_comments``. (The previous version looked the index up but then
    queried a hard-coded user id, so every name returned the same result.)

    Raises
    ------
    ValueError
        If ``username`` does not occur in ``users``.
    """
    found = np.where(users == username)[0]
    if len(found) == 0:
        raise ValueError("User doesn't exist in the dataset.")
    # Plain int, matching the kind of id the call was previously given.
    sample_user_id = int(found[0])
    return [(subreddits[i], v) for i, v in model.recommend(sample_user_id, user_comments)]

In [414]:
# Example: recommendations for a single user.
recommend_for_user('xkcd_transcriber')

[('Buttcoin', 3.698471),
 ('redditrequest', 3.686717),
 ('spam', 3.6279433),
 ('programmingcirclejerk', 3.593357),
 ('modclub', 3.5591707),
 ('roguelikedev', 3.4863684),
 ('metanarchism', 3.4842796),
 ('LinuxCirclejerk', 3.4640236),
 ('ModelUSGov', 3.45617),
 ('lisp', 3.3709202)]

In [415]:
def subreddits_interacted_by_user(username):
    """Return the names of all subreddits with a non-zero comment count for ``username``.

    The user name is resolved to its index in the module-level ``users`` array,
    which selects that user's column of the subreddit x user ``comments`` matrix.
    Names are returned as a list, in ascending row (subreddit code) order.
    (The previous version read an undefined name instead of the selected column.)

    Raises
    ------
    ValueError
        If ``username`` does not occur in ``users``.
    """
    found = np.where(users == username)[0]
    if len(found) == 0:
        raise ValueError("User doesn't exist in the dataset.")
    sample_user_id = found[0]
    # Densify just this one column; with a single column, the flat index of
    # each non-zero entry equals its row (subreddit) index.
    user_column = comments.getcol(sample_user_id).toarray()
    return [subreddits[idx] for idx in np.flatnonzero(user_column)]

In [416]:
# Sample 50 of the subreddits that xkcd_transcriber has interacted with.
random.sample(subreddits_interacted_by_user('xkcd_transcriber'), 50)

['linuxquestions',
 'cpp',
 'NBASpurs',
 'computertechs',
 'Serendipity',
 'Libertarian',
 'TheFacebookDelusion',
 'Kappa',
 'battlestations',
 'linux_gaming',
 'creepy',
 'Banshee',
 'harrypotter',
 'iphone',
 'collapse',
 'gatech',
 'Cricket',
 'calvinandhobbes',
 'PS4',
 'MLS',
 'Gunners',
 'lolphp',
 'changetip',
 'Python',
 'latterdaysaints',
 'scientology',
 'startrek',
 'electronics',
 'learnpython',
 'ukpolitics',
 'acecombat',
 'lgbt',
 'ShitAmericansSay',
 'SacredGeometry',
 'philadelphia',
 'JamesBond',
 'speedrun',
 'CodAW',
 'Austin',
 'sweden',
 'techsupportanimals',
 'sysadmin',
 'shittyaskreddit',
 'RWBY',
 'CFBOffTopic',
 'minecraftsuggestions',
 'phillycycling',
 'AndroidMasterRace',
 'socialskills',
 'MilitaryPorn']