In [1]:
import implicit
import numpy as np
from tqdm import tqdm_notebook
import pandas as pd
import csv 
import scipy
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from implicit.nearest_neighbours import bm25_weight
from implicit import alternating_least_squares
from implicit import approximate_als
import umap

In [2]:
data = []
with open('interactions_30_ch_no_bots') as csvfile:
    datareader = csv.reader(csvfile, delimiter = ' ')
    for subreddit, user, comments, _ in datareader:
        data.append([user, subreddit, int(comments)])

In [3]:
data = pd.DataFrame.from_records(data)

In [4]:
data.columns = ['user', 'subreddit', 'comments']

In [5]:
data['user'] = data['user'].astype("category")
data['subreddit'] = data['subreddit'].astype("category")

In [6]:
# create a sparse matrix of all the artist/user/play triples
comments = coo_matrix((data['comments'].astype(float), 
                   (data['subreddit'].cat.codes, 
                    data['user'].cat.codes)))

### Latent Semantic Analysis ( Matrix Factorization)

In [8]:
# toggle this variable if you want to recalculate the als factors
read_als_factors_from_file = True

In [9]:
if read_als_factors_from_file:
    subreddit_factors = np.load('subreddit_factors_als.npy')
    user_factors = np.load('user_factors_als.npy')
else:
    subreddit_factors, user_factors = alternating_least_squares(bm25_weight(comments), 20)

In [10]:
subreddit_factors, user_factors = alternating_least_squares(bm25_weight(comments), 20, use_gpu = False)

This method is deprecated. Please use the AlternatingLeastSquares class instead
100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [02:26<00:00,  9.15s/it]


In [11]:
class TopRelated(object):
    def __init__(self, subreddit_factors):
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        self.factors = subreddit_factors / norms[:, np.newaxis]
        self.subreddits = data['subreddit'].cat.categories.array.to_numpy()

    def get_related(self, subreddit, N=10):
        subreddit_hits = np.where(self.subreddits == subreddit)
        print(subreddit_hits)
        if not any(subreddit_hits[0]):
            print("In our world your SubReddit does not exist!!!")
            return 
        subredditid = subreddit_hits[0][0]
        scores = self.factors.dot(self.factors[subredditid])
        best = np.argpartition(scores, -N)[-N:]
        best_ = [self.subreddits[i] for i in best]
        return sorted(zip(best_, scores[best]), key=lambda x: -x[1])

In [12]:
top_related = TopRelated(subreddit_factors)

In [13]:
top_related.get_related('bleach')

(array([23623], dtype=int64),)


[('bleach', 1.0),
 ('OnePiece', 0.9817828),
 ('Naruto', 0.9807206),
 ('UNAMmx', 0.96712846),
 ('Seinen', 0.9622733),
 ('OnePunchMan', 0.9606999),
 ('CodeGeass', 0.96038693),
 ('manga', 0.9593556),
 ('HunterXHunter', 0.9584304),
 ('FullmetalAlchemist', 0.95777917)]

In [16]:
subreddit_factors.shape

(41143, 20)

In [17]:
subreddits_embedded = umap.UMAP().fit_transform(subreddit_factors)
subreddits_embedded.shape

AttributeError: module 'umap' has no attribute 'UMAP'

In [None]:
subreddits_embedded

In [11]:
subreddits = data['subreddit'].cat.categories.array.to_numpy()

In [None]:
import random

indices = random.sample(range(len(subreddits)), 1000)

In [None]:
sampled_subreddits = subreddits[indices]
sampled_subreddits_embedded = subreddits_embedded[indices]

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='abkds', api_key='KKuXHMUKu7EHg9kIZWrl')


# Create random data with numpy
import numpy as np

N = 500
xs = sampled_subreddits_embedded[:, 0]
ys = sampled_subreddits_embedded[:, 1]

# Create a trace
trace = go.Scatter(
    x = xs,
    y = ys,
    mode='markers+text',
    text=sampled_subreddits
)

data_ = [trace]

# Plot and embed in ipython notebook!
py.iplot(data_, filename='basic-scatter')

# or plot with: plot_url = py.plot(data, filename='basic-line')

### Bayesian Personalized Ranking

In [93]:
from implicit.bpr import BayesianPersonalizedRanking

params = {"factors": 63}

In [94]:
cx = comments

k = 0
for i, j, v in zip(comments.row, cx.col, cx.data):
    if k == 10:
        break
    print( "(%d, %d), %s" % (i,j,v) )
    k += 1

(27148, 1113188), 86.0
(27148, 824475), 30.0
(27148, 747152), 9.0
(27148, 726613), 1.0
(27148, 2314515), 84.0
(27148, 1640350), 52.0
(27148, 1206063), 17.0
(27148, 1491311), 71.0
(27148, 1003607), 165.0
(27148, 2089596), 15.0


In [95]:
import logging
import tqdm
import time
import codecs

In [96]:
model = BayesianPersonalizedRanking(**params)

In [97]:
model_name = 'bpr'
output_filename = 'subreddits_recs_bpr'

In [98]:
model.fit(comments)

100%|██████████████████████████████████| 100/100 [00:43<00:00,  2.34it/s, correct=95.78%, skipped=7.16%]


In [36]:
def bpr_related_subreddits(subreddit):
    found = np.where(subreddits == subreddit)
    if len(found[0]) == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    _id = found[0][0]
    return [(subreddits[i], v) for i, v in model.similar_items(_id)]

In [37]:
bpr_related_subreddits('OnePiece')

[('OnePiece', 3.054841),
 ('BokuNoHeroAcademia', 2.8956935),
 ('HunterXHunter', 2.8327653),
 ('Naruto', 2.809574),
 ('fairytail', 2.8008738),
 ('Toriko', 2.7774673),
 ('bleach', 2.7609572),
 ('Gintama', 2.6900525),
 ('OnePieceCircleJerk', 2.6743338),
 ('NanatsunoTaizai', 2.6683886)]

In [38]:
users = data['user'].cat.categories.array.to_numpy()

In [None]:
# generate recommendations for each user and write out to a file
user_comments = comments.T.tocsr()

with tqdm.tqdm_notebook(total=len(users)) as progress:
    with codecs.open(output_filename, "w", "utf8") as o:
        for userid, username in enumerate(users):
            for subredditid, score in model.recommend(userid, user_comments):
                o.write("%s\t%s\t%s\n" % (username, subreddits[subredditid], score))
            progress.update(1)

### Sample user recommendations

We went through the user 'xkcd_transciber' list of subreddits, where he/she commented. Taking a view of the kind of subreddits followed by the user we see that the predictions are good. This is just one sample, we are saving the recommendations for all users in a file and will also write the AUC score function for getting the exact scores for the generated recommendations.

In [34]:
def recommend_for_user(username):
    sample_user_id = np.where(users == username)[0][0]
    return [(subreddits[i], v) for i, v in model.recommend(2293528, user_comments)]

In [None]:
recommend_for_user('xkcd_transcriber')

In [None]:
def subreddits_interacted_by_user(username):
    sample_user_id = np.where(users == username)[0][0]
    _idlist =  comments.getcol(sample_user_id)
    return [subreddits[idx] for idx, i in enumerate(xkcd.toarray()) if i != 0.0]

In [None]:
# sample 50 reddits with which xkcd_transcriber has interacted with.
random.sample(subreddits_interacted_by_user('xkcd_transcriber'), 50)

## Similarity 


In [8]:
import implicit.nearest_neighbours 
import numpy as np

bm25recommender = implicit.nearest_neighbours.BM25Recommender()
bm25recommender.fit(comments)

100%|███████████████████████████████████████████████████████████| 41143/41143 [00:04<00:00, 9102.42it/s]


In [9]:
def simililar(subreddit, model):
    found = np.where(subreddits == subreddit)
    if len(found[0]) == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    _id = found[0][0]
    return [(subreddits[i], v) for i, v in model.similar_items(_id)]

In [21]:
simililar('relationship_advice', bm25recommender)

[('relationship_advice', 56264.860356837584),
 ('relationships', 7646.369998617379),
 ('sex', 3620.3338698129846),
 ('dating_advice', 3464.2280969267954),
 ('Advice', 2987.466870557861),
 ('DeadBedrooms', 2613.245492302089),
 ('Marriage', 2289.7037608953324),
 ('confession', 2212.4492161185867),
 ('offmychest', 1906.8668341457656),
 ('askwomenadvice', 1890.3259544583016)]

In [15]:
cosine_recommender = implicit.nearest_neighbours.CosineRecommender()
cosine_recommender.fit(comments)

100%|██████████████████████████████████████████████████████████| 41143/41143 [00:01<00:00, 25334.11it/s]


In [22]:
simililar('relationship_advice', cosine_recommender)

[('relationship_advice', 1.0000000000001017),
 ('junkojunsui', 0.2294500237344137),
 ('Controversy', 0.10987747615450796),
 ('relationships', 0.10444737538547606),
 ('badparents', 0.0687772524582305),
 ('Advice', 0.06377708722197854),
 ('OneY', 0.06206601341100445),
 ('dating_advice', 0.05847217068137823),
 ('Marriage', 0.054288376156235725),
 ('postnationalist', 0.05180502196499363)]

In [23]:
item_item_recommender = implicit.nearest_neighbours.ItemItemRecommender()
item_item_recommender.fit(comments)

100%|██████████████████████████████████████████████████████████| 41143/41143 [00:01<00:00, 25148.53it/s]


In [25]:
simililar('relationship_advice', item_item_recommender)

[('relationship_advice', 1532006.0),
 ('relationships', 624468.0),
 ('AskReddit', 426470.0),
 ('sex', 93304.0),
 ('AskWomen', 74514.0),
 ('Advice', 70018.0),
 ('AskMen', 57261.0),
 ('TwoXChromosomes', 52129.0),
 ('funny', 49147.0),
 ('todayilearned', 47359.0)]

In [27]:
tf_idf_recommender = implicit.nearest_neighbours.TFIDFRecommender()
tf_idf_recommender.fit(comments)

100%|██████████████████████████████████████████████████████████| 41143/41143 [00:01<00:00, 27210.93it/s]


In [28]:
simililar('relationship_advice', tf_idf_recommender)

[('relationship_advice', 1.0000000000000269),
 ('relationships', 0.1485194909573468),
 ('junkojunsui', 0.09118742669753052),
 ('sex', 0.07145533336297871),
 ('Controversy', 0.0683580169544256),
 ('dating_advice', 0.06248236304658284),
 ('Advice', 0.05464629578540297),
 ('Marriage', 0.04939810870950803),
 ('AskWomen', 0.045036999780776604),
 ('datingoverthirty', 0.04339423672382081)]

## SURPRISE MADAFAQAR

In [34]:
import surprise

In [35]:
svdpp = surprise.SVDpp()

In [105]:
reader = surprise.Reader()
dataset = surprise.Dataset.load_from_df(data, reader)

In [106]:
# Retrieve the trainset
trainset = dataset.build_full_trainset()

In [None]:
svdpp.fit(trainset)

In [60]:
a = svdpp.compute_similarities()

In [101]:
data['subreddit'][0000]

'exmormon'

In [104]:
found = np.where(subreddits == 'vv')
_id = found[0][0]
subreddits[np.argsort(a[_id])[::-1][0:10]]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [103]:
help(np.arg)

AttributeError: module 'numpy' has no attribute 'arg'