In [1]:
import implicit
import numpy as np
from tqdm import tqdm_notebook
import pandas as pd
import csv 
import scipy
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from implicit.nearest_neighbours import bm25_weight
from implicit import alternating_least_squares
import umap

In [2]:
data = []
with open('interactions_30_ch_no_bots') as csvfile:
    datareader = csv.reader(csvfile, delimiter=' ')
    for subreddit, user, comments, _ in datareader:
        data.append([user, subreddit, int(comments)])

In [3]:
data = pd.DataFrame.from_records(data)

In [4]:
data.columns = ['user', 'subreddit', 'comments']

In [5]:
data['user'] = data['user'].astype("category")
data['subreddit'] = data['subreddit'].astype("category")

In [6]:
# create a sparse matrix of all the artist/user/play triples
comments = coo_matrix((data['comments'].astype(float), 
                   (data['subreddit'].cat.codes, 
                    data['user'].cat.codes)))

### Latent Semantic Analysis

In [7]:
subreddit_factors, user_factors = alternating_least_squares(bm25_weight(comments), 20)

This method is deprecated. Please use the AlternatingLeastSquares class instead
100%|██████████| 15.0/15 [01:04<00:00,  3.69s/it]


In [8]:
class TopRelated(object):
    def __init__(self, subreddit_factors):
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        self.factors = subreddit_factors / norms[:, np.newaxis]
        self.subreddits = data['subreddit'].cat.categories.array.to_numpy()

    def get_related(self, subreddit, N=10):
        subredditid = np.where(self.subreddits == subreddit)[0][0]
        scores = self.factors.dot(self.factors[subredditid])
        best = np.argpartition(scores, -N)[-N:]
        best_ = [self.subreddits[i] for i in best]
        return sorted(zip(best_, scores[best]), key=lambda x: -x[1])

In [9]:
top_related = TopRelated(subreddit_factors)

In [10]:
top_related.get_related('OnePiece')

[('OnePiece', 0.9999999),
 ('Naruto', 0.98521143),
 ('OnePunchMan', 0.97248393),
 ('bleach', 0.9659112),
 ('evangelion', 0.9641021),
 ('snowleopards', 0.96162313),
 ('FullmetalAlchemist', 0.9615463),
 ('KillLaKill', 0.9608073),
 ('churchofpokemon', 0.959475),
 ('gantz', 0.95921016)]

In [11]:
subreddit_factors.shape

(41143, 20)

In [12]:
subreddits_embedded = umap.UMAP().fit_transform(subreddit_factors)
subreddits_embedded.shape

  n_components


(41143, 2)

In [13]:
subreddits_embedded

array([[ 3.712512  , -0.54299134],
       [10.828133  , -4.028959  ],
       [ 7.516719  , -4.2612247 ],
       ...,
       [12.795794  , -2.954731  ],
       [ 2.836984  ,  6.1448727 ],
       [ 8.1486    , -4.846176  ]], dtype=float32)

In [14]:
subreddits = data['subreddit'].cat.categories.array.to_numpy()

In [32]:
import random

indices = random.sample(range(len(subreddits)), 1000)

In [33]:
sampled_subreddits = subreddits[indices]
sampled_subreddits_embedded = subreddits_embedded[indices]

In [34]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='abkds', api_key='KKuXHMUKu7EHg9kIZWrl')


# Create random data with numpy
import numpy as np

N = 500
xs = sampled_subreddits_embedded[:, 0]
ys = sampled_subreddits_embedded[:, 1]

# Create a trace
trace = go.Scatter(
    x = xs,
    y = ys,
    mode='markers+text',
    text=sampled_subreddits
)

data_ = [trace]

# Plot and embed in ipython notebook!
py.iplot(data_, filename='basic-scatter')

# or plot with: plot_url = py.plot(data, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~abkds/0 or inside your plot.ly account where it is named 'basic-scatter'
