In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import dill

In [2]:
# Load the gzip-compressed comment table into a single DataFrame and display it.
comments_csv = 'main_comments.csv.gz'
dataset = pd.read_csv(comments_csv)
dataset

Unnamed: 0,id,score,link_id,author,subreddit,created_utc
0,t1_gl0hhsq,1,t3_l6a44e,Io99IHkg-4QzX6xbKwbte0cuzp4=,wallstreetbets,1611788340
1,t1_gmd0xrl,1,t3_lehibh,1UBdU9GQvCnnXQHAcYaG1uL9V_U=,RedditSessions,1612683157
2,t1_gggg1ed,1,t3_kgocvo,Io99IHkg-4QzX6xbKwbte0cuzp4=,memes,1608454223
3,t1_g7zggfh,1,t3_j6n57d,EA1r-K5p_lVBLesLhCFRrKOPN-I=,videos,1602058834
4,t1_fn060jg,26,t3_fyheuv,_aeNuqWD_AT5JIfooWYpKiZR8qg=,nfl,1586536065
...,...,...,...,...,...,...
27997318,t1_f8sotb4,1,t3_e20bxb,EA1r-K5p_lVBLesLhCFRrKOPN-I=,Showerthoughts,1574785778
27997319,t1_eczq4gz,1,t3_abbin1,Pkz1m3vsliYpbnltUnWaQkPFLEo=,AskReddit,1546315274
27997320,t1_f240cz6,0,t3_dbujdv,NfwullBPKgqUPvj_Qr6RPnH1hrI=,DestinyTheGame,1569942007
27997321,t1_em778f3,2,t3_bj2bnd,kj12hcxGWPd3LxjKCNpoPFTDNBQ=,funny,1556683776


In [None]:
# Draw a reproducible sample of authors and keep only their comments.
# Two hashed author ids are excluded from the candidate pool.
# NOTE(review): these are presumably placeholder/bot accounts (they appear very
# frequently in the data) — confirm.
excluded_authors = ['EA1r-K5p_lVBLesLhCFRrKOPN-I=', 'Io99IHkg-4QzX6xbKwbte0cuzp4=']
authors = dataset.loc[~dataset['author'].isin(excluded_authors), 'author'].unique()

np.random.seed(0)
# np.random.choice samples WITH replacement by default, which returns duplicate
# authors and therefore fewer distinct users than requested; sample without
# replacement and cap the size at the number of available authors.
sample_authors = np.random.choice(authors, min(10000, len(authors)), replace=False)
sample_ds = dataset.loc[dataset['author'].isin(set(sample_authors))]

In [None]:
# Visualize user retention for the sampled authors: for every author, measure
# the time between their first and last comment and count how many authors
# fall into each whole-"month" bucket.
month = 2419200  # seconds in 28 days (28 * 24 * 60 * 60)

# One groupby pass yields both the first and the last comment timestamp.
activity = sample_ds.groupby('author')['created_utc'].agg(['min', 'max'])
start = activity['min']
end = activity['max']
diffs = end - start

# Number of authors per whole month of activity (vectorised count).
count = (diffs // month).value_counts().sort_index().to_dict()

# Drop the "less than one month" bucket; pop() with a default does not raise
# when no author falls into that bucket.
count.pop(0, None)

# Visualization of how long the user stays on the platform.
# It seems to decrease at an exponential rate.
fig, ax = plt.subplots()
ax.bar(list(count.keys()), list(count.values()))
ax.set(
    title='User retention',
    xlabel='Months between first and last comment',
    ylabel='Number of authors',
)
plt.show()

In [None]:
# Draw a larger author sample; it is used to cluster users by the subreddits
# they comment in.
np.random.seed(0)
# replace=False: the default samples with replacement, which yields duplicate
# authors and fewer distinct users than requested. The size is capped so the
# call cannot fail when fewer authors are available.
users = np.random.choice(authors, min(100000, len(authors)), replace=False)
sample_ds2 = dataset.loc[dataset['author'].isin(set(users))]
sample_ds2

In [None]:
# Build a binary user x subreddit membership matrix:
# matrix[u, s] == 1 when user u has at least one comment in subreddit s.
subreddits = sample_ds2['subreddit'].unique()
subreddit_map = {subreddit: idx for idx, subreddit in enumerate(subreddits)}

# Number the users in sorted order. groupby('author') also returns authors in
# sorted order, so row i of the matrix and element i of any per-author
# aggregate refer to the same user. (Iterating a set gives an arbitrary order,
# which silently misaligns features and per-author targets.)
user_map = {
    user: idx
    for idx, user in enumerate(sorted(sample_ds2['author'].dropna().unique()))
}

# Vectorised fill: translate every comment to (row, column) and set those cells.
row_idx = sample_ds2['author'].map(user_map)
col_idx = sample_ds2['subreddit'].map(subreddit_map)
# Skip comments whose author or subreddit could not be mapped (e.g. missing values).
known = (row_idx.notna() & col_idx.notna()).to_numpy()

dense = np.zeros((len(user_map), len(subreddits)), dtype=np.int64)
dense[
    row_idx.to_numpy()[known].astype(np.intp),
    col_idx.to_numpy()[known].astype(np.intp),
] = 1

# n x d matrix for n users and d subreddits
matrix = pd.DataFrame(dense)
matrix

In [None]:
# Factorize the user x subreddit matrix with NMF (2 components) and plot the
# resulting 2-D embedding of the subreddits, then of the users.
from sklearn.decomposition import NMF
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(matrix)
H = model.components_

# Move the two outlier subreddits to the origin so they do not stretch the
# axes. A boolean mask is used instead of index variables defaulting to 0:
# with the defaults, a missing name would zero out the first subreddit.
outlier_mask = np.isin(subreddits, ['AskReddit', 'memes'])
H[:, outlier_mask] = 0

plot = pd.DataFrame({'x': H[0], 'y': H[1]})

# 2d embedding of how close subreddits are with some outliers removed
fig = px.scatter(plot, x='x', y='y')
fig.show('notebook')

# Refit on the transposed matrix: the components now hold one column per user.
W2 = model.fit_transform(matrix.transpose())
H2 = model.components_
fig = px.scatter(x=H2[0], y=H2[1])
fig.show('notebook')

In [None]:
# Weight the membership matrix: divide every user's row by the number of
# subreddits that user appears in. Row totals are clamped to at least 1 so a
# row of zeros is divided by 1 rather than by 0.
sum_mat = matrix.sum(axis=1).clip(lower=1)
matrix2 = matrix.divide(sum_mat, axis=0)
matrix2

In [None]:
# Factorize the row-normalised matrix (embed subreddits into 2d space).
# matrix2 carries the per-user weights that adjust for users active in
# multiple subreddits; fitting on the unweighted `matrix` would only
# reproduce the earlier unweighted embedding.
from sklearn.decomposition import NMF
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(matrix2)
H = model.components_

# Move the two outlier subreddits to the origin. A boolean mask avoids
# zeroing the subreddit at index 0 when one of the names is absent.
outlier_mask = np.isin(subreddits, ['AskReddit', 'memes'])
H[:, outlier_mask] = 0

plot = pd.DataFrame({'x': H[0], 'y': H[1], 'subreddit': subreddits})

# 2d embedding of how close subreddits are with some outliers removed;
# hovering over a point shows the subreddit name.
fig = px.scatter(plot, x='x', y='y', hover_name='subreddit')
fig.show('notebook')

# Refit on the transposed matrix: the components now hold one column per user.
# These arrays have one entry per user, so they must not be combined with
# `plot` (one row per subreddit) — plotly rejects mismatched lengths.
W = model.fit_transform(matrix2.transpose())
H = model.components_
fig = px.scatter(x=H[0], y=H[1])
fig.show('notebook')

In [None]:
# Project the users onto their first two principal components and scatter
# them: first for the row-normalised matrix, then for the raw binary matrix.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

for user_features in (matrix2, matrix):
    pca_mat = pca.fit_transform(user_features)
    fig = px.scatter(x=pca_mat[:, 0], y=pca_mat[:, 1])
    fig.show()


In [None]:
# Show the distribution of sampled comments across subreddits
# (row counts per subreddit, most frequent first, log-scaled y axis).
user_distribution = sample_ds2['subreddit'].value_counts()
user_distribution.plot.line(logy=True, title='User distribution')

In [None]:
# Fit a linear regression that predicts each user's active span (seconds
# between first and last comment) from their normalised subreddit vector.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# One groupby pass yields both the first and the last comment timestamp.
activity = sample_ds2.groupby('author')['created_utc'].agg(['min', 'max'])
start = activity['min']
end = activity['max']

# groupby returns authors in sorted order, whereas the rows of matrix2 follow
# user_map. train_test_split pairs X and Y by position, so re-order the targets
# to the matrix row order before fitting.
row_authors = sorted(user_map, key=user_map.get)
Y = (end - start).reindex(row_authors)
X = matrix2

trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.33, random_state=0)

reg = LinearRegression().fit(trainX, trainY)
reg.coef_

In [None]:
# Compare predictions with the held-out targets. The features are reduced to
# a single PCA component so both series can share one x axis.
n = len(testX)
y = reg.predict(testX)

# PCA onto one dimension
pca = PCA(n_components=1)
x = pca.fit_transform(testX).ravel()

# Stack predictions (color 0) on top of actual values (color 1).
plotGraph = pd.DataFrame({
    'x': np.concatenate((x, x)),
    'y': np.concatenate((y, testY.values.ravel())),
    'color': [0] * n + [1] * n,
    'hover': ['prediction'] * n + ['actual'] * n,
})

fig = px.scatter(
    plotGraph,
    x='x',
    y='y',
    color='color',
    log_y=True,
    hover_data=['hover'],
    title='Regression Plot testing',
)
fig.show()

In [None]:
# Same comparison as for the test split, but on the training data.
n = len(trainX)
y = reg.predict(trainX)

# PCA onto one dimension
pca = PCA(n_components=1)
x = pca.fit_transform(trainX).ravel()

# Stack predictions (color 0) on top of actual values (color 1).
plotGraph = pd.DataFrame({
    'x': np.concatenate((x, x)),
    'y': np.concatenate((y, trainY.values.ravel())),
    'color': [0] * n + [1] * n,
    'hover': ['prediction'] * n + ['actual'] * n,
})

fig = px.scatter(
    plotGraph,
    x='x',
    y='y',
    color='color',
    log_y=True,
    hover_data=['hover'],
    title='Regression Plot on Training',
)
fig.show()

In [None]:
# Let us revisit the PCA plots from before and colour every user by their
# active span, to see whether certain clusters share retention levels.

# Per-user span between first and last comment, ordered like the matrix rows.
# (`diff` must be a Series here: the only earlier binding of that name is the
# scalar left over from a for-loop, which has no `.values`.)
span = sample_ds2.groupby('author')['created_utc'].agg(['min', 'max'])
diff = (span['max'] - span['min']).reindex(sorted(user_map, key=user_map.get))

pca = PCA(n_components=2)
pca_mat = pca.fit_transform(matrix)

plot = pd.DataFrame()
plot['x'] = pca_mat[:, 0]
plot['y'] = pca_mat[:, 1]
plot['color'] = diff.values
fig = px.scatter(plot, x='x', y='y', color='color')
fig.show()

# Same plot for the row-normalised matrix.
pca_mat2 = pca.fit_transform(matrix2)
plot2 = pd.DataFrame()
plot2['x'] = pca_mat2[:, 0]
plot2['y'] = pca_mat2[:, 1]
plot2['color'] = diff.values
fig2 = px.scatter(plot2, x='x', y='y', color='color')
fig2.show()


In [None]:
# Colour the same PCA projections by what the regression predicts for each
# user, once for the raw matrix and once for the row-normalised matrix.
c1 = reg.predict(matrix)
c2 = reg.predict(matrix2)

pca = PCA(n_components=2)

# Raw binary matrix
pca_mat = pca.fit_transform(matrix)
plot = pd.DataFrame({'x': pca_mat[:, 0], 'y': pca_mat[:, 1], 'color': c1})
fig = px.scatter(plot, x='x', y='y', color='color')
fig.show()

# Row-normalised matrix
pca_mat2 = pca.fit_transform(matrix2)
plot2 = pd.DataFrame({'x': pca_mat2[:, 0], 'y': pca_mat2[:, 1], 'color': c2})
fig2 = px.scatter(plot2, x='x', y='y', color='color')
fig2.show()

# It seems that each cluster has a homogeneous distribution, and there are far fewer zeros in our regression
# plot; maybe we should see the results if we remove our zeros.

In [None]:
# Drop users whose first and last comment coincide (span 0), retrain the
# regressor on the remaining users, and plot actual vs. predicted spans in
# PCA space with each user's subreddits shown on hover.

# Per-user span between first and last comment, ordered like the matrix rows.
# (`diff` must be a Series: the only earlier binding of that name is the
# scalar left over from a for-loop, on which the filter below would fail.)
span = sample_ds2.groupby('author')['created_utc'].agg(['min', 'max'])
diff = (span['max'] - span['min']).reindex(sorted(user_map, key=user_map.get))

# Filter out 0s
filtr = (diff != 0).tolist()
X = matrix2[filtr]
Y = diff[filtr]

# Column index -> subreddit name
inverse_subreddit = {idx: subreddit for subreddit, idx in subreddit_map.items()}

def transform_row(row):
    """Return the names of the subreddits that have a positive weight in `row`.

    `row` is one user's feature vector; position i corresponds to the
    subreddit stored under key i in `inverse_subreddit`.
    """
    return [inverse_subreddit[i] for i in np.flatnonzero(np.asarray(row) > 0)]

# Plain list in row order. The plot frames below use a fresh 0..m-1 index,
# so a Series carrying X's filtered row labels would be mis-aligned on assignment.
reddits = [transform_row(row) for row in X.to_numpy()]

# Train new regressor
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.33, random_state=0)
reg = LinearRegression().fit(trainX, trainY)

# Use a fresh 2-component PCA instead of whatever `pca` an earlier cell left behind.
pca = PCA(n_components=2)
pca_mat2 = pca.fit_transform(X)

# Actual spans
plot2 = pd.DataFrame()
plot2['x'] = pca_mat2[:, 0]
plot2['y'] = pca_mat2[:, 1]
plot2['color'] = Y.values
plot2['reddits'] = pd.Series(reddits, index=plot2.index, dtype=object)
fig2 = px.scatter(plot2, x='x', y='y', color='color', hover_data=['reddits'])
fig2.show()

# Predicted spans
plot3 = pd.DataFrame()
plot3['x'] = pca_mat2[:, 0]
plot3['y'] = pca_mat2[:, 1]
plot3['color'] = reg.predict(X)
plot3['reddits'] = pd.Series(reddits, index=plot3.index, dtype=object)
fig3 = px.scatter(plot3, x='x', y='y', color='color', hover_data=['reddits'])
fig3.show()

In [None]:
# NOTE(review): this cell repeats the preceding cell's analysis; consider
# deleting one of the two copies.
#
# Drop users whose first and last comment coincide (span 0), retrain the
# regressor on the remaining users, and plot actual vs. predicted spans in
# PCA space with each user's subreddits shown on hover.

# Per-user span between first and last comment, ordered like the matrix rows.
# (`diff` must be a Series: the only earlier binding of that name is the
# scalar left over from a for-loop, on which the filter below would fail.)
span = sample_ds2.groupby('author')['created_utc'].agg(['min', 'max'])
diff = (span['max'] - span['min']).reindex(sorted(user_map, key=user_map.get))

# Filter out 0s
filtr = (diff != 0).tolist()
X = matrix2[filtr]
Y = diff[filtr]

# Column index -> subreddit name
inverse_subreddit = {idx: subreddit for subreddit, idx in subreddit_map.items()}

def transform_row(row):
    """Return the names of the subreddits that have a positive weight in `row`.

    `row` is one user's feature vector; position i corresponds to the
    subreddit stored under key i in `inverse_subreddit`.
    """
    return [inverse_subreddit[i] for i in np.flatnonzero(np.asarray(row) > 0)]

# Plain list in row order. The plot frames below use a fresh 0..m-1 index,
# so a Series carrying X's filtered row labels would be mis-aligned on assignment.
reddits = [transform_row(row) for row in X.to_numpy()]

# Train new regressor
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.33, random_state=0)
reg = LinearRegression().fit(trainX, trainY)

# Use a fresh 2-component PCA instead of whatever `pca` an earlier cell left behind.
pca = PCA(n_components=2)
pca_mat2 = pca.fit_transform(X)

# Actual spans
plot2 = pd.DataFrame()
plot2['x'] = pca_mat2[:, 0]
plot2['y'] = pca_mat2[:, 1]
plot2['color'] = Y.values
plot2['reddits'] = pd.Series(reddits, index=plot2.index, dtype=object)
fig2 = px.scatter(plot2, x='x', y='y', color='color', hover_data=['reddits'])
fig2.show()

# Predicted spans
plot3 = pd.DataFrame()
plot3['x'] = pca_mat2[:, 0]
plot3['y'] = pca_mat2[:, 1]
plot3['color'] = reg.predict(X)
plot3['reddits'] = pd.Series(reddits, index=plot3.index, dtype=object)
fig3 = px.scatter(plot3, x='x', y='y', color='color', hover_data=['reddits'])
fig3.show()