In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
import time
from tqdm.notebook import tqdm
import datetime
import os
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

In [3]:
# Load the joke and user feature tables from the parent directory.
# Forward slashes resolve on Windows, Linux and macOS alike, whereas a
# hard-coded '..\' separator is only understood on Windows.
joke_df = pd.read_csv('../joke_df_features_svd.csv')
user_df = pd.read_csv('../user_df_features_svd.csv')

In [5]:
# Display the loaded joke feature table as this cell's output.
joke_df

Unnamed: 0,joke_feature_1,joke_feature_2,joke_feature_3,joke_feature_4,joke_feature_5,joke_feature_6,joke_feature_7,joke_feature_8,joke_feature_9,joke_feature_10,...,joke_feature_91,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100
0,0.112688,0.222817,-0.195922,0.219684,0.411241,-0.053139,0.290816,-0.058500,0.003905,0.066091,...,0.431792,-0.320441,0.025341,0.102580,-0.387499,0.224376,0.364185,-0.144255,-0.240137,-0.015587
1,0.313724,0.043327,-0.048844,0.067215,0.307983,0.014399,0.256734,-0.074591,-0.023945,0.102127,...,0.623640,0.011707,-0.018323,-0.022698,-0.248734,-0.178312,0.374911,-0.056900,-0.330262,-0.087538
2,-0.252002,0.430994,0.220400,0.133130,0.135608,0.111379,0.129852,0.039378,-0.162345,0.195971,...,-0.162176,-0.297996,-0.132594,0.337949,0.067280,0.285934,-0.153183,0.132184,0.000751,-0.057890
3,-0.052535,0.541485,-0.223866,0.139070,0.385290,0.131492,0.507533,0.002464,-0.133843,-0.067574,...,0.040186,-0.182518,0.084169,0.219612,-0.213434,0.361682,0.079440,-0.318766,-0.021421,0.146524
4,-0.027378,0.174890,-0.053249,0.102202,0.344133,0.069124,0.256610,-0.168126,-0.235313,0.104072,...,-0.037136,-0.286614,-0.147054,0.241144,0.003931,0.260650,0.021384,-0.104395,0.103104,0.084330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.137697,0.603971,-0.180179,-0.214594,0.265756,0.238847,0.676550,0.098790,-0.127231,0.063033,...,0.437792,0.151213,0.410687,0.035605,-0.268979,0.302526,0.168139,-0.008251,-0.173199,0.090621
96,-0.005569,0.412936,-0.002250,-0.214248,0.468186,0.145344,0.627553,-0.441548,0.129704,0.086022,...,0.078341,-0.180102,-0.107962,0.078101,0.119086,0.374430,0.161647,-0.568151,0.198361,0.353774
97,0.178174,-0.115281,-0.007642,-0.393356,0.423065,0.120142,0.585237,-0.422318,0.128890,0.172139,...,0.714508,-0.029828,0.136986,-0.066974,-0.150649,-0.105737,0.449684,-0.618395,0.196442,0.112897
98,-0.338073,0.425045,-0.089250,-0.360074,0.323095,0.378184,0.690955,-0.020013,-0.026818,-0.028049,...,0.369752,-0.075770,0.261672,0.101164,-0.216765,0.316440,0.162326,-0.422756,0.159005,0.225908


In [6]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
# Configure bokeh to render figures inline in the notebook's output cells.
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Scatter-plot 2-D points with bokeh, showing auxiliary info on hover.

    Args:
        x, y: point coordinates.
        radius: marker size handed to ``fig.scatter``.
        alpha: marker opacity.
        color: one color name applied to every point, or one color per point.
        width, height: figure dimensions.
        show: when true, the figure is displayed immediately.
        **kwargs: extra per-point columns; each becomes a hover tooltip row.

    Returns:
        The bokeh figure that was built.
    """
    # A lone color name is repeated so every point has its own entry.
    point_colors = [color] * len(x) if isinstance(color, str) else color

    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)

    # One tooltip row per extra column, referenced in the data source as "@<name>".
    hover_rows = [(name, "@" + name) for name in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_rows))

    if show:
        pl.show(fig)
    return fig

In [14]:
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans
import matplotlib.colors as mcolors

In [15]:
# Palette: every value of matplotlib's CSS4 color table, later indexed by cluster label.
_colors = [css_value for css_value in mcolors.CSS4_COLORS.values()]

In [17]:
# Project the joke feature rows to 2-D with t-SNE.
# t-SNE is stochastic, so random_state is pinned to make the layout reproducible
# under Restart & Run All.
joke_emb_tsne = TSNE(n_components=2, random_state=42).fit_transform(joke_df.values)
joke_emb_tsne



array([[ 1.6954951e-01,  1.0326442e+00],
       [-2.5769954e+00,  5.9198231e-01],
       [ 6.7597566e+00,  1.1812178e+00],
       [ 3.1149094e+00, -9.2545533e-01],
       [ 4.5549831e+00,  2.3123426e+00],
       [ 4.7749491e+00,  9.9226689e-01],
       [-1.6091648e+00,  3.6358497e+00],
       [-4.0327463e+00,  4.0856500e+00],
       [ 2.9454725e+00,  2.9118612e+00],
       [ 6.9973392e+00,  7.7469969e-01],
       [ 5.9954624e+00, -2.9655510e-01],
       [ 1.8651111e+00,  2.3622506e+00],
       [-3.5569432e+00,  8.6743039e-01],
       [ 2.9059994e+00,  3.2972522e+00],
       [ 2.2053487e+00, -2.6366749e+00],
       [-2.9275575e+00,  1.6974388e+00],
       [ 3.6004791e+00,  4.2233076e+00],
       [ 2.5103724e+00,  2.5687187e+00],
       [ 1.8265946e+00,  8.3217597e-01],
       [-4.0069013e+00,  2.8631923e+00],
       [-5.7018223e+00,  2.9941227e+00],
       [-1.7429446e+00,  5.8278947e+00],
       [ 6.8557439e+00,  1.8148562e+00],
       [ 6.9452114e+00, -3.0081490e-02],
       [ 1.87665

In [18]:
# Interactive scatter of the joke t-SNE layout; the hover token is the 1-based joke id.
draw_vectors(
    joke_emb_tsne[:, 0],
    joke_emb_tsne[:, 1],
    token=[f'j_{i}' for i in range(1, 1 + len(joke_df))],
)

In [31]:
# Cluster the jokes into 15 groups. K-Means starts from random centroids, so
# random_state is pinned to keep cluster ids (and plot colors) stable across runs.
joke_labels = KMeans(n_clusters=15, random_state=42).fit_predict(joke_df.values)



In [34]:
# Joke t-SNE layout colored by K-Means cluster; hover shows "j_<joke id>_<cluster>".
draw_vectors(
    joke_emb_tsne[:, 0],
    joke_emb_tsne[:, 1],
    color=[_colors[cluster] for cluster in joke_labels],
    token=[f'j_{i}_{joke_labels[i-1]}' for i in range(1, 1 + len(joke_df))],
    radius=20,
)

In [35]:
# Project the user feature rows to 2-D with t-SNE.
# t-SNE is stochastic, so random_state is pinned to make the layout reproducible
# under Restart & Run All.
user_emb_tsne = TSNE(n_components=2, random_state=42).fit_transform(user_df.values)
user_emb_tsne



array([[-1.1328012, 44.67709  ],
       [44.55226  , 35.50679  ],
       [ 2.579142 , 44.340794 ],
       ...,
       [ 1.584656 , 40.42305  ],
       [18.442324 , 17.109406 ],
       [-9.778803 , -1.366395 ]], dtype=float32)

In [37]:
# Interactive scatter of the user t-SNE layout; the hover token is the 1-based user id.
draw_vectors(
    user_emb_tsne[:, 0],
    user_emb_tsne[:, 1],
    token=[f'u_{i}' for i in range(1, 1 + len(user_df))],
)

In [39]:
# Cluster the users into 148 groups (presumably chosen to match the size of the
# `_colors` palette — TODO confirm). random_state is pinned so cluster ids and
# plot colors are stable across runs.
user_labels = KMeans(n_clusters=148, random_state=42).fit_predict(user_df.values)

In [41]:
# User t-SNE layout colored by K-Means cluster; hover shows "u_<user id>_<cluster>".
# Users carry the 'u_' prefix, matching the unclustered user plot; a 'j_' prefix
# here would mislabel users as jokes in the tooltip.
draw_vectors(user_emb_tsne[:, 0], user_emb_tsne[:, 1], color=[_colors[l] for l in user_labels],
             token=[f'u_{i}_{user_labels[i-1]}' for i in range(1, len(user_df) + 1)],
             radius=20,)

In [None]:
# Reduce `artist_emb` to two PCA components and show the result.
# NOTE(review): `artist_emb` is not defined in any cell visible in this notebook;
# this cell looks carried over from a different analysis — confirm its source
# before running, otherwise it fails with NameError on a fresh kernel.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pca = PCA(2)
scaler = StandardScaler()
artist_emb_pca = pca.fit_transform(artist_emb)
artist_emb_pca
#word_vectors_pca = scaler.fit_transform(word_vectors_pca)

In [None]:
# Standardize the PCA coordinates with the StandardScaler created alongside `pca`.
# This overwrites `artist_emb_pca` in place, so the unscaled projection is no
# longer available once this cell has run.
artist_emb_pca = scaler.fit_transform(artist_emb_pca)
artist_emb_pca

In [None]:
# Collect the values of `titles_dict` into a list and preview the first ten.
# NOTE(review): `titles_dict` is not defined in any cell visible in this notebook — confirm.
artists_arr = list(titles_dict.values())
artists_arr[:10]

In [None]:
# Interactive scatter of the (scaled) PCA projection; hover shows the matching `artists_arr` entry.
draw_vectors(artist_emb_pca[:, 0], artist_emb_pca[:, 1], token=artists_arr)


In [None]:
# Unbind the name `umap` before it is imported in the next cell.
# NOTE(review): raises NameError when `umap` is not already bound (e.g. on a fresh
# kernel run top to bottom) — looks like leftover interactive cleanup.
del umap

In [None]:
import umap

In [None]:
# Embed `artist_emb` with UMAP using 5 neighbours.
# NOTE(review): `artist_emb` is not defined in any cell visible in this notebook — confirm.
embedding = umap.UMAP(n_neighbors=5).fit_transform(artist_emb)

In [None]:
# Interactive scatter of the first two UMAP coordinates; hover shows the matching `artists_arr` entry.
draw_vectors(embedding[:, 0], embedding[:, 1], token=artists_arr)

In [None]:
def find_nearest(data_vectors, query_vector, k=10, items=None):
    """Return the k entries whose vectors are most cosine-similar to ``query_vector``.

    Args:
        data_vectors: dense 2-D numpy array with one embedding per row.
        query_vector: 1-D numpy array compared against every row.
        k: number of results to return; values <= 0 yield an empty list and
            values larger than the number of rows return every entry.
        items: sequence aligned with the rows of ``data_vectors`` whose entries
            are returned. When omitted, the module-level ``data`` sequence is
            used, as the original implementation did.

    Returns:
        A list of up to ``k`` entries ordered from most to least similar.
    """
    if k <= 0:
        # A bare ``[-k:]`` slice with k == 0 would select every row instead of none.
        return []

    lookup = data if items is None else items

    # Row norms are derived from the argument itself instead of being read from
    # a global ``norms`` array that may be undefined or out of sync with it.
    row_norms = np.linalg.norm(data_vectors, axis=1)
    query_norm = np.linalg.norm(query_vector)

    # The epsilons keep all-zero rows or an all-zero query from dividing by zero.
    similarities = data_vectors.dot(query_vector[:, None])[:, 0] / (
        (row_norms + 1e-16) * (query_norm + 1e-16)
    )

    # argsort is ascending: keep the last k indices and flip them to descending.
    nearest_elements = similarities.argsort(axis=0)[-k:][::-1]
    return [lookup[i] for i in nearest_elements]

In [None]:
# Inspect matplotlib's CSS4 color table (the source of the `_colors` palette).
mcolors.CSS4_COLORS

In [None]:
# Number of entries in the CSS4 color table, i.e. the size of the `_colors` palette.
len(mcolors.CSS4_COLORS)

In [None]:
# Cluster `artist_emb` into 148 groups; random_state is pinned so cluster ids
# (and therefore plot colors) are stable across runs.
# NOTE(review): `artist_emb` is not defined in any cell visible in this notebook — confirm.
labels = KMeans(n_clusters=148, random_state=42).fit_predict(artist_emb)

In [None]:
# PCA projection colored by the K-Means cluster of each point; hover shows the `artists_arr` entry.
draw_vectors(
    artist_emb_pca[:, 0],
    artist_emb_pca[:, 1],
    color=[_colors[cluster] for cluster in labels],
    token=artists_arr,
    radius=20,
)

In [None]:
# Cluster `artist_emb` with DBSCAN (eps=0.66), show the distinct labels found,
# then list (label, size) pairs from the largest group to the smallest.
labels_dbscan = DBSCAN(eps=0.66).fit_predict(artist_emb)
display(np.unique(labels_dbscan))
count = Counter(labels_dbscan)
count.most_common()

In [None]:
# Store the DBSCAN labels in the 'label' column of `titles_df` (mutates it in place) and show it.
# NOTE(review): `titles_df` is not defined in any cell visible in this notebook — confirm.
titles_df['label'] = labels_dbscan
titles_df

In [None]:
# Rows whose 'label' equals 11 — presumably a DBSCAN cluster picked by hand for inspection; verify.
titles_df[titles_df['label'] == 11]

In [None]:
# PCA projection colored by DBSCAN label.
# DBSCAN marks noise with label -1; indexing `_colors` with it would silently wrap
# around to the last palette entry, so noise points are drawn black instead.
# Cluster ids wrap modulo the palette size to avoid an IndexError when DBSCAN
# finds more clusters than there are colors.
draw_vectors(artist_emb_pca[:, 0], artist_emb_pca[:, 1],
             color=['black' if l < 0 else _colors[l % len(_colors)] for l in labels_dbscan],
             token=artists_arr,
             radius=20,)

In [None]:
from collections import Counter

In [None]:
# Re-cluster `artist_emb` into 300 groups and overwrite the 'label' column of `titles_df`.
# random_state is pinned so that cluster ids inspected by number in later cells
# refer to the same group on every run.
# NOTE(review): `artist_emb` and `titles_df` are not defined in any cell visible here — confirm.
labels = KMeans(n_clusters=300, random_state=42).fit_predict(artist_emb)

titles_df['label'] = labels
titles_df

In [None]:
# Rows whose 'artist_name' is 'Nirvana' (shows which 'label' they were assigned).
titles_df[titles_df['artist_name'] == 'Nirvana']

In [None]:
# Rows whose 'label' equals 225 — presumably the cluster found for 'Nirvana' in the
# previous cell; the id depends on the K-Means run, so verify before relying on it.
titles_df[titles_df['label'] == 225]