# User2Vec
Based on: https://ieeexplore.ieee.org/document/8875952/

How it works:
- Train a doc2vec model on the tweet texts
- Average the resulting vector representations for each user to obtain one vector per user


In [1]:
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
import eland as ed
from sklearn.manifold import TSNE

from src.models import User2Vec, tokenize

# If the Elasticsearch database is not already running, uncomment the line
# below (and give ES a couple of minutes to finish setting up).
#!make database

# Point eland at the local Elasticsearch instance and the 'twitter' index.
ed_df = ed.read_es(
    'localhost',
    'twitter',
)

In [43]:
# Columns pulled out of the Elasticsearch-backed frame for the rest of the
# notebook.
tweet_columns = [
    'tweet_id', 'original_tweet_id_str',
    'user_id', 'name', 'full_text_processed',
    'sentiment', 'followers_count',
]

# Materialise the selection as an in-memory pandas DataFrame, then apply
# fillna(np.nan) so missing entries are represented as NaN.
tweets_frame = ed_df[tweet_columns].to_pandas()
df = tweets_frame.fillna(np.nan)

In [3]:
# Distinct processed tweet texts; each one becomes a single training document.
unique_docs = df['full_text_processed'].unique()

# Pair every distinct text with its position in unique_docs, which is passed
# to tokenize() as the document tag.
train_corpus = [
    tokenize(text, doc_tag)
    for doc_tag, text in enumerate(unique_docs)
]

In [4]:
# Hyper-parameters handed to the User2Vec constructor.
model_params = {'vector_size': 10, 'min_count': 2, 'epochs': 40}
model = User2Vec(**model_params)

# Build the vocabulary from the tagged corpus, then train on that same corpus
# using the example count and epoch count stored on the model.
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Infer the user-level vectors from each row's user id and processed text.
# NOTE(review): how tweets are combined per user is decided inside
# User2Vec.infer_user_vectors, which is not shown here.
users_id, user_vectors = model.infer_user_vectors(
    df['user_id'], 
    df['full_text_processed'],
    track_progress=True
)

# Name the vector columns after the actual embedding width instead of a
# hard-coded 10, so changing vector_size on the model does not break the
# DataFrame construction with a column-count mismatch.
# assumes user_vectors is a 2-D array-like (n_users, n_dims) — TODO confirm
n_dims = np.asarray(user_vectors).shape[1]
vec_cols = [f'vec_{i}' for i in range(n_dims)]
df_user_vecs = pd.DataFrame(user_vectors, columns=vec_cols)
df_user_vecs['user_id'] = users_id

In [20]:
# Embed the user vectors with t-SNE: fixed seed, n_jobs=-1, verbose logging.
tsne = TSNE(
    random_state=0,
    n_jobs=-1,
    verbose=2,
)
user_embeddings = tsne.fit_transform(user_vectors)

# Store the two t-SNE coordinates next to each user's vector columns.
tsne_cols = ['tsne_0', 'tsne_1']
df_user_vecs[tsne_cols] = user_embeddings

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 194183 samples in 0.447s...
[t-SNE] Computed neighbors for 194183 samples in 257.234s...
[t-SNE] Computed conditional probabilities for sample 1000 / 194183
[t-SNE] Computed conditional probabilities for sample 2000 / 194183
[t-SNE] Computed conditional probabilities for sample 3000 / 194183
[t-SNE] Computed conditional probabilities for sample 4000 / 194183
[t-SNE] Computed conditional probabilities for sample 5000 / 194183
[t-SNE] Computed conditional probabilities for sample 6000 / 194183
[t-SNE] Computed conditional probabilities for sample 7000 / 194183
[t-SNE] Computed conditional probabilities for sample 8000 / 194183
[t-SNE] Computed conditional probabilities for sample 9000 / 194183
[t-SNE] Computed conditional probabilities for sample 10000 / 194183
[t-SNE] Computed conditional probabilities for sample 11000 / 194183
[t-SNE] Computed conditional probabilities for sample 12000 / 194183
[t-SNE] Computed conditional proba

[t-SNE] Computed conditional probabilities for sample 120000 / 194183
[t-SNE] Computed conditional probabilities for sample 121000 / 194183
[t-SNE] Computed conditional probabilities for sample 122000 / 194183
[t-SNE] Computed conditional probabilities for sample 123000 / 194183
[t-SNE] Computed conditional probabilities for sample 124000 / 194183
[t-SNE] Computed conditional probabilities for sample 125000 / 194183
[t-SNE] Computed conditional probabilities for sample 126000 / 194183
[t-SNE] Computed conditional probabilities for sample 127000 / 194183
[t-SNE] Computed conditional probabilities for sample 128000 / 194183
[t-SNE] Computed conditional probabilities for sample 129000 / 194183
[t-SNE] Computed conditional probabilities for sample 130000 / 194183
[t-SNE] Computed conditional probabilities for sample 131000 / 194183
[t-SNE] Computed conditional probabilities for sample 132000 / 194183
[t-SNE] Computed conditional probabilities for sample 133000 / 194183
[t-SNE] Computed con

In [51]:
# Per-user aggregates: count of non-null tweet texts, mean sentiment and the
# largest follower count seen for the user.
meta_aggs = {
    'full_text_processed': 'count',
    'sentiment': 'mean',
    'followers_count': 'max',
}
# Display names given to the aggregated columns.
meta_labels = {
    'full_text_processed': 'Tweet Count',
    'sentiment': 'Mean Sentiment',
    'followers_count': 'Followers',
}
per_user_stats = df.groupby('user_id').agg(meta_aggs)
df_meta = per_user_stats.rename(columns=meta_labels)

# Key the user vectors by user_id and attach the metadata columns to them.
vecs_by_user = df_user_vecs.set_index('user_id')
df_user_vecs = vecs_by_user.join(df_meta)

In [58]:
# Export the user-level frame to CSV in the working directory.
df_user_vecs.to_csv(path_or_buf='iwmi_user2vec.csv')

In [75]:
# Infer a vector for every row of df from its processed text and stack the
# results into a single 2-D array (one row per tweet).
tweet_vectors = np.array(df['full_text_processed'].apply(
    lambda doc: model.infer_vector(tokenize(doc))
).tolist())

# Name the columns after the real embedding width rather than a hard-coded 10,
# so the frame still builds if the model's vector_size is changed.
vec_cols = [f'vec_{i}' for i in range(tweet_vectors.shape[1])]
df_tweet_vecs = pd.DataFrame(tweet_vectors, columns=vec_cols, index=df['tweet_id'])

In [93]:
# Key the tweets by tweet_id, attach the per-tweet vector columns by index,
# and export the combined frame to CSV.
tweets_by_id = df.set_index('tweet_id')
df_tweets = tweets_by_id.join(df_tweet_vecs)
df_tweets.to_csv(path_or_buf='iwmi_tweet2vec.csv')

## Data Visualization

In [39]:
from bokeh.io import output_file, show
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure

# Send the rendered document to this HTML file when show() is called.
output_file("slider.html")

# Tab 1: scatter of users at their t-SNE coordinates.
# The per-user tweet count lives in the column named 'Tweet Count' (see the
# rename applied when df_meta is built), so it must be read with bracket
# indexing; attribute access as `.tweet_count` raises AttributeError.
p1 = figure(plot_width=300, plot_height=300)
p1.circle(
    df_user_vecs['tsne_0'],
    df_user_vecs['tsne_1'],
    size=20,
    color="navy",
    alpha=0.5,
    tags=df_user_vecs['Tweet Count'].tolist(),
)
tab1 = Panel(child=p1, title="circle")

# Tab 2: a static line plot over fixed sample points.
p2 = figure(plot_width=300, plot_height=300)
p2.line([1, 2, 3, 4, 5], [6, 7, 2, 4, 5], line_width=3, color="navy", alpha=0.5)
tab2 = Panel(child=p2, title="line")

# Combine both figures into one tabbed layout and display it.
tabs = Tabs(tabs=[tab1, tab2])

show(tabs)