In [2]:
import pandas as pd
import numpy as np

num_topics = 26
df = pd.read_csv(f"~/Desktop/R_js/data/beta/pro{num_topics}_beta.csv")

display(df.head())
print(df.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.6726730000000003e-208
1,2,1_hour_break,1.351e-189
2,3,1_hour_break,8.675901000000001e-213
3,4,1_hour_break,4.690151e-197
4,5,1_hour_break,6.8970809999999995e-192


(172666, 3)


## Computing topic word distribution

In [37]:
topic_word_distr = []
imp_words = []
terms = df[df.topic == 1].term.values.tolist()

num_topics = len(df.topic.unique())
for i in range(1, num_topics+1):
    topic = df[df.topic == i] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    
print(len(topic_word_distr))
print(len(imp_words))

26
26


# Dissimilarity Measure

In [59]:
from scipy.spatial import distance

dist = []
for i in range(num_topics): 
    t1 = topic_word_distr[i]
    d1 = []
    for j in range(num_topics):
        t2 = topic_word_distr[j]
        d = distance.jensenshannon(t1, t2)
        d1.append(d)
    dist.append(d1)

print(dist)

[[0.0, 0.7327352468637109, 0.8323044061986776, 0.8325536025441058, 0.8320827507687758, 0.832425727591373, 0.8323244187202242, 0.8324585372461176, 0.8324015859033278, 0.8324115818215652, 0.832126945946274, 0.7947087923094072, 0.8322685546904088, 0.8324007764191051, 0.832459053131275, 0.832215110952098, 0.8312343869853788, 0.8322495853025533, 0.8322695625200496, 0.7652270168465799, 0.8309379400221265, 0.8322630337534296, 0.8316667622368653, 0.8322503319712625, 0.7916171422942008, 0.6822959290933661], [0.7327352468637109, 0.0, 0.8325457041675713, 0.8325545329505012, 0.8324322119369919, 0.8325379391315565, 0.8324634048459001, 0.832554610405917, 0.8324869197171669, 0.83254839822017, 0.8325546111576978, 0.7855747549270169, 0.8325517701237807, 0.8323972009172426, 0.8325112977290315, 0.8325545205680647, 0.8324852042657643, 0.8324665327710699, 0.8325543054136106, 0.7408098302129645, 0.8313848086092511, 0.8325038397270004, 0.8324099106460785, 0.832417723072178, 0.780123770073262, 0.5891450563991

# Dimensionality Reduction

## TSNE

In [38]:
# Clustering
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
X_tsne = tsne.fit_transform(topic_word_distr)

[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.000s...
[t-SNE] Computed neighbors for 26 samples in 0.034s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.058292
[t-SNE] KL divergence after 1000 iterations: 0.439410


## PCA

In [60]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_tsne = pca.fit_transform(dist)

# Visualization

In [61]:
# Generate random colors

import random
get_colors = lambda n: list(map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF),range(n)))
colormap = np.asarray(get_colors(num_topics)) 
print(colormap)

['#2c3790' '#dd64b7' '#d6aa5c' '#06d0af' '#e97e72' '#e3d693' '#80ce05'
 '#e69a30' '#25d55f' '#66ff4d' '#34f806' '#d05631' '#b28e3c' '#0d30dd'
 '#964802' '#5d8787' '#fc671e' '#b5be3c' '#2e7ddf' '#cd27b6' '#bc6662'
 '#c4c8d2' '#a5d8b1' '#1bfff9' '#57aa32' '#062836']


In [62]:
df_cluster = pd.DataFrame()
df_cluster['X_tsne'] = X_tsne[:, 0]
df_cluster['Y_tsne'] = X_tsne[:, 1]
df_cluster['Topic'] = range(num_topics)
df_cluster['Topic_words'] = imp_words

cluster_colors = dict()
for c in range(num_topics): 
    cluster_colors[c] = colormap[c]
    
df_cluster['Colors'] = df_cluster['Topic'].apply(lambda l: cluster_colors[l])
df_cluster.sort_values(by=['Topic'], inplace=True)

display(df_cluster.head())

Unnamed: 0,X_tsne,Y_tsne,Topic,Topic_words,Colors
0,0.424407,-0.01997,0,"[salari, benefit, competit, salari_benefit, co...",#2c3790
1,0.587125,-0.013552,1,"[pay, benefit, benefit_pay, weekli, hard, comm...",#dd64b7
2,-0.116011,-0.054128,2,"[opportun, advanc, fast, pace, fast_pace, grow...",#d6aa5c
3,-0.11499,-0.061738,3,"[lot, event, perk, avail, activ, interact, res...",#06d0af
4,-0.132947,-0.230052,4,"[life, balanc, life_balanc, abl, develop, prof...",#e97e72


In [63]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [64]:
source = ColumnDataSource(dict(
    x=df_cluster['X_tsne'],
    y=df_cluster['Y_tsne'],
    color=df_cluster['Colors'],
    label=df_cluster['Topic'],
    topic_words = df_cluster["Topic_words"]
))

In [65]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=700,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',  legend='label', source=source,
                 color='color', alpha=0.8)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = { "Topic": "@label  <br> Topic_words: @topic_words "}
plot_lda.legend.location = "top_left"

show(plot_lda)



## Combine Pro and Cons model


In [67]:
import pandas as pd
import numpy as np

num_topics_pro = 26
df_pro = pd.read_csv(f"~/Desktop/R_js/data/beta/pro{num_topics_pro}_beta.csv")

num_topics_con = 24
df_con = pd.read_csv(f"~/Desktop/R_js/data/beta/con{num_topics_con}_beta.csv")

display(df_pro.head())
display(df_pro.shape)

display(df_con.head())
display(df_con.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.6726730000000003e-208
1,2,1_hour_break,1.351e-189
2,3,1_hour_break,8.675901000000001e-213
3,4,1_hour_break,4.690151e-197
4,5,1_hour_break,6.8970809999999995e-192


(172666, 3)

Unnamed: 0,topic,term,beta
0,1,’_care,8.624716999999999e-100
1,2,’_care,2.55527e-93
2,3,’_care,1.219269e-114
3,4,’_care,3.194975e-91
4,5,’_care,7.375773e-119


(154080, 3)

In [77]:
pro_terms = df_pro[df_pro.topic == 1]["term"].values.tolist()
con_terms = df_con[df_con.topic == 1]["term"].values.tolist()

pro_con = list(set(pro_terms) & set(con_terms))
print(len(pro_con))
print(pro_con)

2096
['morn_shift', 'employe', 'centric', 'ground', 'joke', 'emerg', 'paid_lunch', 'rep', 'motiv', 'fulltim', 'safeti', 'discount_employe', 'temp', 'discount_product', 'step', 'hope', 'equal', 'support', 'kroger', 'continu', 'accomod', 'agent', 'union_employe', 'health_care_401k', 'despit', 'morn', 'report', 'record', 'schedul', 'variou', 'locker', 'secur_advanc', 'success', 'solo', 'outstand', 'confid', 'demand', 'gym', 'train_program', 'expertis', 'bother', 'multi', 'guest', 'run', 'feel', 'extern', 'matur', 'upper_manag', 'peer', 'divis', 'car', 'benefit_temp', 'event', 'hous', 'ect', 'childcar', 'support_manag', 'clearli', 'quiet', 'benefit_health', 'ate', 'downtown', 'sometim', 'technic', '30_minut_lunch', 'elsewher', 'stay', 'shift_bid', 'effect', 'glass', 'benefit_offer', 'product_line', 'explain', 'broker', 'break_30', 'opportun_move', 'minim', 'credo', 'connect', 'final', 'pretti', 'dupont', 'fuel', 'workhom', 'dress', 'store_level', 'lunch', 'cost_studi', 'corp', 'pay_peopl',