In [1]:
import pandas as pd
import numpy as np

num_topics = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/pro"
df = pd.read_csv(data_path + f"/{num_topics}/beta.csv")

display(df.head())
print(df.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.333175e-24
1,2,1_hour_break,3.228877e-24
2,3,1_hour_break,6.48679e-12
3,4,1_hour_break,2.023997e-12
4,5,1_hour_break,3.937727e-08


(66410, 3)


## Computing topic word distribution

In [2]:
topic_word_distr = []
imp_words = []
terms = df[df.topic == 1].term.values.tolist()

num_topics = len(df.topic.unique())
for i in range(1, num_topics+1):
    topic = df[df.topic == i] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    
print(len(topic_word_distr))
print(len(imp_words))

10
10


# Dissimilarity Measure

In [3]:
from scipy.spatial import distance

dist = []
for i in range(num_topics): 
    t1 = topic_word_distr[i]
    d1 = []
    for j in range(num_topics):
        t2 = topic_word_distr[j]
        d = distance.jensenshannon(t1, t2)
        d1.append(d)
    dist.append(d1)

print(dist)

[[0.0, 0.0460335568329371, 0.8239785780015826, 0.8243718930161399, 0.8211542490569189, 0.8225457748989495, 0.7851704726560149, 0.82126436097393, 0.8065968339046721, 0.7845333327332118], [0.0460335568329371, 0.0, 0.8239937321342545, 0.8242772665116179, 0.8212731738765019, 0.8225717183828379, 0.7849308197066048, 0.8213019164955101, 0.8064309599207397, 0.7847485154928479], [0.8239785780015826, 0.8239937321342545, 0.0, 0.824060737580206, 0.8252000850367154, 0.81680593030572, 0.825419173454282, 0.8256744238046658, 0.8242911966073121, 0.813297188146814], [0.8243718930161399, 0.8242772665116179, 0.824060737580206, 0.0, 0.8048349275604464, 0.8236719663878131, 0.8242863663741836, 0.8186766604143078, 0.819474639460868, 0.8226448164030605], [0.8211542490569189, 0.8212731738765019, 0.8252000850367154, 0.8048349275604464, 0.0, 0.8269855744936826, 0.8201284708363028, 0.8219212236395543, 0.826349042288878, 0.8040374766410577], [0.8225457748989495, 0.8225717183828379, 0.81680593030572, 0.8236719663878

# Dimensionality Reduction

## TSNE

In [4]:
# Clustering
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
X_tsne = tsne.fit_transform(topic_word_distr)

[t-SNE] Computing 9 nearest neighbors...
[t-SNE] Indexed 10 samples in 0.000s...
[t-SNE] Computed neighbors for 10 samples in 0.046s...
[t-SNE] Computed conditional probabilities for sample 10 / 10
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 49.707256
[t-SNE] KL divergence after 1000 iterations: 0.243565


## PCA

In [5]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_tsne = pca.fit_transform(dist)

In [6]:
print(X_tsne)

[[ 0.91602631 -0.01770558]
 [ 0.91602499 -0.01755304]
 [-0.26421447 -0.36486093]
 [-0.27136738 -0.07220799]
 [-0.26142527 -0.06473843]
 [-0.25898445 -0.29162923]
 [-0.15186916  0.47587035]
 [-0.26001533  0.49390075]
 [-0.20838313 -0.04873366]
 [-0.15579211 -0.09234226]]


# Visualization

In [7]:
# Generate random colors

import random
get_colors = lambda n: list(map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF),range(n)))
colormap = np.asarray(get_colors(num_topics)) 
print(colormap)

['#b601d0' '#863c5a' '#bf6812' '#675458' '#7904fa' '#4985ee' '#a1f076'
 '#2b9e88' '#39109d' '#dc686d']


In [8]:
df_cluster = pd.DataFrame()
df_cluster['X_tsne'] = X_tsne[:, 0]
df_cluster['Y_tsne'] = X_tsne[:, 1]
df_cluster['Topic'] = range(num_topics)
df_cluster['Topic_words'] = imp_words

cluster_colors = dict()
for c in range(num_topics): 
    cluster_colors[c] = colormap[c]
    
df_cluster['Colors'] = df_cluster['Topic'].apply(lambda l: cluster_colors[l])
df_cluster.sort_values(by=['Topic'], inplace=True)

df_cluster['Topic'] = df_cluster['Topic']+1
display(df_cluster.head())

Unnamed: 0,X_tsne,Y_tsne,Topic,Topic_words,Colors
0,0.916026,-0.017706,1,"[free, employe, discount, schedul, day, decent...",#b601d0
1,0.916025,-0.017553,2,"[free, employe, discount, schedul, day, decent...",#863c5a
2,-0.264214,-0.364861,3,"[benefit, manag, lot, worker, decent_pay, easi...",#bf6812
3,-0.271367,-0.072208,4,"[pay_benefit, advanc, help, salari, worklif, c...",#675458
4,-0.261425,-0.064738,5,"[pay, hour, flexibl, cowork, balanc, cultur, f...",#7904fa


In [9]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [10]:
source = ColumnDataSource(dict(
    x=df_cluster['X_tsne'],
    y=df_cluster['Y_tsne'],
    color=df_cluster['Colors'],
    label=df_cluster['Topic'],
    topic_words = df_cluster["Topic_words"]
))

In [11]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=700,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',  legend='label', source=source,
                 color='color', alpha=0.8)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = { "Topic": "@label  <br> Topic_words: @topic_words "}
plot_lda.legend.location = "top_left"

show(plot_lda)

# Saving File

In [12]:
display(df_cluster.head())
display(df_cluster.shape)

Unnamed: 0,X_tsne,Y_tsne,Topic,Topic_words,Colors
0,0.916026,-0.017706,1,"[free, employe, discount, schedul, day, decent...",#b601d0
1,0.916025,-0.017553,2,"[free, employe, discount, schedul, day, decent...",#863c5a
2,-0.264214,-0.364861,3,"[benefit, manag, lot, worker, decent_pay, easi...",#bf6812
3,-0.271367,-0.072208,4,"[pay_benefit, advanc, help, salari, worklif, c...",#675458
4,-0.261425,-0.064738,5,"[pay, hour, flexibl, cowork, balanc, cultur, f...",#7904fa


(10, 5)

In [13]:
df.to_csv(f"~/Desktop/data_viz/con/{num_topics}/topic_center.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/bishalsainju/Desktop/data_viz/con/10/topic_center.csv'

## Combine Pro and Cons model


In [14]:
import pandas as pd
import numpy as np

# num_topics_pro = 8
num_topics_pro = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/pro"
df_pro = pd.read_csv(f"~/Desktop/R_js/data/beta/pro{num_topics_pro}_beta.csv")

# num_topics_con = 10
num_topics_con = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/pro"
df_con = pd.read_csv(f"~/Desktop/R_js/data/beta/con{num_topics_con}_beta.csv")

display(df_pro.head())
display(df_pro.shape)

display(df_con.head())
display(df_con.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.6726730000000003e-208
1,2,1_hour_break,1.351e-189
2,3,1_hour_break,8.675901000000001e-213
3,4,1_hour_break,4.690151e-197
4,5,1_hour_break,6.8970809999999995e-192


(172666, 3)

Unnamed: 0,topic,term,beta
0,1,’_care,8.624716999999999e-100
1,2,’_care,2.55527e-93
2,3,’_care,1.219269e-114
3,4,’_care,3.194975e-91
4,5,’_care,7.375773e-119


(154080, 3)

In [148]:
pro_terms = df_pro[df_pro.topic == 1]["term"].values.tolist()
con_terms = df_con[df_con.topic == 1]["term"].values.tolist()

pro_con = list(set(pro_terms) & set(con_terms))
print(len(pro_con))
print(pro_con)

2096
['principl', 'time_famili', 'overtim_hour', 'payout', 'secur_pay', 'advanc_peopl', 'individu', 'sector', 'redcard', 'that', 'client', 'store_level', 'train_class', 'pay_competit', 'medic_insur', 'opportun_career', 'holiday_vacat', 'ten', 'treat_employe', 'equip', 'capabl', 'reinforc', 'think', 'child', 'competit_salari', 'phone', 'pay_structur', 'oil', 'perk', 'daili', 'hard', 'promo', 'shortag', 'bank_center', 'disabl', 'schedul_worklif_balanc', 'fed', 'corner', 'entri', 'son', 'won', 'loyalti', 'dont', 'polici', 'mac', 'allow', 'heath', 'energi', 'commiss_structur', 'pay_health', 'retir', 'firm', 'consist_schedul', 'loan', 'temp', 'risk', 'extend', 'talent', 'parttim', 'test', 'vacat_time', 'mark', 'advanc', 'workload', 'travel', 'complex', 'hour_employe', 'fork', 'fedex', 'certain', 'pharmacist', 'fargo', 'purpos', 'senior', 'suit', 'leav', 'past', 'badg', 'lend', 'manag_hour', 'leader', 'holiday_weekend', 'hub', 'pay_schedul', 'manag_life_balanc', 'embrac', 'hit', 'talk_peopl'

In [149]:
just_pro = list(set(pro_terms) - set(con_terms))
just_con = list(set(con_terms) - set(pro_terms))


print(len(just_pro) + len(just_con) + 2*len(pro_con))
print(len(pro_terms) + len(con_terms))

13061
13061


In [150]:

n = len(just_con)
for i in range(1, num_topics_pro+1):
    topic = np.full(n, i)
    beta = np.full(n, 0)
    df_temp = pd.DataFrame()
    df_temp["topic"] = topic
    df_temp["term"] = just_con
    df_temp["beta"] = beta
    df_pro = pd.concat([df_pro, df_temp], ignore_index=True, sort=False)
    
n = len(just_pro)
for i in range(1, num_topics_con+1):
    topic = np.full(n, i)
    beta = np.full(n, 0)
    df_temp = pd.DataFrame()
    df_temp["topic"] = topic
    df_temp["term"] = just_pro
    df_temp["beta"] = beta
    df_con = pd.concat([df_con, df_temp], ignore_index=True, sort=False)
    
    
df_pro["procon"] = "pro"
df_pro["color"] = "#c67f1f"
df_con["procon"] = "con"
df_con["color"] = "#20e560"

display(df_pro.head())
display(df_con.head())

Unnamed: 0,topic,term,beta,procon,color
0,1,1_hour_break,3.6726730000000003e-208,pro,#c67f1f
1,2,1_hour_break,1.351e-189,pro,#c67f1f
2,3,1_hour_break,8.675901000000001e-213,pro,#c67f1f
3,4,1_hour_break,4.690151e-197,pro,#c67f1f
4,5,1_hour_break,6.8970809999999995e-192,pro,#c67f1f


Unnamed: 0,topic,term,beta,procon,color
0,1,’_care,8.624716999999999e-100,con,#20e560
1,2,’_care,2.55527e-93,con,#20e560
2,3,’_care,1.219269e-114,con,#20e560
3,4,’_care,3.194975e-91,con,#20e560
4,5,’_care,7.375773e-119,con,#20e560


In [151]:
df = pd.concat([df_pro, df_con], ignore_index=True, sort=False)


In [152]:
print(df_pro.term.nunique())
print(df_con.term.nunique())

t26 = df[(df.topic==26) & (df.procon=="pro")].term.nunique()
print(t26)

10965
10965
10965


## Computing topic word distribution

In [153]:
topic_word_distr = []
imp_words = []
pro_con = []
color = []
terms = df[(df.topic == 1) & (df.procon == "pro")].term.values.tolist()


for i in range(1, num_topics_pro+1):
    topic = df[(df.topic == i) & (df.procon == "pro")] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    pro_con.append("pro")
    color.append("#05890D")

for i in range(1, num_topics_con+1):
    topic = df[(df.topic == i) & (df.procon == "con")] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    pro_con.append("con")
    color.append("#F42F04")

    
print(len(topic_word_distr))
print(len(imp_words))

50
50


# Dissimilarity Measure


In [166]:
from scipy.spatial import distance

dist = []
tot_topics = num_topics_pro + num_topics_con
for i in range(tot_topics): 
    t1 = topic_word_distr[i]
    d1 = []
    for j in range(tot_topics):
        t2 = topic_word_distr[j]
        d = distance.jensenshannon(t1, t2)
        d1.append(d)
    dist.append(d1)

print(dist)

[[0.0, 0.7327352468637108, 0.8323044061986776, 0.8325536025441058, 0.8320827507687758, 0.832425727591373, 0.8323244187202242, 0.8324585372461177, 0.8324015859033278, 0.8324115818215652, 0.832126945946274, 0.7947087923094072, 0.8322685546904088, 0.832400776419105, 0.8324590531312749, 0.832215110952098, 0.8312343869853788, 0.8322495853025533, 0.8322695625200496, 0.7652270168465799, 0.8309379400221265, 0.8322630337534296, 0.8316667622368653, 0.8322503319712624, 0.7916171422942008, 0.6822959290933661, 0.8257748669624936, 0.824558518635462, 0.8285240094273204, 0.8295108269922667, 0.8311582207959608, 0.8300008054932917, 0.8297389670514661, 0.8268398234549498, 0.8288914710677447, 0.8259728560960965, 0.8249516332297623, 0.8316413732902515, 0.8288973878811765, 0.8316436496461522, 0.8306025815288794, 0.829271952607187, 0.8309780316068648, 0.8144107925876231, 0.8239915217410422, 0.8221338798007191, 0.8260949834234321, 0.830278902586446, 0.8264951627647712, 0.8065218729853648], [0.7327352468637108

# Dimensionality Reduction

## TSNE


In [110]:
# Clustering
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
X_tsne = tsne.fit_transform(topic_word_distr)

[t-SNE] Computing 49 nearest neighbors...
[t-SNE] Indexed 50 samples in 0.000s...
[t-SNE] Computed neighbors for 50 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 50 / 50
[t-SNE] Mean sigma: 0.161373
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.962502
[t-SNE] KL divergence after 900 iterations: 0.501462


## PCA

In [167]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_tsne = pca.fit_transform(topic_word_distr)
# X_tsne = pca.fit_transform(dist)

# Visualization

In [168]:
df_cluster = pd.DataFrame()
df_cluster['X_tsne'] = X_tsne[:, 0]
df_cluster['Y_tsne'] = X_tsne[:, 1]
df_cluster['Topic'] = range(1, tot_topics+1)
df_cluster['Topic_words'] = imp_words
df_cluster['pro_con'] = pro_con
df_cluster['color'] = color


In [169]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [170]:
source = ColumnDataSource(dict(
    x=df_cluster['X_tsne'],
    y=df_cluster['Y_tsne'],
    color=df_cluster['color'],
    label=df_cluster['Topic'],
    topic_words = df_cluster["Topic_words"],
    pro_con = df_cluster["pro_con"]
))

In [3]:
title = 'PCA visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=700,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',  legend='pro_con', source=source,
                 color='color', alpha=0.8, radius=.01, fill_alpha=.3)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = { "Topic": "@label (@pro_con)  <br> Topic_words: @topic_words "}
plot_lda.legend.location = "top_left"

show(plot_lda)

NameError: name 'figure' is not defined