
# Analysis to place individual model on spatial position

## Steps: 
### 1. Computing topic word distribution.
### 2. Dissimilarity Measure using Jensen Shanon/Cosine Similarity/Euclidean Distance
### 3. Dimensionality Reduction (PCA / t-SNE / MDS)
### 4. Visualization using bokeh plot


# Analysis to combine pro con model

## Steps: 
### 1. Combine pro con corpus
### 2. Dissimilarity Measure
### 3. Dimensionality Reduction
### 4. Visualization


In [1]:
import pandas as pd
import numpy as np

num_topics = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/pro"
df = pd.read_csv(data_path + f"/{num_topics}/beta.csv")

display(df.head())
print(df.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.333175e-24
1,2,1_hour_break,3.228877e-24
2,3,1_hour_break,6.48679e-12
3,4,1_hour_break,2.023997e-12
4,5,1_hour_break,3.937727e-08


(66410, 3)


## Computing topic word distribution

In [2]:
topic_word_distr = []
imp_words = []
terms = df[df.topic == 1].term.values.tolist()

num_topics = len(df.topic.unique())
for i in range(1, num_topics+1):
    topic = df[df.topic == i] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    
print(len(topic_word_distr))
print(len(imp_words))

10
10


# Dissimilarity Measure

In [3]:
from scipy.spatial import distance

dist = []
for i in range(num_topics): 
    t1 = topic_word_distr[i]
    d1 = []
    for j in range(num_topics):
        t2 = topic_word_distr[j]
        d = distance.jensenshannon(t1, t2)
        d1.append(d)
    dist.append(d1)

print(dist)

[[0.0, 0.0460335568329371, 0.8239785780015826, 0.8243718930161399, 0.8211542490569189, 0.8225457748989495, 0.7851704726560149, 0.82126436097393, 0.8065968339046721, 0.7845333327332118], [0.0460335568329371, 0.0, 0.8239937321342545, 0.8242772665116179, 0.8212731738765019, 0.8225717183828379, 0.7849308197066048, 0.8213019164955101, 0.8064309599207397, 0.7847485154928479], [0.8239785780015826, 0.8239937321342545, 0.0, 0.824060737580206, 0.8252000850367154, 0.81680593030572, 0.825419173454282, 0.8256744238046658, 0.8242911966073121, 0.813297188146814], [0.8243718930161399, 0.8242772665116179, 0.824060737580206, 0.0, 0.8048349275604464, 0.8236719663878131, 0.8242863663741836, 0.8186766604143078, 0.819474639460868, 0.8226448164030605], [0.8211542490569189, 0.8212731738765019, 0.8252000850367154, 0.8048349275604464, 0.0, 0.8269855744936826, 0.8201284708363028, 0.8219212236395543, 0.826349042288878, 0.8040374766410577], [0.8225457748989495, 0.8225717183828379, 0.81680593030572, 0.8236719663878

# Dimensionality Reduction

## TSNE

In [4]:
# Clustering
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
X_tsne = tsne.fit_transform(topic_word_distr)

[t-SNE] Computing 9 nearest neighbors...
[t-SNE] Indexed 10 samples in 0.000s...
[t-SNE] Computed neighbors for 10 samples in 0.046s...
[t-SNE] Computed conditional probabilities for sample 10 / 10
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 49.707256
[t-SNE] KL divergence after 1000 iterations: 0.243565


## PCA

In [5]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_tsne = pca.fit_transform(dist)

In [6]:
print(X_tsne)

[[ 0.91602631 -0.01770558]
 [ 0.91602499 -0.01755304]
 [-0.26421447 -0.36486093]
 [-0.27136738 -0.07220799]
 [-0.26142527 -0.06473843]
 [-0.25898445 -0.29162923]
 [-0.15186916  0.47587035]
 [-0.26001533  0.49390075]
 [-0.20838313 -0.04873366]
 [-0.15579211 -0.09234226]]


# Visualization

In [7]:
# Generate random colors

import random
get_colors = lambda n: list(map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF),range(n)))
colormap = np.asarray(get_colors(num_topics)) 
print(colormap)

['#b601d0' '#863c5a' '#bf6812' '#675458' '#7904fa' '#4985ee' '#a1f076'
 '#2b9e88' '#39109d' '#dc686d']


In [8]:
df_cluster = pd.DataFrame()
df_cluster['X_tsne'] = X_tsne[:, 0]
df_cluster['Y_tsne'] = X_tsne[:, 1]
df_cluster['Topic'] = range(num_topics)
df_cluster['Topic_words'] = imp_words

cluster_colors = dict()
for c in range(num_topics): 
    cluster_colors[c] = colormap[c]
    
df_cluster['Colors'] = df_cluster['Topic'].apply(lambda l: cluster_colors[l])
df_cluster.sort_values(by=['Topic'], inplace=True)

df_cluster['Topic'] = df_cluster['Topic']+1
display(df_cluster.head())

Unnamed: 0,X_tsne,Y_tsne,Topic,Topic_words,Colors
0,0.916026,-0.017706,1,"[free, employe, discount, schedul, day, decent...",#b601d0
1,0.916025,-0.017553,2,"[free, employe, discount, schedul, day, decent...",#863c5a
2,-0.264214,-0.364861,3,"[benefit, manag, lot, worker, decent_pay, easi...",#bf6812
3,-0.271367,-0.072208,4,"[pay_benefit, advanc, help, salari, worklif, c...",#675458
4,-0.261425,-0.064738,5,"[pay, hour, flexibl, cowork, balanc, cultur, f...",#7904fa


In [9]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [10]:
source = ColumnDataSource(dict(
    x=df_cluster['X_tsne'],
    y=df_cluster['Y_tsne'],
    color=df_cluster['Colors'],
    label=df_cluster['Topic'],
    topic_words = df_cluster["Topic_words"]
))

In [11]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=700,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',  legend='label', source=source,
                 color='color', alpha=0.8)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = { "Topic": "@label  <br> Topic_words: @topic_words "}
plot_lda.legend.location = "top_left"

show(plot_lda)

# Saving File

In [12]:
display(df_cluster.head())
display(df_cluster.shape)

Unnamed: 0,X_tsne,Y_tsne,Topic,Topic_words,Colors
0,0.916026,-0.017706,1,"[free, employe, discount, schedul, day, decent...",#b601d0
1,0.916025,-0.017553,2,"[free, employe, discount, schedul, day, decent...",#863c5a
2,-0.264214,-0.364861,3,"[benefit, manag, lot, worker, decent_pay, easi...",#bf6812
3,-0.271367,-0.072208,4,"[pay_benefit, advanc, help, salari, worklif, c...",#675458
4,-0.261425,-0.064738,5,"[pay, hour, flexibl, cowork, balanc, cultur, f...",#7904fa


(10, 5)

In [13]:
df.to_csv(f"~/Desktop/data_viz/con/{num_topics}/topic_center.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/bishalsainju/Desktop/data_viz/con/10/topic_center.csv'

## Combine Pro and Cons model


In [29]:
import pandas as pd
import numpy as np

# num_topics_pro = 8
num_topics_pro = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/pro"
df_pro = pd.read_csv(data_path + f"/{num_topics_pro}/beta.csv")

# num_topics_con = 10
num_topics_con = 10
data_path = "~/Desktop/DataViz/whisper/whisperVis/data/topicModeling/con"
df_con = pd.read_csv(data_path + f"/{num_topics_con}/beta.csv")

display(df_pro.head())
display(df_pro.shape)

display(df_con.head())
display(df_con.shape)

Unnamed: 0,topic,term,beta
0,1,1_hour_break,3.333175e-24
1,2,1_hour_break,3.228877e-24
2,3,1_hour_break,6.48679e-12
3,4,1_hour_break,2.023997e-12
4,5,1_hour_break,3.937727e-08


(66410, 3)

Unnamed: 0,topic,term,beta
0,1,’_care,3.717818e-128
1,2,’_care,7.615264e-147
2,3,’_care,7.927693e-160
3,4,’_care,3.476992e-149
4,5,’_care,0.0001030328


(64200, 3)

In [30]:
pro_terms = df_pro[df_pro.topic == 1]["term"].values.tolist()
con_terms = df_con[df_con.topic == 1]["term"].values.tolist()

pro_con = list(set(pro_terms) & set(con_terms))
print(len(pro_con))
print(pro_con)

2096
['cooper', 'break_30_minut', 'tie', 'flight', 'union_employe', 'staf', 'heavi', 'deli', 'huge', 'applic', 'mobil', 'parttim_hour', 'minut_lunch_break', 'stop', 'excit', 'caller', 'pay_worklif', 'monday', 'benefit_lunch', 'think', 'life_balanc_manag', 'approach', 'materi', 'pay_retail', 'comput', 'extrem', 'cell_phone', 'type', 'independ', 'weather', 'flex', 'bore', 'break_break', 'familiar', 'tuition', 'explain', 'fifteen', 'friendli', 'manag_pay', 'hour_peopl', 'dedic', 'break_15_minut', 'overcom', 'credit', 'person_vehicl', 'search', 'clearli', 'abl_help', 'asset', 'walk', 'hour_break', 'experienc', 'bonus', 'rank', 'ten', 'merchandis', 'vet', 'servic', 'night_shift', 'coverag', 'wasnt', 'law', 'health_plan', 'return', 'micromanag', 'solid', 'muy', 'administr', 'vacat_time', 'min_lunch_break', 'mistak', 'refer', 'chair', 'exposur', 'confin', 'advantag', 'employe_recognit', '1_hour_lunch', 'committe', 'make', 'compens_manag', 'theyr', 'pace_environ', 'vend_machin', 'veteran', 'di

In [31]:
just_pro = list(set(pro_terms) - set(con_terms))
just_con = list(set(con_terms) - set(pro_terms))


print(len(just_pro) + len(just_con) + 2*len(pro_con))
print(len(pro_terms) + len(con_terms))

13061
13061


In [32]:

n = len(just_con)
for i in range(1, num_topics_pro+1):
    topic = np.full(n, i)
    beta = np.full(n, 0)
    df_temp = pd.DataFrame()
    df_temp["topic"] = topic
    df_temp["term"] = just_con
    df_temp["beta"] = beta
    df_pro = pd.concat([df_pro, df_temp], ignore_index=True, sort=False)
    
n = len(just_pro)
for i in range(1, num_topics_con+1):
    topic = np.full(n, i)
    beta = np.full(n, 0)
    df_temp = pd.DataFrame()
    df_temp["topic"] = topic
    df_temp["term"] = just_pro
    df_temp["beta"] = beta
    df_con = pd.concat([df_con, df_temp], ignore_index=True, sort=False)
    
    
df_pro["procon"] = "pro"
df_pro["color"] = "#c67f1f"
df_con["procon"] = "con"
df_con["color"] = "#20e560"

display(df_pro.head())
display(df_con.head())

Unnamed: 0,topic,term,beta,procon,color
0,1,1_hour_break,3.333175e-24,pro,#c67f1f
1,2,1_hour_break,3.228877e-24,pro,#c67f1f
2,3,1_hour_break,6.48679e-12,pro,#c67f1f
3,4,1_hour_break,2.023997e-12,pro,#c67f1f
4,5,1_hour_break,3.937727e-08,pro,#c67f1f


Unnamed: 0,topic,term,beta,procon,color
0,1,’_care,3.717818e-128,con,#20e560
1,2,’_care,7.615264e-147,con,#20e560
2,3,’_care,7.927693e-160,con,#20e560
3,4,’_care,3.476992e-149,con,#20e560
4,5,’_care,0.0001030328,con,#20e560


In [33]:
df = pd.concat([df_pro, df_con], ignore_index=True, sort=False)


In [34]:
print(df_pro.term.nunique())
print(df_con.term.nunique())

t26 = df[(df.topic==26) & (df.procon=="pro")].term.nunique()
print(t26)

10965
10965
0


## Computing topic word distribution

In [35]:
topic_word_distr = []
imp_words = []
pro_con = []
color = []
terms = df[(df.topic == 1) & (df.procon == "pro")].term.values.tolist()


for i in range(1, num_topics_pro+1):
    topic = df[(df.topic == i) & (df.procon == "pro")] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    pro_con.append("pro")
    color.append("#05890D")

for i in range(1, num_topics_con+1):
    topic = df[(df.topic == i) & (df.procon == "con")] 
    vals = topic.beta.values.tolist()
    topic_word_distr += [np.asarray(vals)]
    words = [terms[i] for i in np.asarray(vals).argsort()[-10:][::-1].tolist()]
    imp_words.append(words)
    pro_con.append("con")
    color.append("#F42F04")

    
print(len(topic_word_distr))
print(len(imp_words))

20
20


# Dissimilarity Measure


In [36]:
from scipy.spatial import distance

dist = []
tot_topics = num_topics_pro + num_topics_con
for i in range(tot_topics): 
    t1 = topic_word_distr[i]
    d1 = []
    for j in range(tot_topics):
        t2 = topic_word_distr[j]
        d = distance.jensenshannon(t1, t2)
        d1.append(d)
    dist.append(d1)

print(dist)

[[0.0, 0.04603355683293713, 0.8239785780015827, 0.8243718930161399, 0.8211542490569189, 0.8225457748989495, 0.7851704726560149, 0.82126436097393, 0.8065968339046721, 0.7845333327332118, 0.8207700859887048, 0.8206312225428519, 0.8203771627528406, 0.8189575461674725, 0.8074110793734718, 0.8190244161488368, 0.8184920178864045, 0.824238356982029, 0.8233954262421781, 0.8200537009719729], [0.04603355683293713, 0.0, 0.8239937321342545, 0.8242772665116178, 0.8212731738765019, 0.8225717183828379, 0.7849308197066048, 0.8213019164955101, 0.8064309599207397, 0.7847485154928477, 0.8206870259058296, 0.8206973658257407, 0.8202303219116275, 0.8180126084186464, 0.8072206750966084, 0.8187262569039286, 0.8181920892597662, 0.8238761818004828, 0.8231618409535916, 0.8195933679312796], [0.8239785780015827, 0.8239937321342545, 0.0, 0.824060737580206, 0.8252000850367154, 0.8168059303057201, 0.825419173454282, 0.8256744238046659, 0.8242911966073121, 0.813297188146814, 0.8167547746121009, 0.8193202445034313, 0.8

# Dimensionality Reduction

## TSNE


In [37]:
# Clustering
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
X_tsne = tsne.fit_transform(topic_word_distr)

[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 20 samples in 0.000s...
[t-SNE] Computed neighbors for 20 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 20 / 20
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.528702
[t-SNE] KL divergence after 1000 iterations: 0.383639


## PCA

In [38]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_tsne = pca.fit_transform(topic_word_distr)
# X_tsne = pca.fit_transform(dist)

# Visualization

In [39]:
df_cluster = pd.DataFrame()
df_cluster['X_tsne'] = X_tsne[:, 0]
df_cluster['Y_tsne'] = X_tsne[:, 1]
df_cluster['Topic'] = range(1, tot_topics+1)
df_cluster['Topic_words'] = imp_words
df_cluster['pro_con'] = pro_con
df_cluster['color'] = color


In [40]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [41]:
source = ColumnDataSource(dict(
    x=df_cluster['X_tsne'],
    y=df_cluster['Y_tsne'],
    color=df_cluster['color'],
    label=df_cluster['Topic'],
    topic_words = df_cluster["Topic_words"],
    pro_con = df_cluster["pro_con"]
))

In [42]:
title = 'PCA visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=700,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',  legend='pro_con', source=source,
                 color='color', alpha=0.8, radius=.01, fill_alpha=.3)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = { "Topic": "@label (@pro_con)  <br> Topic_words: @topic_words "}
plot_lda.legend.location = "top_left"

show(plot_lda)