In [36]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import pandas as pd 
import logging, time, sys, argparse, re, gensim, math, faulthandler
import pickle as pkl
import numpy as np
from os.path import exists
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [2]:
def train_phrases_model(context_windows=None, model_path=None,
                        vector_size=20, window=4, min_count=1, epochs=100):
    """Train a Doc2Vec model on every context window and save it to disk.

    Parameters
    ----------
    context_windows : dict
        Mapping whose values are lists of context windows (each window being
        a list of tokens). The keys are not used for training.
    model_path : str
        Destination passed to ``Doc2Vec.save``.
    vector_size, window, min_count, epochs : int
        Forwarded unchanged to the ``Doc2Vec`` constructor.

    Returns
    -------
    The trained ``Doc2Vec`` model.

    Raises
    ------
    ValueError
        If ``context_windows`` or ``model_path`` is not supplied.
    """
    # This is a free function, so its inputs have to be passed in explicitly;
    # there is no instance to read them from.
    if context_windows is None or model_path is None:
        raise ValueError(
            "train_phrases_model requires both 'context_windows' and 'model_path'"
        )

    start_time = time.time()
    # Flatten the per-key lists into one list. A comprehension is linear,
    # whereas sum(list_of_lists, []) copies the accumulator on every step.
    all_windows = [
        context_window
        for windows in context_windows.values()
        for context_window in windows
    ]
    # Each window is tagged with its position in the flattened list.
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(all_windows)]
    model = Doc2Vec(
        tagged_data,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
    )
    model.save(model_path)
    end_time = time.time()
    # Elapsed training time in seconds.
    logging.info(end_time - start_time)
    return model

def read_file(input_path):
    """Unpickle and return the object stored at ``input_path``.

    The path is logged at INFO level, then the file is opened in binary mode
    and handed to ``pickle.load``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    that come from a trusted source.
    """
    logging.info('Opened: ' + input_path)
    with open(input_path, "rb") as handle:
        return pkl.load(handle)
        
def get_word_vectors(model, phrases_df):
    """Look up one document vector per row of ``phrases_df``.

    Every index label of ``phrases_df`` is used as a key into ``model.dv``.
    Labels that raise ``KeyError`` are logged, counted and skipped.

    Parameters
    ----------
    model : object exposing a ``dv`` mapping (e.g. a loaded Doc2Vec model)
    phrases_df : pandas.DataFrame
        Only its index is read.

    Returns
    -------
    list
        The vectors that were found, in row order. The list is shorter than
        ``len(phrases_df)`` when some labels were missing.
    """
    dv = model.dv
    vectors = []
    not_found = 0
    # Only the index label is needed, so iterate the index directly instead of
    # materialising a Series per row with iterrows().
    for idx in phrases_df.index:
        try:
            vectors.append(dv[idx])
        except KeyError:
            # Log the label that was actually looked up, as a lazy %-argument.
            logging.info("%s not found", idx)
            not_found += 1

    logging.info("n: " + str(len(vectors)))
    logging.info("not_found: " + str(not_found))

    return vectors

    
def get_similarity_matrix(vectors):
    """Return the pairwise cosine-distance matrix of ``vectors``.

    Despite the function name, the result holds distances, not similarities:
    entry ``[i, j]`` is ``abs(0.5 * (1 - cos(v_i, v_j)))``, where ``cos`` is
    the cosine similarity (dot product divided by the product of the norms).
    Identical directions give 0, opposite directions give 1.

    Parameters
    ----------
    vectors : sequence of equal-length 1-D numeric vectors

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n, n)`` float64 array with a zero diagonal. An empty
        input yields a ``(0, 0)`` array.

    Notes
    -----
    Inputs are converted to float64 before the computation. A zero-length
    vector has no direction, so its off-diagonal entries are NaN.
    """
    n = len(vectors)
    if n == 0:
        return np.zeros((0, 0))

    matrix = np.asarray(vectors, dtype=float)
    # Each norm is computed once instead of once per pair.
    norms = np.linalg.norm(matrix, axis=1)
    cosine = (matrix @ matrix.T) / np.outer(norms, norms)
    # Map cosine similarity from [-1, 1] to a distance in [0, 1]; abs() absorbs
    # tiny negative values caused by rounding when cosine slightly exceeds 1.
    distances = np.abs(0.5 * (1.0 - cosine))
    # A vector is at distance 0 from itself by definition.
    np.fill_diagonal(distances, 0.0)

    return distances

def run_pca(distances, k):
    """Reduce ``distances`` to ``k`` principal components.

    The input is wrapped in a DataFrame, rescaled with ``StandardScaler`` and
    passed to ``PCA(n_components=k).fit_transform``. The summed explained
    variance ratio, as a percentage, is logged at INFO level.

    Returns the array produced by ``fit_transform``.
    """
    features = pd.DataFrame(distances).values
    scaled = StandardScaler().fit_transform(features)

    reducer = PCA(n_components=k)
    projected = reducer.fit_transform(scaled)

    explained_pct = reducer.explained_variance_ratio_.sum() * 100
    logging.info("Total variance explained:" + str(explained_pct))
    return projected

In [71]:
# Load the previously saved Doc2Vec model and the pickled context windows.
# context_windows is iterated below with .items(), i.e. it is a mapping from
# modality to a list of phrases.
model = Doc2Vec.load('../data/full_corpus/phrase_embedding_model.pickle')
context_windows = read_file('../data/full_corpus/context_windows.pickle')

In [72]:
# Build one row per context window:
#   modality, descriptor (the phrase as a tuple), how often that exact phrase
#   occurs within the modality, and the number of tokens in the phrase.
# Rows are collected in a plain list and the frame is built once at the end;
# growing a DataFrame inside the loop is quadratic, and DataFrame.append is
# no longer available in pandas >= 2.0.
phrase_columns = ['modality', 'descriptor', 'freq_in_modality', 'context_window_length']
phrase_rows = []
for modality, phrases in context_windows.items():
    # Tuples are hashable, so they can be counted and later de-duplicated.
    descriptors = [tuple(phrase) for phrase in phrases]
    phrase_count = {}
    for descriptor in descriptors:
        phrase_count[descriptor] = phrase_count.get(descriptor, 0) + 1
    phrase_rows.extend(
        [modality, descriptor, phrase_count[descriptor], len(descriptor)]
        for descriptor in descriptors
    )
# The default RangeIndex numbers the rows 0..N-1 across all modalities, in
# the iteration order of context_windows.
phrases_df = pd.DataFrame(phrase_rows, columns=phrase_columns)
phrases_df.head()

Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
0,sight,"((sir, NOUN),)",456,1
1,sight,"((hands, NOUN),)",362,1
2,sight,"((shading, VERB), (vivid, ADJ), (sunlight, NOUN))",4,3
3,sight,"((bodily, ADV), (burst, VERB))",4,2
4,sight,"((suddenly, ADV), (blue, ADJ), (plain, ADJ))",4,3


In [73]:
# nlargest needs a numeric column, so make sure the frequency is an integer.
phrases_df['freq_in_modality'] = phrases_df['freq_in_modality'].astype(int)

# For every modality keep the 20 most frequent distinct phrases. The original
# row labels are preserved so they can still be used to look up vectors.
frames = []
for modality in context_windows:
    in_modality = phrases_df['modality'] == modality
    subset = (
        phrases_df.loc[in_modality]
        .drop_duplicates()
        .nlargest(20, 'freq_in_modality')
    )
    display(subset)
    frames.append(subset)

top_phrases_df = pd.concat(frames)
top_phrases_df.head()

Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
155,sight,"((said, VERB),)",4509,1
452,sight,"((face, NOUN),)",2909,1
765,sight,"((come, VERB),)",2147,1
172,sight,"((eyes, NOUN),)",2141,1
139,sight,"((let, VERB),)",2132,1
2210,sight,"((man, NOUN),)",1883,1
189,sight,"((shall, VERB),)",1772,1
264,sight,"((time, NOUN),)",1593,1
422,sight,"((saw, VERB),)",1514,1
625,sight,"((glad, ADJ),)",1355,1


Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
785407,hear,"((said, VERB),)",4969,1
785274,hear,"((know, VERB),)",1727,1
786005,hear,"((voice, NOUN),)",1566,1
785353,hear,"((heard, VERB),)",1130,1
785495,hear,"((low, ADJ),)",1041,1
785641,hear,"((moment, NOUN),)",1004,1
785339,hear,"((tell, VERB),)",980,1
785327,hear,"((words, NOUN),)",943,1
786385,hear,"((time, NOUN),)",874,1
785315,hear,"((sound, NOUN),)",828,1


Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
1196739,touch,"((hand, NOUN),)",1413,1
1196050,touch,"((sure, ADJ),)",805,1
1195916,touch,"((way, NOUN),)",753,1
1195969,touch,"((time, NOUN),)",597,1
1196006,touch,"((said, VERB),)",589,1
1195973,touch,"((know, VERB),)",527,1
1196053,touch,"((hands, NOUN),)",522,1
1196905,touch,"((man, NOUN),)",439,1
1197078,touch,"((lips, NOUN),)",435,1
1196150,touch,"((heart, NOUN),)",430,1


Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
1488567,taste,"((said, VERB),)",737,1
1489155,taste,"((good, ADJ),)",217,1
1489466,taste,"((little, ADJ),)",187,1
1491547,taste,"((drink, VERB),)",151,1
1488846,taste,"((dinner, NOUN),)",140,1
1492034,taste,"((day, NOUN),)",123,1
1488560,taste,"((food, NOUN),)",117,1
1489864,taste,"((bread, NOUN),)",94,1
1488519,taste,"((eat, VERB),)",93,1
1489213,taste,"((smile, NOUN),)",93,1


Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
1574877,smell,"((air, NOUN),)",148,1
1574765,smell,"((pipe, NOUN),)",123,1
1574920,smell,"((freely, ADV),)",114,1
1575016,smell,"((room, NOUN),)",108,1
1574700,smell,"((said, VERB),)",98,1
1575168,smell,"((word, NOUN),)",83,1
1574918,smell,"((cigar, NOUN),)",79,1
1574737,smell,"((cigarette, NOUN),)",71,1
1574833,smell,"((fire, NOUN),)",62,1
1575550,smell,"((hardly, ADV),)",52,1


Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
155,sight,"((said, VERB),)",4509,1
452,sight,"((face, NOUN),)",2909,1
765,sight,"((come, VERB),)",2147,1
172,sight,"((eyes, NOUN),)",2141,1
139,sight,"((let, VERB),)",2132,1


In [74]:
# Fetch one document vector per selected phrase; the row labels of
# top_phrases_df are used as the lookup keys.
phrase_vectors = get_word_vectors(model, top_phrases_df)

In [75]:
# Pairwise matrix between all phrase vectors (n x n).
similarity_matrix = get_similarity_matrix(phrase_vectors)

100%|██████████| 100/100 [00:00<00:00, 417.67it/s]


In [76]:
# Project the pairwise matrix onto its first two principal components.
pca = run_pca(similarity_matrix, 2)

In [77]:
# Wrap the 2-D projection in a frame with named component columns.
principalDf = pd.DataFrame(
    pca,
    columns=['principal component 1', 'principal component 2'],
)

In [78]:
# Sanity check: one row per phrase vector, two component columns.
principalDf.shape

(100, 2)

In [79]:
# Renumber the phrases 0..n-1 so they line up with principalDf's default index.
# NOTE: this overwrites top_phrases_df and discards the original row labels,
# which get_word_vectors uses as lookup keys. Re-running the vector lookup
# cell after this one would therefore fetch different vectors.
top_phrases_df = top_phrases_df.reset_index(drop=True)
display(top_phrases_df)
# concat with axis=1 aligns on the index. This assumes principalDf has exactly
# one row per phrase, i.e. get_word_vectors skipped nothing.
finalDf = pd.concat([principalDf, top_phrases_df[['descriptor', 'modality', 'freq_in_modality']]], axis = 1)
finalDf.head()

Unnamed: 0,modality,descriptor,freq_in_modality,context_window_length
0,sight,"((said, VERB),)",4509,1
1,sight,"((face, NOUN),)",2909,1
2,sight,"((come, VERB),)",2147,1
3,sight,"((eyes, NOUN),)",2141,1
4,sight,"((let, VERB),)",2132,1
...,...,...,...,...
95,smell,"((hear, VERB),)",42,1
96,smell,"((little, ADJ),)",42,1
97,smell,"((came, VERB),)",40,1
98,smell,"((turned, VERB),)",40,1


Unnamed: 0,principal component 1,principal component 2,descriptor,modality,freq_in_modality
0,-5.27794,-0.439259,"((said, VERB),)",sight,4509
1,-9.878849,-0.511935,"((face, NOUN),)",sight,2909
2,6.699184,-2.6035,"((come, VERB),)",sight,2147
3,-2.978911,-7.329702,"((eyes, NOUN),)",sight,2141
4,6.471551,-0.368275,"((let, VERB),)",sight,2132


In [80]:
# Every column except the last two.
finalDf.columns[:-2]

Index(['principal component 1', 'principal component 2', 'descriptor'], dtype='object')

In [81]:
# Scatter the two principal components, one point per phrase, coloured by
# modality with an explicit colour for each one. All columns are exposed in
# the hover tooltip and as custom data.
fig = px.scatter(
    finalDf,
    x=finalDf.columns[0],
    y=finalDf.columns[1],
    color=finalDf["modality"],
    color_discrete_map={
        'sight': '#1f77b4',
        'hear': '#2ca02c',
        'taste': '#d62728',
        'smell': '#ff7f0e',
        'touch': '#9467bd',
    },
    hover_data=finalDf.columns[:],
    custom_data=finalDf.columns[:],
    title="Phrase2Vec PCA - Top 100 Descriptors By Freq",
)
fig.update_traces(marker={'size': 10})
# Save a standalone interactive copy, then render inline.
fig.write_html("phrase2vec_top_100_freq.html")
fig.show()