In [1]:
import time, logging, sqlite3, sys, re
import pandas as pd 
import numpy as np 
import pickle as pkl
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.decomposition import PCA

In [2]:
create_table = """CREATE TABLE CONTEXT_WINDOWS (
               SENSE_NAME VARCHAR(10) NOT NULL,
               SEED_WORD VARCHAR(10) NOT NULL,
               CONTEXT_WINDOW VARCHAR(50) NOT NULL,
               SENTENCE TEXT NOT NULL
               );"""

# data = cursor.execute(create_table)
# conn.commit()

In [3]:
def convert(string):
    return string.replace("), (", "***").split("***")

def convert2(lst):
    return [tuple(map(replaceQuotes, itm.strip("()").split(","))) for itm in lst]

def replaceQuotes(string):
    return string.replace("'", "").strip()

def get_word_vectors(model, phrases_df):
        vectors = []
        dv = model.dv
        not_found = 0
        for idx, row in phrases_df.iterrows():
            try:
                vectors.append(dv[idx])
            except KeyError:
                print(idx, 'not found')
                not_found += 1
                continue

        n = len(vectors)
        print("n: " + str(n))
        print("not_found: " + str(not_found))

        return vectors   

    
def get_similarity_matrix(vectors):
    n = len(vectors)
    distances = np.zeros((len(vectors[:n]), len(vectors[:n])))

    for idx1, vec1 in enumerate(tqdm(vectors[:n])):
        for idx2, vec2 in enumerate(vectors[idx1:n]):
            if idx2 == 0:
                distances[idx1][idx1] = 0
                continue
            # calculate the pearson correlation 
            p = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
            # calculate the distance between vectors 
            dist = abs(0.5 * (1 - p))

            distances[idx1][idx1 + idx2] = dist
            distances[idx1 + idx2][idx1] = dist

    return distances

def run_pca(distances, k):
    dist_matrix = pd.DataFrame(distances)
    x = dist_matrix.loc[:, ].values
    x = StandardScaler().fit_transform(x)
    reduction_model = PCA(n_components=k)
    pca = reduction_model.fit_transform(x)
    total_var = reduction_model.explained_variance_ratio_.sum() * 100
    logging.info("Total variance explained:" + str(total_var))
    return pca

class Corpus: 
    
    def __init__(self):
        self.corpus_df = pd.read_csv('../data/english_fiction_metadata.csv')
        self.context_windows_dict = '../data/full_corpus/context_windows_dict.pickle'
        self.context_windows_dict = {'sense_name': [], 'seed_word': [] , 'context_window': [], 'sentence': []}
        self.model_path = '../data/full_corpus/phrase_embedding_model_noun_adj_context_window_size_4.pickle'
        
        
    def train_phrases_model(self, ):
        start_time = time.time()
        data_set = self.read_file("../data/full_corpus/phrase_noun_adj_context_window_size_4.pickle")
        tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(data_set['test'].tolist())]
        model = Doc2Vec(tagged_data, vector_size = 20, window = 4, min_count = 1, epochs = 100)
        model.save(self.model_path)
        end_time = time.time()
        logging.info(end_time - start_time) 
    
    def read_file(self, input_path):
            logging.info('Opened: ' + input_path)
            with open(input_path, "rb") as f:
                input_obj = pkl.load(f)
                return input_obj     
            
    def save_file(self, output_obj, output_path):
        with open(output_path, 'wb') as f:
            pkl.dump(output_obj, f)
        logging.info('Saved to: ' + output_path)
            
    def save_context_windows(self,):
        vfunct = np.vectorize(self.insert_context_window_into_db)
        vfunct(self.corpus_df.cw_df_path)   
        
    def insert_context_window_into_db(self, cw_df_path):
        start_time = time.time()
        cw_df = self.read_file(cw_df_path)
        for idx, row in cw_df.iterrows():
            if row['context_window']:
                insert = """INSERT INTO CONTEXT_WINDOWS
                            (SENSE_NAME, SEED_WORD, CONTEXT_WINDOW, SENTENCE) 
                            VALUES (?, ?, ?, ?);"""
                data_tuple = (row['sense_name'], str(row['seed_word']), str(row['context_window']), str(row['sentence']))
                cursor.execute(insert, data_tuple)
                conn.commit()
        end_time = time.time()
        logging.info(end_time - start_time)
        
        
    def extract_noun_adjective_phrases(self,):
        sql_query = pd.read_sql_query("""SELECT * from CONTEXT_WINDOWS
                                 WHERE CONTEXT_WINDOW LIKE '%NOUN%'
                                 AND CONTEXT_WINDOW LIKE '%ADJ%'""", conn)
        df = pd.DataFrame(sql_query, columns = ['SENSE_NAME', 'SEED_WORD', 'CONTEXT_WINDOW', 'SENTENCE'])
        conn.close()
        df['test'] = df['CONTEXT_WINDOW'].str.strip('][')
        df['test'] = df['test'].apply(convert)
        df['test'] = df['test'].apply(convert2)
        df = df[df['test'].map(len) == 2]
        self.save_file(df, "../data/full_corpus/phrase_noun_adj_context_window_size_4.pickle")

In [4]:
# conn = sqlite3.connect('context_window.db')
# cursor = conn.cursor()
# pragma = """PRAGMA case_sensitive_like=ON;"""
# cursor.execute(pragma)

In [5]:
# corpus = Corpus()
# corpus.save_context_windows()
# corpus.extract_noun_adjective_phrases()
# corpus.train_phrases_model()

In [6]:
# cursor.execute('SELECT count(*) from CONTEXT_WINDOWS')
# cur_result = cursor.fetchone()
# print("There are " + str(cur_result) + " rows in the DB.")
# conn.close()

In [7]:
# cursor.execute("DELETE FROM CONTEXT_WINDOWS;")
# print('We have deleted', cursor.rowcount, 'records from the table.')
# conn.commit()
# conn.close()

In [66]:
model = Doc2Vec.load('../data/full_corpus/phrase_embedding_model_noun_adj_context_window_size_4.pickle')
# context_windows = read_file('../data/full_corpus/phrase_noun_adj_context_window_size_4.pickle')

In [337]:
context_window_list = context_windows['test'].tolist()
freq_list = [context_window_list.count(item) for item in context_window_list]
context_windows['freq'] = freq_list
context_windows.head()

Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
0,hear,"('means', 'NOUN')","[('thing', 'NOUN'), ('unprecedented', 'ADJ')]","[('here', 'ADV'), ('be', 'VERB'), ('it', 'PRON...","[(thing, NOUN), (unprecedented, ADJ)]",4
4,touch,"('strike', 'VERB')","[('quick', 'ADJ'), ('terror', 'NOUN')]","[('as', 'ADP'), ('if', 'ADP'), ('to', 'PART'),...","[(quick, ADJ), (terror, NOUN)]",4
5,touch,"('reaching', 'VERB')","[('waters', 'NOUN'), ('revolving', 'ADJ')]","[('while', 'ADP'), ('the', 'DET'), ('two', 'NU...","[(waters, NOUN), (revolving, ADJ)]",4
7,hear,"('means', 'VERB')","[('little', 'ADJ'), ('word', 'NOUN')]","[('what', 'NOUN'), ('means', 'VERB'), ('that',...","[(little, ADJ), (word, NOUN)]",8
12,sight,"('look', 'NOUN')","[('regular', 'ADJ'), ('outs', 'NOUN')]","[('come', 'VERB'), ('down', 'PART'), (',', 'PU...","[(regular, ADJ), (outs, NOUN)]",4


In [351]:
context_windows['SENSE_NAME'] = context_windows['SENSE_NAME'].astype(str)
context_windows['test'] = context_windows['test'].astype(str)
context_windows.head()

Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
0,hear,"('means', 'NOUN')","[('thing', 'NOUN'), ('unprecedented', 'ADJ')]","[('here', 'ADV'), ('be', 'VERB'), ('it', 'PRON...","[('thing', 'NOUN'), ('unprecedented', 'ADJ')]",4
4,touch,"('strike', 'VERB')","[('quick', 'ADJ'), ('terror', 'NOUN')]","[('as', 'ADP'), ('if', 'ADP'), ('to', 'PART'),...","[('quick', 'ADJ'), ('terror', 'NOUN')]",4
5,touch,"('reaching', 'VERB')","[('waters', 'NOUN'), ('revolving', 'ADJ')]","[('while', 'ADP'), ('the', 'DET'), ('two', 'NU...","[('waters', 'NOUN'), ('revolving', 'ADJ')]",4
7,hear,"('means', 'VERB')","[('little', 'ADJ'), ('word', 'NOUN')]","[('what', 'NOUN'), ('means', 'VERB'), ('that',...","[('little', 'ADJ'), ('word', 'NOUN')]",8
12,sight,"('look', 'NOUN')","[('regular', 'ADJ'), ('outs', 'NOUN')]","[('come', 'VERB'), ('down', 'PART'), (',', 'PU...","[('regular', 'ADJ'), ('outs', 'NOUN')]",4


In [364]:
context_windows.loc[[178072]]

Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
178072,taste,"('bitterly', 'ADV')","[('old', 'ADJ'), ('woman', 'NOUN')]","[('""', 'PUNCT'), ('that', 'DET'), ('is', 'VERB...","[('old', 'ADJ'), ('woman', 'NOUN')]",94


In [73]:
with open("context_windows.pickle", "rb") as f:
    context_windows = pkl.load(f)
context_windows.head()

Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
0,hear,"('means', 'NOUN')","[('thing', 'NOUN'), ('unprecedented', 'ADJ')]","[('here', 'ADV'), ('be', 'VERB'), ('it', 'PRON...","[('thing', 'NOUN'), ('unprecedented', 'ADJ')]",4
1,touch,"('strike', 'VERB')","[('quick', 'ADJ'), ('terror', 'NOUN')]","[('as', 'ADP'), ('if', 'ADP'), ('to', 'PART'),...","[('quick', 'ADJ'), ('terror', 'NOUN')]",4
2,touch,"('reaching', 'VERB')","[('waters', 'NOUN'), ('revolving', 'ADJ')]","[('while', 'ADP'), ('the', 'DET'), ('two', 'NU...","[('waters', 'NOUN'), ('revolving', 'ADJ')]",4
3,hear,"('means', 'VERB')","[('little', 'ADJ'), ('word', 'NOUN')]","[('what', 'NOUN'), ('means', 'VERB'), ('that',...","[('little', 'ADJ'), ('word', 'NOUN')]",8
4,sight,"('look', 'NOUN')","[('regular', 'ADJ'), ('outs', 'NOUN')]","[('come', 'VERB'), ('down', 'PART'), (',', 'PU...","[('regular', 'ADJ'), ('outs', 'NOUN')]",4


In [74]:
context_windows['freq'] = context_windows['freq'].astype(int)
# context_windows = context_windows.reset_index(drop=True)
frames = []
senses = ['sight', 'hear', 'touch', 'taste', 'smell']
for modality in senses:
    print(modality)
    subset = context_windows.loc[(context_windows['SENSE_NAME'] == modality)].drop_duplicates(subset=['test', 'SENSE_NAME'])
    subset = subset.nlargest(40,'freq')
    frames.append(subset)
    display(subset)
top_phrases_df = pd.concat(frames)
top_phrases_df.head()

sight


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
392,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]","[('""', 'PUNCT'), ('said', 'VERB'), ('hester', ...","[('old', 'ADJ'), ('man', 'NOUN')]",370
91,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]","[('she', 'PRON'), ('looked', 'VERB'), ('at', '...","[('young', 'ADJ'), ('man', 'NOUN')]",365
753,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('it', 'PRON'), (""'s"", 'VERB'), ('a', 'DET'),...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
1060,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('she', 'PRON'), ('had', 'VERB'), ('seen', 'V...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
2608,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('now', 'ADV'), ('he', 'PRON'), ('could', 'VE...","[('long', 'ADJ'), ('time', 'NOUN')]",207
3875,sight,"('look', 'VERB')","[('young', 'ADJ'), ('lady', 'NOUN')]","[('""', 'PUNCT'), ('look', 'VERB'), ('at', 'ADP...","[('young', 'ADJ'), ('lady', 'NOUN')]",131
2866,sight,"('looked', 'VERB')","[('old', 'ADJ'), ('woman', 'NOUN')]","[('she', 'PRON'), ('looked', 'VERB'), ('an', '...","[('old', 'ADJ'), ('woman', 'NOUN')]",94
1102,sight,"('looked', 'VERB')","[('little', 'ADJ'), ('girl', 'NOUN')]","[('when', 'ADV'), ('dorothy', 'PROPN'), (',', ...","[('little', 'ADJ'), ('girl', 'NOUN')]",92
1611,sight,"('eye', 'NOUN')","[('old', 'ADJ'), ('lady', 'NOUN')]","[('tom', 'PROPN'), ('felt', 'VERB'), ('the', '...","[('old', 'ADJ'), ('lady', 'NOUN')]",69
6666,sight,"('looked', 'VERB')","[('door', 'NOUN'), ('open', 'ADJ')]","[('i', 'PRON'), ('flung', 'VERB'), ('the', 'DE...","[('door', 'NOUN'), ('open', 'ADJ')]",67


hear


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
2314,hear,"('voice', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]","[('then', 'ADV'), ('the', 'DET'), ('voice', 'N...","[('old', 'ADJ'), ('man', 'NOUN')]",370
1200,hear,"('silent', 'ADJ')","[('young', 'ADJ'), ('man', 'NOUN')]","[('the', 'DET'), ('young', 'ADJ'), ('man', 'NO...","[('young', 'ADJ'), ('man', 'NOUN')]",365
81,hear,"('mean', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('that', 'DET'), ('will', 'VERB'), ('mean', '...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
702,hear,"('meaning', 'NOUN')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('there', 'ADV'), ('was', 'VERB'), ('a', 'DET...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
3559,hear,"('listened', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('""', 'PUNCT'), ('she', 'PRON'), ('asked', 'V...","[('long', 'ADJ'), ('time', 'NOUN')]",207
6361,hear,"('heard', 'VERB')","[('young', 'ADJ'), ('lady', 'NOUN')]","[('i', 'PRON'), ('was', 'VERB'), ('""', 'PUNCT'...","[('young', 'ADJ'), ('lady', 'NOUN')]",131
7645,hear,"('listened', 'VERB')","[('old', 'ADJ'), ('woman', 'NOUN')]","[('richard', 'PROPN'), ('and', 'CCONJ'), ('mon...","[('old', 'ADJ'), ('woman', 'NOUN')]",94
1117,hear,"('sounds', 'NOUN')","[('little', 'ADJ'), ('girl', 'NOUN')]","[('these', 'DET'), ('sounds', 'NOUN'), ('made'...","[('little', 'ADJ'), ('girl', 'NOUN')]",92
23202,hear,"('mean', 'VERB')","[('old', 'ADJ'), ('lady', 'NOUN')]","[('""', 'PUNCT'), ('you', 'PRON'), ('mean', 'VE...","[('old', 'ADJ'), ('lady', 'NOUN')]",69
592,hear,"('heard', 'VERB')","[('door', 'NOUN'), ('open', 'ADJ')]","[('when', 'ADV'), ('she', 'PRON'), ('heard', '...","[('door', 'NOUN'), ('open', 'ADJ')]",67


touch


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
2953,touch,"('clinging', 'VERB')","[('old', 'ADJ'), ('man', 'NOUN')]","[('clinging', 'VERB'), ('to', 'ADP'), ('the', ...","[('old', 'ADJ'), ('man', 'NOUN')]",370
2512,touch,"('touch', 'NOUN')","[('young', 'ADJ'), ('man', 'NOUN')]","[('it', 'PRON'), ('would', 'VERB'), ('do', 'VE...","[('young', 'ADJ'), ('man', 'NOUN')]",365
3094,touch,"('felt', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('she', 'PRON'), ('felt', 'VERB'), ('a', 'DET...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
7394,touch,"('felt', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('he', 'PRON'), ('felt', 'VERB'), ('a', 'DET'...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
3546,touch,"('reaching', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('so', 'ADV'), ('spoke', 'VERB'), ('love', 'N...","[('long', 'ADJ'), ('time', 'NOUN')]",207
22435,touch,"('kissed', 'VERB')","[('young', 'ADJ'), ('lady', 'NOUN')]","[('mr.', 'PROPN'), ('winkle', 'PROPN'), ('kiss...","[('young', 'ADJ'), ('lady', 'NOUN')]",131
13170,touch,"('feel', 'VERB')","[('old', 'ADJ'), ('woman', 'NOUN')]","[('in', 'ADP'), ('the', 'DET'), ('train', 'NOU...","[('old', 'ADJ'), ('woman', 'NOUN')]",94
5068,touch,"('feel', 'VERB')","[('little', 'ADJ'), ('girl', 'NOUN')]","[('whatever', 'ADJ'), ('i', 'PRON'), ('can', '...","[('little', 'ADJ'), ('girl', 'NOUN')]",92
11263,touch,"('fingers', 'NOUN')","[('old', 'ADJ'), ('lady', 'NOUN')]","[('""', 'PUNCT'), ('yes', 'INTJ'), (',', 'PUNCT...","[('old', 'ADJ'), ('lady', 'NOUN')]",69
4526,touch,"('reached', 'VERB')","[('door', 'NOUN'), ('open', 'ADJ')]","[('it', 'PRON'), ('was', 'VERB'), ('never', 'A...","[('door', 'NOUN'), ('open', 'ADJ')]",67


taste


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
6223,taste,"('sour', 'ADJ')","[('old', 'ADJ'), ('man', 'NOUN')]","[('now', 'ADV'), ('i', 'PRON'), ('am', 'VERB')...","[('old', 'ADJ'), ('man', 'NOUN')]",370
28255,taste,"('eating', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]","[('in', 'ADP'), ('this', 'DET'), ('order', 'NO...","[('young', 'ADJ'), ('man', 'NOUN')]",365
708,taste,"('taste', 'NOUN')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('she', 'PRON'), ('has', 'VERB'), ('a', 'DET'...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
2193,taste,"('ate', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('if', 'ADP'), ('they', 'PRON'), ('ate', 'VER...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
5930,taste,"('eat', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('he', 'PRON'), ('thought', 'VERB'), ('that',...","[('long', 'ADJ'), ('time', 'NOUN')]",207
5535,taste,"('tastes', 'NOUN')","[('young', 'ADJ'), ('lady', 'NOUN')]","[('you', 'PRON'), ('have', 'VERB'), ('not', 'A...","[('young', 'ADJ'), ('lady', 'NOUN')]",131
53930,taste,"('bitterly', 'ADV')","[('old', 'ADJ'), ('woman', 'NOUN')]","[('""', 'PUNCT'), ('that', 'DET'), ('is', 'VERB...","[('old', 'ADJ'), ('woman', 'NOUN')]",94
11573,taste,"('sweet', 'ADJ')","[('little', 'ADJ'), ('girl', 'NOUN')]","[('""', 'PUNCT'), ('who', 'NOUN'), ('could', 'V...","[('little', 'ADJ'), ('girl', 'NOUN')]",92
27635,taste,"('bitter', 'ADJ')","[('old', 'ADJ'), ('lady', 'NOUN')]","[('she', 'PRON'), ('was', 'VERB'), ('such', 'A...","[('old', 'ADJ'), ('lady', 'NOUN')]",69
27644,taste,"('swallowed', 'VERB')","[('old', 'ADJ'), ('friend', 'NOUN')]","[('he', 'PRON'), ('dropped', 'VERB'), ('the', ...","[('old', 'ADJ'), ('friend', 'NOUN')]",65


smell


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
66687,smell,"('nose', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]","[('she', 'PRON'), ('withdrew', 'VERB'), ('her'...","[('old', 'ADJ'), ('man', 'NOUN')]",370
45653,smell,"('breathed', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]","[('""', 'PUNCT'), ('parkenstacker', 'PROPN'), (...","[('young', 'ADJ'), ('man', 'NOUN')]",365
7578,smell,"('smoke', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('i', 'PRON'), ('smoke', 'VERB'), ('a', 'DET'...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
32809,smell,"('smell', 'NOUN')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('""', 'PUNCT'), ('a', 'DET'), ('person', 'NOU...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
10809,smell,"('breathe', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('i', 'PRON'), ('could', 'VERB'), (""n't"", 'AD...","[('long', 'ADJ'), ('time', 'NOUN')]",207
8004,smell,"('perfume', 'NOUN')","[('young', 'ADJ'), ('lady', 'NOUN')]","[('the', 'DET'), ('perfume', 'NOUN'), ('of', '...","[('young', 'ADJ'), ('lady', 'NOUN')]",131
46057,smell,"('breathed', 'VERB')","[('little', 'ADJ'), ('girl', 'NOUN')]","[('""', 'PUNCT'), ('oh', 'INTJ'), (',', 'PUNCT'...","[('little', 'ADJ'), ('girl', 'NOUN')]",92
29300,smell,"('breathing', 'NOUN')","[('old', 'ADJ'), ('lady', 'NOUN')]","[('he', 'PRON'), ('wears', 'VERB'), ('indiarub...","[('old', 'ADJ'), ('lady', 'NOUN')]",69
47875,smell,"('breathe', 'VERB')","[('young', 'ADJ'), ('woman', 'NOUN')]","[('but', 'CCONJ'), ('from', 'ADP'), ('the', 'D...","[('young', 'ADJ'), ('woman', 'NOUN')]",62
106141,smell,"('smoke', 'VERB')","[('old', 'ADJ'), ('gentleman', 'NOUN')]","[('inevitably', 'ADV'), ('the', 'DET'), ('old'...","[('old', 'ADJ'), ('gentleman', 'NOUN')]",54


Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
392,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]","[('""', 'PUNCT'), ('said', 'VERB'), ('hester', ...","[('old', 'ADJ'), ('man', 'NOUN')]",370
91,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]","[('she', 'PRON'), ('looked', 'VERB'), ('at', '...","[('young', 'ADJ'), ('man', 'NOUN')]",365
753,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('it', 'PRON'), (""'s"", 'VERB'), ('a', 'DET'),...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
1060,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('she', 'PRON'), ('had', 'VERB'), ('seen', 'V...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
2608,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('now', 'ADV'), ('he', 'PRON'), ('could', 'VE...","[('long', 'ADJ'), ('time', 'NOUN')]",207


In [75]:
phrase_vectors = get_word_vectors(model, top_phrases_df)

n: 200
not_found: 0


In [76]:
similarity_matrix = get_similarity_matrix(phrase_vectors)

100%|██████████| 200/200 [00:00<00:00, 239.12it/s]


In [77]:
pca = run_pca(similarity_matrix, 2)

In [78]:
principalDf = pd.DataFrame(data = pca, columns = ['principal component 1', 'principal component 2'])

In [79]:
principalDf.shape

(200, 2)

In [80]:
principalDf.head()

Unnamed: 0,principal component 1,principal component 2
0,-9.943266,-0.374705
1,-10.418727,-1.546303
2,2.713349,5.34651
3,4.658481,4.467498
4,7.941022,3.130614


In [81]:
top_phrases_df = top_phrases_df.reset_index(drop=True)
display(top_phrases_df)

Unnamed: 0,SENSE_NAME,SEED_WORD,CONTEXT_WINDOW,SENTENCE,test,freq
0,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]","[('""', 'PUNCT'), ('said', 'VERB'), ('hester', ...","[('old', 'ADJ'), ('man', 'NOUN')]",370
1,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]","[('she', 'PRON'), ('looked', 'VERB'), ('at', '...","[('young', 'ADJ'), ('man', 'NOUN')]",365
2,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]","[('it', 'PRON'), (""'s"", 'VERB'), ('a', 'DET'),...","[('great', 'ADJ'), ('deal', 'NOUN')]",220
3,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]","[('she', 'PRON'), ('had', 'VERB'), ('seen', 'V...","[('good', 'ADJ'), ('deal', 'NOUN')]",215
4,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]","[('now', 'ADV'), ('he', 'PRON'), ('could', 'VE...","[('long', 'ADJ'), ('time', 'NOUN')]",207
...,...,...,...,...,...,...
195,smell,"('breathing', 'NOUN')","[('deep', 'ADJ'), ('voice', 'NOUN')]","[('there', 'ADV'), ('was', 'VERB'), ('a', 'DET...","[('deep', 'ADJ'), ('voice', 'NOUN')]",19
196,smell,"('puffed', 'VERB')","[('big', 'ADJ'), ('man', 'NOUN')]","[('then', 'ADV'), ('a', 'DET'), ('big', 'ADJ')...","[('big', 'ADJ'), ('man', 'NOUN')]",18
197,smell,"('smoker', 'NOUN')","[('heart', 'NOUN'), ('good', 'ADJ')]","[('he', 'PRON'), ('sighed', 'VERB'), ('with', ...","[('heart', 'NOUN'), ('good', 'ADJ')]",18
198,smell,"('breathed', 'VERB')","[('open', 'ADJ'), ('mouth', 'NOUN')]","[('marius', 'PROPN'), (""'"", 'PART'), ('eyelids...","[('open', 'ADJ'), ('mouth', 'NOUN')]",17


In [89]:
finalDf = pd.concat([principalDf, top_phrases_df[['SENSE_NAME', 'SEED_WORD', 'test', 'freq', 'SENTENCE']]], axis = 1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,SENSE_NAME,SEED_WORD,test,freq,SENTENCE
0,-9.943266,-0.374705,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]",370,"[('""', 'PUNCT'), ('said', 'VERB'), ('hester', ..."
1,-10.418727,-1.546303,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]",365,"[('she', 'PRON'), ('looked', 'VERB'), ('at', '..."
2,2.713349,5.34651,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]",220,"[('it', 'PRON'), (""'s"", 'VERB'), ('a', 'DET'),..."
3,4.658481,4.467498,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]",215,"[('she', 'PRON'), ('had', 'VERB'), ('seen', 'V..."
4,7.941022,3.130614,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]",207,"[('now', 'ADV'), ('he', 'PRON'), ('could', 'VE..."


In [83]:
# with open("final_Df.pickle", "rb") as f:
#     finalDf = pkl.load(f)

In [90]:
finalDf['SENTENCE'] = finalDf['SENTENCE'].str.strip("[]")
finalDf['SENTENCE'] = finalDf['SENTENCE'].apply(convert)
finalDf['SENTENCE'] = finalDf['SENTENCE'].apply(convert2)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,SENSE_NAME,SEED_WORD,test,freq,SENTENCE
0,-9.943266,-0.374705,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]",370,"[("", PUNCT), (said, VERB), (hester, PROPN), (,..."
1,-10.418727,-1.546303,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]",365,"[(she, PRON), (looked, VERB), (at, ADP), (the,..."
2,2.713349,5.34651,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]",220,"[(it, PRON), (""s"", VERB), (a, DET), (great, AD..."
3,4.658481,4.467498,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]",215,"[(she, PRON), (had, VERB), (seen, VERB), (a, D..."
4,7.941022,3.130614,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]",207,"[(now, ADV), (he, PRON), (could, VERB), (not, ..."


In [91]:
def create_string(lst):
    new_list = [val[0] for val in lst]
    return ' '.join(l + '<br>' * (n % 20 == 2) for n, l in enumerate(new_list))
finalDf['SENTENCE'] = finalDf['SENTENCE'].apply(create_string)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,SENSE_NAME,SEED_WORD,test,freq,SENTENCE
0,-9.943266,-0.374705,sight,"('look', 'NOUN')","[('old', 'ADJ'), ('man', 'NOUN')]",370,""" said hester<br> noticing the old man ""s"" lo..."
1,-10.418727,-1.546303,sight,"('looked', 'VERB')","[('young', 'ADJ'), ('man', 'NOUN')]",365,"she looked at<br> the young man ""s"" bare arms . """
2,2.713349,5.34651,sight,"('watch', 'VERB')","[('great', 'ADJ'), ('deal', 'NOUN')]",220,"it ""s"" a<br> great deal to watch her ."
3,4.658481,4.467498,sight,"('seen', 'VERB')","[('good', 'ADJ'), ('deal', 'NOUN')]",215,she had seen<br> a good deal of the gardners ;...
4,7.941022,3.130614,sight,"('looking', 'VERB')","[('long', 'ADJ'), ('time', 'NOUN')]",207,now he could<br> not watch tarzan but he was ...


In [92]:
# with open("principalDf.pickle", 'wb') as f:
#     pkl.dump(principalDf, f)

In [93]:
fig = px.scatter(
            finalDf,
            color=finalDf["SENSE_NAME"],
            color_discrete_map={'sight': '#1f77b4', 'hear': '#2ca02c', 'taste': '#d62728', 'smell': '#ff7f0e', 'touch': '#9467bd'},
            x=finalDf.columns[0],
            y=finalDf.columns[1],
            hover_data=finalDf.columns[:],
            custom_data=finalDf.columns[:],
            title= "Phrase2Vec PCA - Top 200 Descriptors By Freq")
fig.update_traces(marker=dict(size=10,))
fig.write_html("phrase2vec_noun_adj_top_200_freq.html")
fig.show()