# Metadata

```yaml
Course:    DS 5001
Module:    09 Lab
Topic:     Homework 9
Author:    Andrew Avitabile
Date:      27 March 2024
```

# Set Up

In [3]:
import configparser

# Machine-specific directories are read from a shared env.ini file rather than
# being hard-coded in the notebook.
env_file = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. An empty result therefore means env.ini was not found;
# fail here with a clear message instead of a bare KeyError on the lookups below.
if not config.read(env_file):
    raise FileNotFoundError(f"Could not read configuration file: {env_file}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [19]:
# Name of the corpus to analyse; change this to point the notebook at another corpus
data_prefix = 'austen-melville'
# Directory for this corpus beneath the configured data home
table_dir = '/'.join((data_home, data_prefix))
# Index columns of the TOKEN table, listed from the outermost container to the token itself
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
# A container level is the OHCO prefix that ends at that level's own key.
# (Slicing one position further would include the next level down, e.g. a
# "paragraph" key that contains sent_num actually identifies sentences.)
PARA = OHCO[:3] # Paragraphs: book_id, chap_num, para_num
SENT = OHCO[:4] # Sentences: book_id, chap_num, para_num, sent_num
# Level used to group tokens into the documents fed to word2vec
BAG = PARA

In [6]:
# Keyword arguments handed to gensim's Word2Vec constructor when the model is trained
w2v_params = {
    'window': 5,
    'vector_size': 246,
    'min_count': 50,  # THIS LIMITS OUR VOCAB
    'workers': 4,
}

In [7]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE
import plotly_express as px

In [8]:
import gensim
# Show the installed gensim version so the run can be reproduced with the same release
gensim.__version__

'4.3.0'

# Import `TOKENS` and `VOCAB`

We import the TOKEN and VOCAB tables of the novels corpus. Proper nouns are excluded in the next section, when the tokens are converted into a Gensim corpus.

In [21]:
# Read the corpus TOKEN table from the output directory and index it by the OHCO columns
token_csv = f'{output_dir}/{data_prefix}-TOKEN.csv'
TOKENS = pd.read_csv(token_csv).set_index(OHCO)

In [24]:
# Read the corpus VOCAB table from the output directory, keyed by term string
vocab_csv = f'{output_dir}/{data_prefix}-VOCAB.csv'
VOCAB = pd.read_csv(vocab_csv).set_index('term_str')

# Convert to Gensim

We now create a Gensim-style corpus of docs, a list of lists of tokens.

In [25]:
# Keep tokens whose POS tag does not start with NNP / NNPS and that have a term string
is_proper_noun = TOKENS.pos.str.match('NNPS?')
common_tokens = TOKENS[~is_proper_noun].dropna(subset='term_str')

# Collect the term strings of each bag into one Python list per bag
bag_terms = common_tokens.groupby(BAG).term_str.apply(lambda terms: terms.tolist())

# Lose single word docs
docs = [terms for terms in bag_terms.tolist() if len(terms) > 1]

In [26]:
# Build a gensim Dictionary (integer id <-> term mapping) from the tokenized docs
vocab = Dictionary(documents=docs)

# Generate word embeddings with Gensim's `word2vec` module

In [27]:
# Train a word2vec model on the docs using the hyperparameters in w2v_params
model = word2vec.Word2Vec(sentences=docs, **w2v_params)

In [28]:
# Inspect the embedding matrix learned by the model
model.wv.vectors

array([[-0.03868556,  0.21934517,  0.35239646, ...,  0.7828589 ,
        -0.31337866,  0.12883148],
       [-0.29524717, -0.30323213, -0.59161323, ...,  0.15516481,
        -0.04994084, -0.8938875 ],
       [ 0.04115698, -0.17666396,  0.28215903, ..., -0.15221389,
         0.06701489, -0.02697449],
       ...,
       [ 0.15146533, -0.23200135,  0.1642928 , ...,  0.11276452,
         0.07608485, -0.073874  ],
       [ 0.02789322,  0.03042475,  0.11556743, ..., -0.05955997,
        -0.09291907,  0.01019334],
       [ 0.00538412, -0.2820923 ,  0.08695275, ..., -0.08120006,
        -0.07713098,  0.0100612 ]], dtype=float32)

# Visualize with tSNE

In [35]:
# Map each term to its gensim Dictionary id; assignment aligns on VOCAB's term_str index
gsid_by_term = pd.Series({term: gsid for gsid, term in vocab.items()}, name='gsid')
VOCAB['gsid'] = gsid_by_term
# Keep only terms known to the gensim Dictionary, then store their ids as integers
VOCAB = VOCAB.dropna(subset='gsid').astype({'gsid': int})

In [36]:
def get_vector(row):
    """Return the word2vec embedding for the term that labels `row`.

    `row.name` is used as the lookup key in `model.wv`. If the lookup raises
    KeyError (the term is not in the trained model), None is returned instead.
    """
    try:
        return model.wv[row.name]
    except KeyError:
        return None

In [37]:
# Look up the embedding of every vocabulary term; get_vector returns None for
# terms the model does not contain, and those entries are dropped.
term_vectors = VOCAB.apply(get_vector, axis=1).dropna()
# Stack the per-term arrays into one matrix in a single step. Building a
# pd.Series for every row via a row-wise apply yields the same table but is
# far slower on a large vocabulary.
WV = pd.DataFrame(np.vstack(term_vectors.tolist()), index=term_vectors.index)

## Use scikit-learn's `TSNE` class

In [38]:
# Resolve the estimator through the sklearn.manifold module instead of the bare
# name `TSNE`: a later cell rebinds `TSNE` to a DataFrame of coordinates, after
# which re-running this cell with the bare name fails ("DataFrame is not callable").
from sklearn import manifold

# random_state is fixed so the layout is repeatable between runs.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version before upgrading.
tsne_engine = manifold.TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)

In [39]:
# Fit t-SNE on the embedding matrix and keep the projected coordinates
embedding_matrix = WV.to_numpy()
tsne_model = tsne_engine.fit_transform(embedding_matrix)

In [None]:
# Label the two t-SNE coordinates and key each row by the same index as WV.
# NOTE: this rebinds `TSNE`, replacing the scikit-learn class imported under the
# same name at the top of the notebook.
TSNE = pd.DataFrame(data=tsne_model, index=WV.index, columns=['x', 'y'])

In [None]:
# Display the table of t-SNE coordinates
TSNE

# Semantic Algebra

## Analogies

$A : B :: C : D? \rightarrow B - A + C = D$


In [33]:
def complete_analogy(A, B, C, n=2):
    """Complete the analogy A : B :: C : ? using the trained word vectors.

    Candidates are ranked by similarity to B - A + C, expressed to gensim as
    positive=[B, C] and negative=[A].

    Parameters
    ----------
    A, B, C : terms of the analogy.
    n : number of candidate answers to return (default 2).

    Returns
    -------
    A DataFrame with columns 'term' and 'sim' holding the n best candidates,
    or None if the lookup raises KeyError (the error is printed first).
    """
    try:
        # Request exactly n neighbours. most_similar() returns only 10 results by
        # default, so slicing its default output could never yield more than 10 rows.
        matches = model.wv.most_similar(positive=[B, C], negative=[A], topn=n)
    except KeyError as e:
        print('Error:', e)
        return None
    return pd.DataFrame(matches, columns=['term', 'sim'])
    
def get_most_similar(positive, negative=None):
    """Return the model's most similar terms for a query as a DataFrame.

    `positive` and `negative` are passed straight through to
    `model.wv.most_similar`; the result has columns 'term' and 'sim'.
    """
    neighbours = model.wv.most_similar(positive, negative)
    return pd.DataFrame(data=neighbours, columns=['term', 'sim'])