# MemorAI - Word Embedding
- Salesken: https://huggingface.co/salesken/query_wellformedness_score
- Marvin: https://www.ischool.berkeley.edu/sites/default/files/sproject_attachments/final_report.pdf
- Query-WellFormedness: https://github.com/google-research-datasets/query-wellformedness
- Checklist: https://github.com/marcotcr/checklist

In [None]:
# Colab-only setup: the body runs only when the string form of the active
# IPython shell mentions 'google.colab'.
if 'google.colab' in str(get_ipython()):
    from google.colab import output
    from google.colab import drive

    # install the pinned gensim release (IPython shell escape, not plain Python)
    !pip install gensim==4.0.1
    # clear this cell's output (presumably to hide the pip install log)
    output.clear()
    # mount Google Drive at /content/gdrive; MODEL_PATH below points into this mount
    drive.mount('/content/gdrive')

In [None]:
import json
import string
from typing import List, Union

import gensim.downloader as gendw
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# init
# fetch the NLTK 'punkt' data package (nltk.word_tokenize is called further down)
nltk.download('punkt')

# configs
# NOTE(review): 'SAVE_MOEL' looks like a typo of 'SAVE_MODEL'. The training cell
# reads the flag under this exact spelling, so rename both places together.
SAVE_MOEL = False
# NOTE(review): MODEL_PATH is not referenced by any other visible cell - confirm
# whether the training cell was meant to save under this directory.
MODEL_PATH = '/content/gdrive/MyDrive/Berkeley/W210/w210_Capstone_Project/Repo/memorai/tim/models'

# data: sample documents, each a dict with a 'question' and an 'answer' string
DOCS = [
    {
        'question': '''What do you remember about your spouse?''',
        'answer': '''I have had more than a little bit of luck in life, but nothing equals in magnitude my marriage to Martin D. Ginsburg. I do not have words adequate to describe my supersmart, exuberant, ever-loving spouse. Early on in our marriage, it became clear to him that cooking was not my strong suit. To the everlasting appreciation of our food-loving children (we became four in 1965, when son James was born), Marty made the kitchen his domain and became Chef Supreme in our home. Marty coached me through the birth of our son, he was the first reader and critic of articles, speeches, and briefs I drafted, and he was at my side constantly, in and out of the hospital, during two long bouts with cancer. And I betray no secret in reporting that, without him, I would not have gained a seat on the U.S. Supreme Court.''',
    },
    {
        'question': '''Who was your best friend? What were they like?''',
        'answer': '''Justice Antonin Scalia was my best friend. Once asked how we could be friends, given our disagreement on lots of things, Justice Scalia answered: "I attack ideas. I don't attack people. Some very good people have some very bad ideas. And if you can't separate the two, you gotta get another day job. You don't want to be a judge. At least not a judge on a multi-member panel." When we were in India together, we went to Agra to see the Taj Mahal and there is a doorway where you get sight of it. I stood there, when we got there, in that doorway - tears were running down my cheek, it amazed him that I had such an emotional response. I will miss the challenges and the laughter Justice Scalia provoked, his pungent, eminently quotable opinions, so clearly stated that his words never slipped from the reader's grasp, the roses he brought me on my birthday, the chance to appear with him once more as supernumeraries at the opera. He was, indeed, a magnificent performer. How blessed I was to have high spirits, and quick wit. In the words of a duet for tenor Scalia and soprano Ginsburg, we were different, yes, in our interpretation of written texts, yet one in our reverence for the Court and its place in the U.S. system of governance. It was my great good fortune to have known him as a working colleague and treasured friend.''',
    },
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Word2Vec

In [None]:
# list the names of the pre-trained models and corpora reported by gensim's downloader
info = gendw.info()
# print(json.dumps(info, indent=4))
for heading, section in (('MODELS:', 'models'), ('\nCORPORA:', 'corpora')):
    print(heading)
    for key in info[section]:
        print(key)

MODELS:
fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis

CORPORA:
semeval-2016-2017-task3-subtaskBC
semeval-2016-2017-task3-subtaskA-unannotated
patent-2017
quora-duplicate-questions
wiki-english-20171001
text8
fake-news
20-newsgroups
__testing_matrix-synopsis
__testing_multipart-matrix-synopsis


In [None]:
# load the "glove-wiki-gigaword-50" entry (listed under MODELS above) via gensim's downloader
model = gendw.load("glove-wiki-gigaword-50")
# model.most_similar("glass")

In [None]:
# NOTE(review): this attribute lookup fails - the recorded output of this cell
# is an AttributeError. Confirm what was intended here or drop the cell.
model.similarity_matrix

AttributeError: ignored

In [None]:
def get_keywords_by_count(
                docs: List[str],
                top: int = 10,
                tolist: bool = False) -> Union[pd.DataFrame, List[str]]:
    """Return the most frequent non-stop-word terms found in ``docs``.

    Term counts are summed over every document in ``docs``. The result is
    ordered by descending count; terms with equal counts keep the
    alphabetical order of the vectorizer's vocabulary.

    Args:
        docs: Raw text documents to extract keywords from.
        top: Maximum number of keywords to return.
        tolist: When True, return only the terms; otherwise return the
            terms together with their counts.

    Returns:
        A list of terms when ``tolist`` is True, else a DataFrame with the
        columns ``'vocb'`` (term) and ``'count'`` (total occurrences).
    """

    def tokenize(text: str, stemmer: PorterStemmer = None):
        """Optional stemming tokenizer for CountVectorizer (currently unused)."""
        # build the stemmer per call instead of sharing one default instance
        stemmer = PorterStemmer() if stemmer is None else stemmer
        return [
            stemmer.stem(token)
            for token in nltk.word_tokenize(text)
            if token not in string.punctuation
        ]

    # remove stop words & get count vector
    vect = CountVectorizer(
                stop_words='english',
                # tokenizer=tokenize  # optional; the default tokenizer seemed to work better
    )
    matrix = vect.fit_transform(docs)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support whichever one is installed.
    if hasattr(vect, 'get_feature_names_out'):
        vocabulary = vect.get_feature_names_out()
    else:
        vocabulary = vect.get_feature_names()

    # Sum over the document axis so any number of documents yields exactly
    # one count per term (the transposed per-document frame only worked for
    # a single document).
    totals = np.asarray(matrix.sum(axis=0)).ravel()

    # sort & return keywords
    counts = pd.DataFrame({'vocb': list(vocabulary), 'count': totals})
    # mergesort is stable, so equal counts stay in alphabetical order
    counts = counts.sort_values(
                by='count',
                ascending=False,
                kind='mergesort',
                ignore_index=True)

    top_rows = counts.iloc[:top]
    if tolist:
        return top_rows['vocb'].tolist()
    return top_rows

In [None]:
# try the keyword extractor on the first sample answer: top 20 terms as a plain list
doc = DOCS[0]['answer']
get_keywords_by_count([doc], 20, tolist=True)

['marty',
 'supreme',
 'son',
 'loving',
 'marriage',
 'martin',
 'hospital',
 'james',
 'kitchen',
 'life',
 'little',
 'long',
 'luck',
 'magnitude',
 '1965',
 'ginsburg',
 'reader',
 'reporting',
 'seat',
 'secret']

In [None]:
# NOTE(review): `distances` is never assigned in the visible notebook, so this
# cell presumably relies on state from an earlier session - confirm or remove.
distances

In [None]:
# Similarity-index demo using the sample corpus and dictionary from gensim.test.utils.
from gensim.similarities import Similarity
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile

# path prefix under which the index stores its files
index_tmpfile = get_tmpfile("index")
# query as (id, value) pairs - presumably bag-of-words (token_id, count); confirm
query = [(1, 2), (6, 1), (7, 2)]

index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
similarities = index[query]  # get similarities between the query and all index documents
similarities

array([0.38490018, 0.4082483 , 0.33333334, 0.27216554, 0.57735026,
       0.        , 0.        , 0.        , 0.        ], dtype=float32)

In [None]:
# `os` is needed for the optional save step and is not imported by the shared import cell
import os

# train the skip-gram
# NOTE(review): `nodes_paths` is not defined in the visible notebook - it must
# be created by another cell before this one runs.
print('Training Word2Vec...')
model = Word2Vec(
            sentences=nodes_paths,
            epochs=10,
            vector_size=100,
            window=5,
            min_count=0,  # keep every token, however rare
            sg=1,  # 1 = skip-gram
            workers=2)
if SAVE_MOEL:
    # '__file__' is a string literal here, so this resolves to the current
    # working directory (notebooks do not define __file__).
    pwd = os.path.dirname(os.path.realpath('__file__'))
    save_path = f'{pwd}/models/word2vec.wordvectors'
    # create the target folder up front so saving does not fail when 'models' is missing
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    model.save(save_path)
    print(f'Model saved to {save_path}...')

# retrieve node embeddings and corresponding subjects
node_ids = model.wv.index_to_key  # list of node IDs
node_embeddings = model.wv.vectors  # embedding matrix taken from the trained vectors

print('Complete!')