# Week 4 - Word Embeddings

This week, we build on last week's topic modeling techniques by taking a text corpus we have developed, specifying an underlying number of dimensions, and training a model with a neural network auto-encoder (one of Google's word2vec  algorithms) that best describes corpus words in their local linguistic contexts, and exploring their locations in the resulting space to learn about the discursive culture that produced them. Documents here are represented as densely indexed locations in dimensions, rather than sparse mixtures of topics (as in LDA topic modeling), so that distances between those documents (and words) are consistently superior, though they require the full vector of dimension loadings (rather than just a few selected topic loadings) to describe. We will explore these spaces to understand complex, semantic relationships between words, index documents with descriptive words, identify the likelihood that a given document would have been produced by a given vector model, and explore how semantic categories can help us understand the cultures that produced them.

For this notebook we will be using the following packages

In [7]:
#All these packages need to be installed from pip
import gensim#For word2vec, etc
import requests #For downloading our datasets
import nltk #For stop words and stemmers
import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer
import sklearn.metrics.pairwise #For cosine similarity
import sklearn.manifold #For T-SNE
import sklearn.decomposition #For PCA

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

import os #For looking through files
import os.path #For managing file paths

import json
import bs4

## <span style="color:red">*Your Turn*</span>

<span style="color:red">Construct cells immediately below this that build a word2vec model with your corpus. Interrogate word relationships in the resulting space. Plot a subset of your words. What do these word relationships reveal about the *social* and *cultural game* underlying your corpus? What was surprising--what violated your prior understanding of the corpus? What was expected--what confirmed your knowledge about this domain?

# Getting my corpora

In [None]:
nyt_api = '35fa0940e36e46a1997d4c6439dd25dc'
guard_api = '0bd937fb-cf7e-4727-8698-5b69390c8cd3'
search_term = 'artificial intelligence'
begDate = '2016-01-01'
endDate = '2016-12-31'
begDate1 = '20160101'
begDate2 = '20161231'

In [9]:
# this is for later use
def getGuardian(api_key, search, from_date, to_date, pages = 5):

    searchDict = {
        'date' : [], #The date the article was published
        'section' : [], #The section of the article
        'title' : [], #The title of the article
        'url' : [], #The url to the article
        'text' : [], #The text of the article
        }
    for page in list(range(pages)):
        gaAPItarget = 'https://content.guardianapis.com/search?api-key={}&q={}&from-date={}&to-date={}&page={}'
        r = requests.get(gaAPItarget.format(api_key, search, from_date, to_date, page))
        response = json.loads(r.text)
        Docs = response['response']['results']

        for Doc in Docs:
            #These are provided by the directory
            searchDict['date'].append(Doc['webPublicationDate'])
            searchDict['section'].append(Doc['sectionName'])
            searchDict['title'].append(Doc['webTitle'])
            searchDict['url'].append(Doc['webUrl'])

            #We need to download the text though
            try:
                text_raw = requests.get(Doc['webUrl']).text
            except:
                requests.ConnectionError
            soup = bs4.BeautifulSoup(text_raw, 'html.parser')
            pars = soup.body.findAll('p', class_= None)
            text_full = []
            for par in pars:
                text_full.append(par.text)
            text_clean = ' '.join(text_full)
            searchDict['text'].append(text_clean)
            
    searchDF = pandas.DataFrame(searchDict)

    #Get tokens
    searchDF['tokenized_text'] = searchDF['text'].apply(lambda x: nltk.word_tokenize(x))
    searchDF['token_counts'] = searchDF['tokenized_text'].apply(lambda x: len(x))

    #Delete rows with no text due to the irregularity of the original html codes
    finalDF = searchDF[searchDF['text'] != '']
    return finalDF

In [None]:
GuardDF = getGuardian(guard_api, search_term, begDate, endDate, pages=2)
GuardDF

In [5]:
def getNYT(api_key, search_term, begin_date, end_date, pages = 5):
    searchDict = {
        'date' : [], #The date the article was published
        'section' : [], #The section of the article
        'source' : [], #The source of the article
        'text' : [], #The text of the article
        'title' : [], #The title of the article
        'url' : [], #The url to the article
    }

    NYTAPItarget = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key={}&q={}&sort=newest&page={}&begindate={}&enddate={}'
    for page in list(range(pages)):
        r = requests.get(NYTAPItarget.format(api_key, search_term, page, begin_date, end_date))
        response = json.loads(r.text)
        Docs = response['response']['docs']
        
        for Doc in Docs:
            #These are provided by the directory
            searchDict['date'].append(Doc['pub_date'])
            searchDict['section'].append(Doc['section_name'])
            searchDict['source'].append(Doc['source'])
            searchDict['title'].append(Doc['headline']['main'])
            searchDict['url'].append(Doc['web_url'])
            
            #We need to download the text though
            try:
                text_raw = requests.get(Doc['web_url']).text
            except:
                requests.ConnectionError
            soup = bs4.BeautifulSoup(text_raw, 'html.parser')
            pars = soup.body.findAll('p', class_= r'story-body-text')
            text_full = []
            for par in pars:
                text_full.append(par.text)
            text_clean = ' '.join(text_full)
            searchDict['text'].append(text_clean)
            
    searchDF = pandas.DataFrame(searchDict)

    #Get tokens
    searchDF['tokenized_text'] = searchDF['text'].apply(lambda x: nltk.word_tokenize(x))
    searchDF['token_counts'] = searchDF['tokenized_text'].apply(lambda x: len(x))

    #Delete rows with no text due to the irregularity of the original html codes
    finalDF = searchDF[searchDF['text'] != '']
    return finalDF

In [12]:
nytDF = getNYT(nyt_api, search_term, begDate, endDate, pages = 100)
nytDF.to_pickle('nyt.pkl') # pickle the result for reproducibility

KeyError: 'response'

In [11]:
# NYTsampleDF = pandas.read_pickle('NYTsample.pkl')
nytDF['title']

0     Apple Reversed Its iPhone Slump. But What's Next?
1     UK Pardons Thousands Convicted Under Past Anti...
2     Canadian Tech Companies Ask Ottawa to Issue Vi...
3     Diversity in Tech: Lots of Attention, Little P...
4     U.S. Fintech Venture Firm Nyca Raises $125 Mil...
5     With Supplies Tight, Memory Chipmakers Head In...
6     Alphabet’s Profits Stay Predictably Good in a ...
7     Review: Alice Returns in ‘Resident Evil: The F...
8     Doomsday Clock Moves Closer to Midnight, Signa...
9     Xiaomi Executive Barra Joins Facebook to Lead ...
10    New Startup Investments Aim to Stem Canadian T...
11        How Efficiency Is Wiping Out the Middle Class
12       Daily Report: American Jobs and Chinese Robots
13    RBC Targets 40 Percent of Total Technology Spe...
14    Diversity in Tech: Lots of Attention, Little P...
15    Alibaba Raises Guidance as Strategy Shift Make...
16         How Alexa Fits Into Amazon’s Prime Directive
17       ‘Brexit,’ Astana, Italy: Your Tuesday B

We also want to remove stop words and stem. Tokenizing requires two steps. Word2Vec needs to retain the sentence structure so as to capture a "continuous bag of words (CBOW)" and all of the skip-grams within a word window. The algorithm tries to preserve the distances induced by one of these two local structures. This is very different from clustering and LDA topic modeling which extract unordered words alone. As such, tokenizing is slightly more involved.

In [None]:
def normlizeTokens(tokenLst, stopwordLst = None, stemmer = None, lemmer = None, vocab = None):
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the semmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)

    #And the lemmer
    if lemmer is not None:
        workingIter = (lemmer.lemmatize(w) for w in workingIter)

    #And remove the stopwords
    if stopwordLst is not None:
        workingIter = (w for w in workingIter if w not in stopwordLst)
        
    #We will return a list with the stopwords removed
    if vocab is not None:
        vocab_str = '|'.join(vocab)
        workingIter = (w for w in workingIter if re.match(vocab_str, w))
    
    return list(workingIter)

#initialize our stemmer and our stop words
stop_words_nltk = nltk.corpus.stopwords.words('english')
snowball = nltk.stem.snowball.SnowballStemmer('english')
wordnet = nltk.stem.WordNetLemmatizer()

In [None]:
#Apply our functions, notice each row is a list of lists now
NYTsampleDF['tokenized_sents'] = NYTsampleDF['text'].apply(lambda x: [nltk.word_tokenize(s) for s in nltk.sent_tokenize(x)])
NYTsampleDF['normalized_sents'] = NYTsampleDF['tokenized_sents'].apply(lambda x: [normlizeTokens(s, stopwordLst = stop_words_nltk, stemmer = None) for s in x])

NYTsampleDF[:100:10]

# Word2Vec

We will be using the gensim implementation of [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec).

To load our data our data we give all the sentences to the trainer:

In [None]:
NYTsampleW2V = gensim.models.word2vec.Word2Vec(NYTsampleDF['normalized_sents'].sum())
NYTsampleW2V['president'][:10] #Shortening because it's very large
NYTsampleW2V.syn0 # full matrix
NYTsampleW2V.index2word[10] # translate from the matrix to words

In [None]:
NYTsampleW2V.most_similar('president') # similar vectors

In [None]:
NYTsampleW2V.doesnt_match(['administration', 'administrations', 'presidents', 'president', 'washington']) # least match

In [None]:
NYTsampleW2V.most_similar(positive=['clinton', 'republican'], negative = ['democrat']) # semantic equation

In [None]:
NYTsampleW2V.save("data/senpressreleasesWORD2Vec") # save for later use

In [None]:
# dimension reduction
numWords = 50
targetWords = NYTsampleW2V.index2word[:numWords] # select a subset

In [None]:
wordsSubMatrix = []
for word in targetWords:
    wordsSubMatrix.append(senReleasesW2V[word])
wordsSubMatrix = np.array(wordsSubMatrix)
wordsSubMatrix # smaller, reduced matrix that preserved the distances from the original

In [None]:
pcaWords = sklearn.decomposition.PCA(n_components = 50).fit(wordsSubMatrix) # use PCA to reduce the dimensions
reducedPCA_data = pcaWords.transform(wordsSubMatrix)
#T-SNE is theoretically better, but you should experiment
tsneWords = sklearn.manifold.TSNE(n_components = 2).fit_transform(reducedPCA_data)

In [None]:
# Now plot it!
fig = plt.figure(figsize = (10,6))
ax = fig.add_subplot(111)
ax.set_frame_on(False)
plt.scatter(tsneWords[:, 0], tsneWords[:, 1], alpha = 0)#Making the points invisible 
for i, word in enumerate(targetWords):
    ax.annotate(word, (tsneWords[:, 0][i],tsneWords[:, 1][i]), size =  20 * (numWords - i) / numWords)
plt.xticks(())
plt.yticks(())
plt.show()

## <span style="color:red">*Your Turn*</span>

<span style="color:red">Construct cells immediately below this that build a doc2vec model with your corpus. Interrogate document and word relationships in the resulting space. Construct a heatmap that plots the distances between a subset of your documents against each other, and against a set of informative words. Find distances between *every* document in your corpus and a word or query of interest. What do these doc-doc proximities reveal about your corpus? What do these word-doc proximities highlight? Demonstrate and document one reasonable way to select a defensible subset of query-relevant documents for subsequent analysis.

# Doc2Vec

Instead of just looking at just how words embed within in the space, we can look at how the different documents relate to each other within the space. First lets load our data--abstracts of most U.S. physics papers from the 1950s.

We will load these as documents into Word2Vec, but first we need to normalize and pick some tags

In [None]:
NYTsampleDF2 = pandas.read_pickle('NYTsample.pkl')

In [None]:
keywords = ['photomagnetoelectric', 'quantum', 'boltzmann', 'proton', 'positron', 'feynman', 'classical', 'relativity']

In [None]:
NYTsampleDF2['tokenized_words'] = NYTsampleDF2['abstract'].apply(lambda x: nltk.word_tokenize(x))
NYTsampleDF2['normalized_words'] = NYTsampleDF2['tokenized_words'].apply(lambda x: normlizeTokens(x, stopwordLst = stop_words_nltk))

In [None]:
taggedDocs = []
for index, row in apsDF.iterrows():
    #Just doing a simple keyword assignment
    docKeywords = [s for s in keywords if s in row['normalized_words']]
    docKeywords.append(row['copyrightYear'])
    docKeywords.append(row['doi']) #This lets us extract individual documnets since doi's are unique
    taggedDocs.append(gensim.models.doc2vec.LabeledSentence(words = row['normalized_words'], tags = docKeywords))
NYTsampleDF2['TaggedAbstracts'] = taggedDocs

Now we can train a Doc2Vec model:

In [None]:
NYTsampleD2V = gensim.models.doc2vec.Doc2Vec(NYTsampleDF2['TaggedAbstracts'], size = 100) #Limiting to 100 dimensions

We can get vectors for the tags/documents, just as we did with words. Documents are actually the centroids (high dimensional average points) of their words. 

In [None]:
NYTsampleD2V.docvecs[1952]

The words can still be accessed in the same way:

In [None]:
NYTsampleD2V['atom']

We can still use the ``most_similar`` command to perform simple semantic equations:

In [None]:
NYTsampleD2V.most_similar(positive = ['atom','electrons'], negative = ['electron'], topn = 1)

This is interesting. **Electron** is to **electrons** as **atom** is to **atoms**. Another way to understand this, developed below is: **electrons - electron** induces a singular to plural dimension, so when we subtract **electron** from **atom** and add **electrons**, we get **atoms**! 

In [None]:
NYTsampleD2V.most_similar(positive = ['einstein','law'], negative = ['equation'], topn = 1)

In other words **Einstein** minus **equation** plus **law** equals **Meissner**--Walthur Meissner studied mechanical engineering and physics ... and was more likely to produce a "law" than a "equation", like the Meissner effect, the damping of the magnetic field in superconductors. If we built our word-embedding with a bigger corpus like the entire arXiv, a massive repository of physics preprints, we would see many more such relationships like **gravity - Newton + Einstein = relativity**.

We can also compute all of these *by hand*--explicitly wth vector algebra: 

In [None]:
sklearn.metrics.pairwise.cosine_similarity(NYTsampleD2V['electron'].reshape(1,-1), NYTsampleD2V['positron'].reshape(1,-1))
#We reorient the vectors with .reshape(1, -1) so that they can be computed without a warning in sklearn

In the doc2vec model, the documents have vectors just as the words do, so that we can compare documents with each other and also with words (similar to how a search engine locates a webpage with a query). First, we will calculate the distance between a word and documents in the dataset:

In [None]:
NYTsampleD2V.docvecs.most_similar([ NYTsampleD2V['electron'] ], topn=5 )

Now let's go the other way around and find words most similar to this document:

In [None]:
NYTsampleD2V.most_similar( [ NYTsampleD2V.docvecs['10.1103/PhysRev.98.875'] ], topn=5) 

We can even look for documents most like a query composed of multiple words:

In [None]:
NYTsampleD2V.docvecs.most_similar([ NYTsampleD2V['electron']+NYTsampleD2V['positron']+NYTsampleD2V['neutron']], topn=5 )

Now let's plot some words and documents against one another with a heatmap:

In [None]:
heatmapMatrix = []
for tagOuter in keywords:
    column = []
    tagVec = apsD2V.docvecs[tagOuter].reshape(1, -1)
    for tagInner in keywords:
        column.append(sklearn.metrics.pairwise.cosine_similarity(tagVec, apsD2V.docvecs[tagInner].reshape(1, -1))[0][0])
    heatmapMatrix.append(column)
heatmapMatrix = np.array(heatmapMatrix)

In [None]:
heatmapMatrix.shape

In [None]:
fig, ax = plt.subplots()
hmap = ax.pcolor(heatmapMatrix, cmap='terrain')
cbar = plt.colorbar(hmap)

cbar.set_label('cosine similarity', rotation=270)
a = ax.set_xticks(np.arange(heatmapMatrix.shape[1]) + 0.5, minor=False)
a = ax.set_yticks(np.arange(heatmapMatrix.shape[0]) + 0.5, minor=False)

a = ax.set_xticklabels(keywords, minor=False, rotation=270)
a = ax.set_yticklabels(keywords, minor=False)

Now let's look at a heatmap of similarities between the first ten documents in the corpus:

In [None]:
targetDocs = NYTsampleDF2['doi'][:10]

heatmapMatrixD = []

for tagOuter in targetDocs:
    column = []
    tagVec = NYTsampleD2V.docvecs[tagOuter].reshape(1, -1)
    for tagInner in targetDocs:
        column.append(sklearn.metrics.pairwise.cosine_similarity(tagVec, NYTsampleD2V.docvecs[tagInner].reshape(1, -1))[0][0])
    heatmapMatrixD.append(column)
heatmapMatrixD = np.array(heatmapMatrixD)

In [None]:
fig, ax = plt.subplots()
hmap = ax.pcolor(heatmapMatrixD, cmap='terrain')
cbar = plt.colorbar(hmap)

cbar.set_label('cosine similarity', rotation=270)
a = ax.set_xticks(np.arange(heatmapMatrixD.shape[1]) + 0.5, minor=False)
a = ax.set_yticks(np.arange(heatmapMatrixD.shape[0]) + 0.5, minor=False)

a = ax.set_xticklabels(targetDocs, minor=False, rotation=270)
a = ax.set_yticklabels(targetDocs, minor=False)

Now let's look at a heatmap of similarities between the first ten documents and our keywords:

In [None]:
heatmapMatrixC = []

for tagOuter in targetDocs:
    column = []
    tagVec = NYTsampleD2V.docvecs[tagOuter].reshape(1, -1)
    for tagInner in keywords:
        column.append(sklearn.metrics.pairwise.cosine_similarity(tagVec, NYTsampleD2V.docvecs[tagInner].reshape(1, -1))[0][0])
    heatmapMatrixC.append(column)
heatmapMatrixC = np.array(heatmapMatrixC)

In [None]:
fig, ax = plt.subplots()
hmap = ax.pcolor(heatmapMatrixC, cmap='terrain')
cbar = plt.colorbar(hmap)

cbar.set_label('cosine similarity', rotation=270)
a = ax.set_xticks(np.arange(heatmapMatrixC.shape[1]) + 0.5, minor=False)
a = ax.set_yticks(np.arange(heatmapMatrixC.shape[0]) + 0.5, minor=False)

a = ax.set_xticklabels(keywords, minor=False, rotation=270)
a = ax.set_yticklabels(targetDocs, minor=False)

We will save the model in case we would like to use it again.

In [None]:
NYTsampleD2V.save('data/NYTsampleD2V')

We can later load it:

In [None]:
#apsD2V = gensim.models.word2vec.Word2Vec.load('data/apsW2V')

## <span style="color:red">*Your Turn*</span>

<span style="color:red">Construct cells immediately below this that calculate the scores for a small sample of documents from outside your corpus to identify which are *closest* to your corpus. Then calculate the scores for a few phrases or sentences to identify the ones most likely to have appeared in your corpus. Interrogate patterns associated with these document/phrase scores (e.g., which companies produced job ads most or least likely to find jobseekers in the resume corpus?) What do these patterns suggest about the boundaries of your corpus?

# The Score Function

The score function is a simple calculation developed by [Matt Taddy](https://arxiv.org/pdf/1504.07295.pdf) to calculate the likelihood that a given text would have been generated by a word-embedding model by summing the inner product between each pair of the text's word vectors. 

Here, we explore this using a model trained with millions of resumes from the CareerBuilder website (we can't share the private resumes...but we can share a model built with them :-):

In [None]:
resume_model  = gensim.models.word2vec.Word2Vec.load('/mnt/efs/resources/shared/Notebook-4-data/resume.model')

We can examine the vacabularies of this model by building a word-index map:

In [None]:
vocab = resume_model.index2word

Let's load a few job ads. Here, we only use a small sample of all of them. Uncomment this cell if you want to load more.

In [None]:
# with open('data/joblistings.merged.parsed.unique.grpbyyear.2010-2015.02.tsv','r') as tsv:
#     ads = [line.strip().split('\t') for line in tsv]
    
# adsDF = pandas.DataFrame(ads, columns = ads[0])
# reducedDF = adsDF[['hiringOrganization_organizationName', 'jobDescription', 'jobLocation_address_region', 'jobLocation_geo_latitude', 'jobLocation_geo_longitude', 'qualifications', 'responsibilities']][1:]
# N = reducedDF.shape[0]
# indices = random.sample(range(1, N+1), 100)
# sampleDF = reducedDF.iloc[indices]
# sampleDF.to_csv('data/SampleJobAds.csv')

Let's just load the sample and take a look at it. The sentences in each job description are already tokenized and normalized.

In [None]:
NYTsampleDF3 = pandas.read_pickle('NYTsample.pkl')
NYTsampleDF3['tokenized_words'] = NYTsampleDF3['abstract'].apply(lambda x: nltk.word_tokenize(x))
NYTsampleDF3['normalized_words'] = NYTsampleDF3['tokenized_words'].apply(lambda x: normlizeTokens(x, stopwordLst = stop_words_nltk))

#We need to convert the last couple columns from strings to lists
NYTsampleDF3['tokenized_sents'] = NYTsampleDF3['tokenized_sents'].apply(lambda x: eval(x))
NYTsampleDF3['normalized_sents'] = NYTsampleDF3['normalized_sents'].apply(lambda x: eval(x))
NYTsampleDF3

Let's define a function to calculate the likelihood of each job description. The idea is borrowed from [Matt Taddy](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/deepir.ipynb), who shows how a document can be characterized as the inner product of the distance between its words. In other words, this analysis will show which job ads are most likely to find an appropriate pool of workers in the resume bank that generated our word embedding.  

In [None]:
def nytprob(ad, model):
    sen_scores = model.score(ad, len(ad))
    nyt_score = sen_scores.mean()
    return nyt_score

Let's apply this function to every job description.

In [None]:
NYTsampleDF3['likelihood'] = NYTsampleDF3['normalized_sents'].apply(lambda x: nytprob(x, resume_model))

Let's take a look at the top 5 job descriptions that have the highest likelihood.

In [None]:
for ad in NYTsampleDF3.sort_values(by = 'likelihood', ascending = False)['jobDescription'][:5]:
    print (ad + '\n\n')

Let's take a look at the bottom 5 job descriptions that have the lowest likelihood to be matched by the resumes.

In [None]:
for article in NYTsampleDF3.sort_values(by = 'likelihood')['jobDescription'][:5]:
    print (article + '\n\n')

We can do the same for phrases corresponding to job skills.

In [None]:
nytprob([["python", "programming"]], resume_model)

In [None]:
nytprob([["basic", "programming"]], resume_model)

Basic programming appears to be more likely in this pool of resumes than python programming. 

We can also do some simple statistics. Unfortunately, we don't have a large sample here. Nevertheless, let's first look at the mean likelihood score of each hiring organization. Some organizations will do well to hire on CareerBuilder...while others will not.

In [None]:
NYTsampleDF3.groupby("hiringOrganization_organizationName")[['likelihood']].mean().sort_values('likelihood', ascending = False)

We can also look at the mean likelihood of each state.

In [None]:
NYTsampleDF3.groupby("jobLocation_address_region")[['likelihood']].mean().sort_values('likelihood', ascending = False)

You would increase the sample size if you want to do a more serious study.

## <span style="color:red">*Your Turn*</span>

<span style="color:red">Construct cells immediately below this that identify semantic dimensions of interest from your data (e.g., gender: man-woman) and project words onto these dimensions. Plot the array of relevant words along each semantic dimension. Which words are most different. Which dimensions are most different? On which dimension are your words most different? Print three short textual examples from the corpus that illustrate the association you have explored.

<span style="color:red">***Stretch***: Project documents from your corpus along a dimension of interest. Sample relevant documents from your corpus with this functionality and explain your rationale? Calculate the cosine of the angle between two dimensions (encoded as vectors) of interest. What does this suggest about the relationship between them within your corpus? 

<span style="color:red">***Super stretch***: Create 90% bootstrap confidence intervals around your word projections onto a given dimension by generating 10 separate word2vec models, sampling $n$ documents (the total number in your corpus) for each, but with replacement. The bounds will be defined as the highest and lowest projection across your 10 samples. Which words are *significantly* different on your semantic dimension of interest?

# Projection

In [None]:
# word2vec model of the NYT articles
nytimes_model = gensim.models.word2vec.Word2Vec.load_word2vec_format('/mnt/efs/resources/shared/Notebook-4-data/nytimes_cbow.reduced.txt')

First we can visualize with dimension reduction

In [None]:
#words to create dimensions
tnytTargetWords = ['man','him','he', 'woman', 'her', 'she', 'black','blacks','African', 'white', 'whites', 'Caucasian', 'rich', 'richer', 'richest', 'expensive', 'wealthy', 'poor', 'poorer', 'poorest', 'cheap', 'inexpensive']
#words we will be mapping
tnytTargetWords += ["doctor","lawyer","plumber","scientist","hairdresser", "nanny","carpenter","entrepreneur","musician","writer", "banker","poet","nurse", "steak", "bacon", "croissant", "cheesecake", "salad", "cheeseburger", "vegetables", "beer", "wine", "pastry", "basketball", "baseball", "boxing", "softball", "volleyball", "tennis", "golf", "hockey", "soccer"]


wordsSubMatrix = []
for word in tnytTargetWords:
    wordsSubMatrix.append(nytimes_model[word])
wordsSubMatrix = np.array(wordsSubMatrix)
wordsSubMatrix

In [None]:
pcaWordsNYT = sklearn.decomposition.PCA(n_components = 50).fit(wordsSubMatrix)
reducedPCA_dataNYT = pcaWordsNYT.transform(wordsSubMatrix)
#T-SNE is theoretically better, but you should experiment
tsneWordsNYT = sklearn.manifold.TSNE(n_components = 2).fit_transform(reducedPCA_dataNYT)

In [None]:
tsneWordsNYT[:,1].shape

In [None]:
fig = plt.figure(figsize = (10,6))
ax = fig.add_subplot(111)
ax.set_frame_on(False)
plt.scatter(tsneWordsNYT[:, 0], tsneWordsNYT[:, 1], alpha = 0) # Making the points invisible
for i, word in enumerate(tnytTargetWords):
    ax.annotate(word, (tsneWordsNYT[:, 0][i],tsneWordsNYT[:, 1][i]), size =  20 * (len(tnytTargetWords) - i) / len(tnytTargetWords))
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
# Define functions for getting dimensions
def normalize(vector):
    normalized_vector = vector / np.linalg.norm(vector)
    return normalized_vector

def dimension(model, positives, negatives):
    diff = sum([normalize(model[x]) for x in positives]) - sum([normalize(model[y]) for y in negatives])
    return diff

In [None]:
# calculate three dimensions: gender, race, and class
Gender = dimension(nytimes_model, ['man','him','he'], ['woman', 'her', 'she'])
Race = dimension(nytimes_model, ['black','blacks','African'], ['white', 'whites', 'Caucasian'])
Class = dimension(nytimes_model, ['rich', 'richer', 'richest', 'expensive', 'wealthy'], ['poor', 'poorer', 'poorest', 'cheap', 'inexpensive'])

# Words to evaluate/compare
Occupations = ["doctor","lawyer","plumber","scientist","hairdresser", "nanny","carpenter","entrepreneur","musician","writer", "banker","poet","nurse"]
Foods = ["steak", "bacon", "croissant", "cheesecake", "salad", "cheeseburger", "vegetables", "beer", "wine", "pastry"]
Sports  = ["basketball", "baseball", "boxing", "softball", "volleyball", "tennis", "golf", "hockey", "soccer"]

In [None]:
# Define a function to project words in a word list to each of the three dimensions.
def makeDF(model, word_list):
    g = []
    r = []
    c = []
    for word in word_list:
        g.append(sklearn.metrics.pairwise.cosine_similarity(nytimes_model[word].reshape(1,-1), Gender.reshape(1,-1))[0][0])
        r.append(sklearn.metrics.pairwise.cosine_similarity(nytimes_model[word].reshape(1,-1), Race.reshape(1,-1))[0][0])
        c.append(sklearn.metrics.pairwise.cosine_similarity(nytimes_model[word].reshape(1,-1), Class.reshape(1,-1))[0][0])
    df = pandas.DataFrame({'gender': g, 'race': r, 'class': c}, index = word_list)
    return df

In [None]:
# Get the projections.
OCCdf = makeDF(nytimes_model, Occupations) 
Fooddf = makeDF(nytimes_model, Foods)
Sportsdf = makeDF(nytimes_model, Sports)

In [None]:
# define functions for plotting
def Coloring(Series):
    x = Series.values
    y = x-x.min()
    z = y/y.max()
    c = list(plt.cm.rainbow(z))
    return c

def PlotDimension(ax,df, dim):
    ax.set_frame_on(False)
    ax.set_title(dim, fontsize = 20)
    colors = Coloring(df[dim])
    for i, word in enumerate(df.index):
        ax.annotate(word, (0, df[dim][i]), color = colors[i], alpha = 0.6, fontsize = 12)
    MaxY = df[dim].max()
    MinY = df[dim].min()
    plt.ylim(MinY,MaxY)
    plt.yticks(())
    plt.xticks(())

In [None]:
# plot the words in each of the dimentions
fig = plt.figure(figsize = (12,4))
ax1 = fig.add_subplot(131)
PlotDimension(ax1, OCCdf, 'gender')
ax2 = fig.add_subplot(132)
PlotDimension(ax2, OCCdf, 'race')
ax3 = fig.add_subplot(133)
PlotDimension(ax3, OCCdf, 'class')
plt.show()