# Create Word Context Embeddings

In [1]:
# Show the interpreter's module search path (environment debugging aid).
import sys
sys.path

['/Users/caitlinmoroney/Documents/Grad_School_American/Boukouvalas_RA/30 June 2020/PMI/laplace_0/shifted_5',
 '/Users/caitlinmoroney/opt/anaconda3/lib/python37.zip',
 '/Users/caitlinmoroney/opt/anaconda3/lib/python3.7',
 '/Users/caitlinmoroney/opt/anaconda3/lib/python3.7/lib-dynload',
 '',
 '/Users/caitlinmoroney/opt/anaconda3/lib/python3.7/site-packages',
 '/Users/caitlinmoroney/opt/anaconda3/lib/python3.7/site-packages/IPython/extensions',
 '/Users/caitlinmoroney/.ipython']

In [2]:
# Record the platform and the library versions this notebook was executed with.
import platform
import sys

import numpy
import scipy
import sklearn

print(platform.platform())
for label, version in (("Python", sys.version),
                       ("NumPy", numpy.__version__),
                       ("SciPy", scipy.__version__),
                       ("Scikit-Learn", sklearn.__version__)):
    print(label, version)

Darwin-19.5.0-x86_64-i386-64bit
Python 3.7.7 (default, May  6 2020, 04:59:01) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
NumPy 1.18.5
SciPy 1.5.0
Scikit-Learn 0.23.1


In [3]:
# import modules
import numpy as np
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.base import TransformerMixin
from sklearn.decomposition import FastICA, TruncatedSVD, PCA, NMF
from sklearn.preprocessing import StandardScaler
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

In [4]:
# Lookup table that expands contraction fragments picked up by word_tokenize()
# (e.g. "n't", "'ve") and a few informal spellings into full words.
# ContextMatrix consults it token-by-token, and only when lemmatize=True.
# NOTE(review): ContextMatrix strips the ASCII apostrophe (') from each document before
# tokenizing, so the apostrophe-prefixed keys below may never be matched — confirm.
contractions = {
    "n't": 'not',
    "'ve": 'have',
    "'s": 'is', # caveat: a possessive "'s" is rewritten to 'is' as well
    'gonna': 'going to',
    'gotta': 'got to',
    "'d": 'would',
    "'ll": 'will',
    "'re": 'are',
    "'m": 'am',
    'wanna': 'want to'
}

# translate an NLTK part-of-speech tag into the matching WordNet PoS constant
def convert_pos_wordnet(tag):
    """Return the WordNet PoS constant for an NLTK tag, or None if there is no match.

    Only the first character of `tag` (upper-cased) is consulted:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    """
    wordnet_by_initial = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }

    # .get() yields None for any initial outside J/N/V/R
    return wordnet_by_initial.get(tag[0].upper())

In [5]:
class ContextMatrix(TransformerMixin):
    """Build a term-by-context co-occurrence matrix, optionally PMI-weighted.

    `fit` learns the vocabulary (token -> row/column index) from a corpus.
    `transform` counts, for every in-vocabulary target token, the in-vocabulary
    tokens found within +/- `window_size` positions of it in the same document,
    and optionally converts those counts to (shifted / positive) PMI values.
    """

    # characters removed from every document before tokenization
    _STRIP_PATTERN = re.compile(r'[_~`@$%^&*[\]+=\|}{\"\'<>/]+')

    # initialize class & private variables
    def __init__(self,
                 window_size = 4,
                 remove_stopwords = True,
                 add_start_end_tokens = True,
                 lowercase = False,
                 lemmatize = False,
                 pmi = False,
                 spmi_k = 1,
                 laplace_smoothing = 0,
                 pmi_positive = False,
                 sppmi_k = 1):

        """ Params:
                window_size: size of +/- context window (default = 4)
                remove_stopwords: boolean, whether or not to remove NLTK English stopwords
                add_start_end_tokens: boolean, whether or not to append <START> and <END> to the
                beginning/end of each document in the corpus (default = True)
                lowercase: boolean, whether or not to convert words to all lowercase
                lemmatize: boolean, whether or not to lemmatize input text
                pmi: boolean, whether or not to compute pointwise mutual information
                spmi_k: shift constant; log(spmi_k) is subtracted from every PMI value that is
                computed (default = 1, i.e. no shift)
                laplace_smoothing: value every cell of the co-occurrence matrix starts from
                before counting (default = 0)
                pmi_positive: boolean, whether or not to compute positive PMI
                sppmi_k: shift constant used only when pmi_positive is True; log(sppmi_k) is
                subtracted (in addition to the spmi_k shift) before clipping at 0 (default = 1)
        """
        self.window_size = window_size
        self.remove_stopwords = remove_stopwords
        self.add_start_end_tokens = add_start_end_tokens
        self.lowercase = lowercase
        self.lemmatize = lemmatize
        self.pmi = pmi
        self.spmi_k = spmi_k
        self.laplace_smoothing = laplace_smoothing
        self.pmi_positive = pmi_positive
        self.sppmi_k = sppmi_k
        self.corpus = None
        self.clean_corpus = None
        self.vocabulary = None
        self.X = None
        self.doc_terms_lists = None

    def _load_stop_words(self):
        """ Return the NLTK English stopword set, or an empty set when stopword removal is
            disabled. Built once per fit/transform call instead of once per token.
        """
        if self.remove_stopwords:
            return set(stopwords.words('english'))
        return set()

    def _prepare_tokens(self, text, stop_words, lemmatizer):
        """ Clean and tokenize one document exactly the same way for fit and transform.

            Params:
                text: the raw document string
                stop_words: set of lowercase stopwords (see _load_stop_words)
                lemmatizer: a WordNetLemmatizer instance

            Returns: list of tokens (without <START>/<END> markers)
        """
        # NOTE(review): the ASCII apostrophe is stripped here, so apostrophe-prefixed
        # keys in `contractions` may never be matched below — confirm this is intended.
        text = self._STRIP_PATTERN.sub('', text)
        words = word_tokenize(text)

        if self.remove_stopwords:
            words = [word for word in words if word.lower() not in stop_words]

        if self.lowercase:
            words = [word.lower() for word in words]

        if self.lemmatize:
            lemmas = []
            for word in words:
                # the tag is taken from the token as tokenized, before any expansion
                PoS_tag = pos_tag([word])[0][1]

                # to change contractions to full word form
                if word in contractions:
                    word = contractions[word]

                if PoS_tag[0].upper() in 'JNVR':
                    word = lemmatizer.lemmatize(word, convert_pos_wordnet(PoS_tag))
                else:
                    word = lemmatizer.lemmatize(word)

                lemmas.append(word)

            words = lemmas

        return words

    def fit(self, corpus, y = None):

        """ Learn the dictionary of all unique tokens for given corpus.

            Also stores the raw corpus (self.corpus), the detokenized cleaned documents
            (self.clean_corpus) and the per-document token lists (self.doc_terms_lists).

            Params:
                corpus: iterable of strings
                y: ignored; accepted for scikit-learn API compatibility

            Returns: self
        """
        self.corpus = corpus

        stop_words = self._load_stop_words()
        lemmatizer = WordNetLemmatizer()
        detokenizer = TreebankWordDetokenizer()

        corpus_terms = set()
        clean_corpus = []
        doc_terms_lists = []

        for text in corpus:
            words = self._prepare_tokens(text, stop_words, lemmatizer)

            # detokenize trick taken from this StackOverflow post:
            # https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence
            # and NLTK treebank documentation:
            # https://www.nltk.org/_modules/nltk/tokenize/treebank.html
            clean_corpus.append(detokenizer.detokenize(words))

            corpus_terms.update(words)

            if self.add_start_end_tokens:
                words = ['<START>'] + words + ['<END>']

            doc_terms_lists.append(words)

        self.clean_corpus = clean_corpus
        self.doc_terms_lists = doc_terms_lists

        if self.add_start_end_tokens:
            corpus_terms.update(('<START>', '<END>'))

        # indices follow the sorted order of the unique terms
        self.vocabulary = {term: index for index, term in enumerate(sorted(corpus_terms))}

        return self

    def transform(self, new_corpus = None, y = None):

        """ Compute the co-occurrence matrix for given corpus and window_size, using term dictionary
            obtained with fit method.

            Params:
                new_corpus: iterable of strings (list, pandas Series, ...); when None, the
                corpus passed to fit is used
                y: ignored; accepted for scikit-learn API compatibility

            Returns: dense term-context matrix (shape: target terms by context terms). Cells
            hold smoothed raw counts, or PMI values when pmi is True. In PMI mode, cells with
            no observed co-occurrence (or whose row/column is empty) are left at 0.
        """
        num_terms = len(self.vocabulary)
        window = self.window_size
        X = np.full((num_terms, num_terms), self.laplace_smoothing)

        # only fall back to the fitted corpus when no corpus was supplied at all;
        # a type check against `list` would silently discard Series/tuples/arrays
        if new_corpus is None:
            new_corpus = self.corpus

        stop_words = self._load_stop_words()
        lemmatizer = WordNetLemmatizer()

        for text in new_corpus:
            words = self._prepare_tokens(text, stop_words, lemmatizer)

            if self.add_start_end_tokens:
                words = ['<START>'] + words + ['<END>']

            # look every token up once; None marks tokens outside the fitted dictionary
            indices = [self.vocabulary.get(word) for word in words]

            for i, target_dict_index in enumerate(indices):
                # skip target words that are not in the dictionary
                if target_dict_index is None:
                    continue

                # find left-most and right-most window indices for each target word
                left_end_index = max(i - window, 0)
                right_end_index = min(i + window, len(words) - 1)

                for j in range(left_end_index, right_end_index + 1):
                    # the window includes the target position itself; skip it
                    if j == i:
                        continue

                    context_dict_index = indices[j]
                    if context_dict_index is not None:
                        X[target_dict_index, context_dict_index] += 1

        # if pmi = True, compute pmi matrix from word-context raw frequencies
        # more concise code taken from this StackOverflow post:
        # https://stackoverflow.com/questions/58701337/how-to-construct-ppmi-matrix-from-a-text-corpus
        if self.pmi:
            total = X.sum()
            row_sums = X.sum(axis = 1)
            col_sums = X.sum(axis = 0)

            # observed / expected ratio; cells whose expected count is 0 (empty row or
            # column, or an entirely empty matrix) stay at 0 instead of becoming NaN
            ratio = np.zeros(X.shape, dtype = float)
            if total > 0:
                expected = np.outer(row_sums, col_sums) / total
                np.divide(X, expected, out = ratio, where = expected > 0)

            observed = ratio > 0
            X = np.zeros(ratio.shape, dtype = float)
            X[observed] = np.log(ratio[observed]) - np.log(self.spmi_k)

            if self.pmi_positive:
                X[observed] = np.maximum(X[observed] - np.log(self.sppmi_k), 0)

        # note that X is a dense matrix
        self.X = X

        return X

In [6]:
# Load the tweet dataset from a CSV file located in the working directory.
tweets = pd.read_csv('COVID19_Dataset-text_labels_only.csv')

In [7]:
# Display the loaded DataFrame.
tweets

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


In [8]:
# Configure the co-occurrence transformer: +/-15-token window, lowercased and lemmatized
# tokens, PMI weighting shifted by log(5). Stop-word removal and <START>/<END> padding
# are left at their defaults (both enabled).
cm = ContextMatrix(window_size = 15,
                   lowercase = True,
                   lemmatize = True,
                   pmi = True,
                   spmi_k = 5)

In [9]:
# Learn the vocabulary from the tweet texts and build the word-context matrix.
word_context_matrix = cm.fit_transform(tweets['Tweet'])

In [10]:
# View the matrix with the vocabulary terms as both row and column labels.
pd.DataFrame(word_context_matrix, index = cm.vocabulary, columns = cm.vocabulary)

Unnamed: 0,!,#,(,),",",-,--,.,..,...,....1,zombie,zone,zoomer,zuckerberg,—,‘,’,“,”,❝real
!,0.921877,-2.112565,0.000000,-3.093184,-2.755814,-2.080000,0.000000,-1.982839,0.000000,0.049452,...,0.000000,0.000000,0.00000,0.0,-1.336843,0.000000,-0.998831,-0.862434,-1.940978,0.129494
#,-2.112565,-0.652148,-2.478527,-2.511317,-2.041679,-1.849531,-1.808637,-1.753919,-1.080398,-2.952200,...,0.000000,-2.179010,-1.38578,0.0,-2.259053,-1.867231,-1.872251,-2.947795,-2.863189,0.000000
(,0.000000,-2.478527,-1.529236,1.210563,-1.750749,-2.340601,0.000000,-1.837975,0.000000,-0.498831,...,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,-2.175723,-2.103864,0.000000,0.000000
),-3.093184,-2.511317,1.210563,-1.594815,-1.783539,-2.373391,0.000000,-1.708245,0.000000,-0.531621,...,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,-2.208512,-2.136654,0.000000,0.000000
",",-2.755814,-2.041679,-1.750749,-1.783539,-1.481760,-1.949010,-1.160901,-1.714647,0.000000,-1.899000,...,0.126953,-1.818957,0.00000,0.0,-1.899000,-1.794859,-1.848670,-1.489129,-1.809988,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
‘,0.000000,-1.867231,0.000000,0.000000,-1.794859,0.000000,0.000000,-1.862667,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.057434,0.000000,0.000000,0.000000
’,-0.998831,-1.872251,-2.175723,-2.208512,-1.848670,-1.888476,0.000000,-1.580005,0.000000,-0.739853,...,0.000000,0.000000,0.00000,0.0,0.000000,0.057434,-0.807307,-0.878549,-0.650842,-0.372129
“,-0.862434,-2.947795,-2.103864,-2.136654,-1.489129,-1.123470,-0.335361,-1.719456,0.000000,-0.380313,...,0.000000,0.000000,0.00000,0.0,-0.380313,0.000000,-0.878549,-0.599051,1.318136,0.000000
”,-1.940978,-2.863189,0.000000,0.000000,-1.809988,-1.221186,-0.027612,-1.679970,0.000000,-0.072563,...,0.000000,0.000000,0.00000,0.0,-0.072563,0.000000,-0.650842,1.318136,0.709595,0.000000


In [11]:
# Check the matrix dimensions (target terms x context terms).
word_context_matrix.shape

(2327, 2327)

In [13]:
# Embedding sizes to export; one CSV file is written per size.
comps = [50, 100, 150, 200, 250, 500]

# Standardizing the matrix does not depend on the number of components,
# so it is done once up front instead of once per loop iteration.
std_scaler = StandardScaler()
X_std = std_scaler.fit_transform(word_context_matrix)

for i, n_components in enumerate(comps):
    # random_state follows the position in `comps` (0, 1, 2, ...), so each
    # embedding size keeps the same seed it had before
    ica = FastICA(n_components = n_components, random_state = i)
    embeddings = ica.fit_transform(X_std)
    df = pd.DataFrame(embeddings,
                      index = cm.vocabulary,
                      columns = ['Comp {}'.format(j+1) for j in range(n_components)])
    file_name = 'word_embed_{}.csv'.format(n_components)
    df.to_csv(file_name)