# Import modules

In [34]:
# import modules
import numpy as np
import pandas as pd
import pickle
from scipy.io import loadmat
import re
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet

# Load data

In [2]:
# Load tweets
# The workbook path is relative to the notebook's working directory (one level up).
# The bare `tweets` expression on the last line renders the DataFrame as the cell output.
tweets = pd.read_excel('../COVID19_Dataset-CM-ZB-complete with sources.xlsx')
tweets

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


# Read in word embeddings

In [4]:
# Read in word embeddings
# Each .mat file is loaded into a dict-like object; the arrays themselves are
# pulled out by variable name ('A' and 'S') in the following cells.
A_mat_contents = loadmat('../18Jan2021_Zorro_output/A.mat')
S_mat_contents = loadmat('../18Jan2021_Zorro_output/S.mat')

In [7]:
# Extract the 'A' variable from the loaded .mat contents and display its shape
A = A_mat_contents['A']
A.shape # context words by components

(2327, 250)

In [9]:
# Extract the 'S' variable from the loaded .mat contents and display its shape
S = S_mat_contents['S']
S.shape # components by target words

(250, 2327)

In [13]:
# Transpose S so that rows index words and columns index components,
# i.e. the same orientation as A
S_T = S.T
S_T.shape # target words by components

(2327, 250)

In [17]:
# Load vocabulary list
# The file is opened relative to the notebook's working directory.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# 'tweet_vocab_list' if it was produced by a trusted source.
with open('tweet_vocab_list', 'rb') as f:
    tweet_vocab_list = pickle.load(f)

In [20]:
# Positions in the vocabulary list are later used as row indices into A and
# S_T, so the vocabulary must have exactly one entry per embedding row.
assert len(tweet_vocab_list) == A.shape[0] == S_T.shape[0], \
    'vocabulary size does not match the number of embedding rows'
len(tweet_vocab_list)

2327

In [21]:
# Map every vocabulary word to its position in tweet_vocab_list
vocabulary_dict = {
    vocab_word: position
    for position, vocab_word in enumerate(tweet_vocab_list)
}
vocabulary_dict

{'!': 0,
 '#': 1,
 '(': 2,
 ')': 3,
 ',': 4,
 '-': 5,
 '--': 6,
 '.': 7,
 '..': 8,
 '...': 9,
 '....': 10,
 '1': 11,
 '1,600': 12,
 '1,975': 13,
 '1-': 14,
 '10': 15,
 '10,000': 16,
 '100': 17,
 '1000': 18,
 '10:30': 19,
 '10:30am': 20,
 '10th': 21,
 '11': 22,
 '115': 23,
 '118,000': 24,
 '12': 25,
 '133': 26,
 '14': 27,
 '14:21': 28,
 '1500': 29,
 '16-18': 30,
 '17': 31,
 '170': 32,
 '18': 33,
 '19': 34,
 '1900': 35,
 '19th': 36,
 '1:30': 37,
 '1b': 38,
 '1st': 39,
 '2': 40,
 '2-': 41,
 '2.5': 42,
 '20': 43,
 '20-40s': 44,
 '200': 45,
 '2019': 46,
 '2019-ncovid': 47,
 '2019ncov': 48,
 '2020': 49,
 '2021': 50,
 '21': 51,
 '22': 52,
 '23': 53,
 '24': 54,
 '24thminute': 55,
 '25': 56,
 '26': 57,
 '27': 58,
 '2700': 59,
 '29th': 60,
 '2\uf30e1': 61,
 '3': 62,
 '3,000': 63,
 '3.4': 64,
 '30': 65,
 '30.8': 66,
 '32': 67,
 '3200': 68,
 '36': 69,
 '366': 70,
 '38': 71,
 '39': 72,
 '4': 73,
 '40': 74,
 '40-70': 75,
 '409k': 76,
 '42nd': 77,
 '43': 78,
 '4600': 79,
 '48': 80,
 '5': 81,
 '50': 8

# Derive Tweet Embeddings

In [36]:
# Lookup table that expands the contraction tokens produced by word_tokenize()
# into their full-word form.
contractions = dict([
    ("n't", 'not'),
    ("'ve", 'have'),
    ("'s", 'is'),  # caveat: possessive 's is expanded to 'is' as well
    ('gonna', 'going to'),
    ('gotta', 'got to'),
    ("'d", 'would'),
    ("'ll", 'will'),
    ("'re", 'are'),
    ("'m", 'am'),
    ('wanna', 'want to'),
])

# to convert nltk_pos tags to wordnet-compatible PoS tags
def convert_pos_wordnet(tag):
    """Translate an NLTK PoS tag into the corresponding WordNet PoS constant.

    Only the first character of `tag` (upper-cased) is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other leading character yields None.
    """
    initial = tag[0].upper()
    wordnet_pos_by_initial = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(initial)

In [37]:
# define function to get string embeddings from word embeddings

def get_text_vectors(word_embeddings, # numpy array
                     word_index_dict, # dictionary mapping words to index in array
                     text_list, # list of strings to derive embeddings for
                     remove_stopwords = True,
                     lowercase = True,
                     lemmatize = True,
                     add_start_end_tokens = True):
    """Embed each text as the mean of the embeddings of its in-vocabulary tokens.

    Every text is stripped of a fixed set of punctuation characters, tokenized
    with word_tokenize(), optionally filtered / normalised, and the rows of
    `word_embeddings` that belong to the surviving tokens are averaged.

    Parameters
    ----------
    word_embeddings : 2-D numpy array
        One row per vocabulary word, one column per embedding component.
    word_index_dict : dict
        Maps a word to its row index in `word_embeddings`.
    text_list : sized iterable of str (e.g. list or pandas Series)
        Texts to embed. Items are consumed in iteration order, so a Series
        with a non-default index is handled correctly.
    remove_stopwords : bool
        Drop tokens whose lower-cased form is an NLTK English stopword.
    lowercase : bool
        Lower-case every remaining token.
    lemmatize : bool
        PoS-tag each token on its own, expand it through `contractions`, then
        lemmatize it with WordNet (using the PoS tag when it is an adjective,
        noun, verb or adverb tag).
    add_start_end_tokens : bool
        Wrap the token list in '<START>' / '<END>' before the vocabulary lookup.

    Returns
    -------
    numpy array of shape (len(text_list), word_embeddings.shape[1])
        Row k is the mean embedding of text k. A text with no in-vocabulary
        token gets an all-zero row. The result is always 2-D, including for a
        single text or an empty `text_list`.
    """

    lemmatizer = WordNetLemmatizer()

    # Build the stopword set once instead of once per token.
    stopword_set = set(stopwords.words('english')) if remove_stopwords else set()

    # Pre-allocate the result; rows that never get assigned stay zero, which is
    # the fallback for texts without any known token.
    full_matrix = np.zeros((len(text_list), word_embeddings.shape[1]))

    # enumerate() walks the items positionally, so this does not depend on the
    # index labels of a pandas Series.
    for k, text in enumerate(text_list):
        # NOTE(review): this pattern also removes apostrophes, so the
        # apostrophe-bearing keys of `contractions` look unreachable after
        # tokenization - confirm whether that is intended.
        text = re.sub(r'[_~`@$%^&*[\]+=\|}{\"\'<>/]+', '', text)
        words = word_tokenize(text)

        if remove_stopwords:
            words = [word for word in words if word.lower() not in stopword_set]

        if lowercase:
            words = [word.lower() for word in words]

        if lemmatize:
            lemmas = []
            for word in words:
                # The tag is computed on the single token (no sentence
                # context) and before contraction expansion.
                PoS_tag = pos_tag([word])[0][1]

                # to change contractions to full word form
                word = contractions.get(word, word)

                if PoS_tag[0].upper() in 'JNVR':
                    word = lemmatizer.lemmatize(word, convert_pos_wordnet(PoS_tag))
                else:
                    word = lemmatizer.lemmatize(word)

                lemmas.append(word)

            words = lemmas

        if add_start_end_tokens:
            words = ['<START>'] + words + ['<END>']

        # Row indices of every token we have an embedding for (repeats kept,
        # so a word that occurs twice is counted twice in the mean).
        rows = [word_index_dict[word] for word in words if word in word_index_dict]

        if rows:
            # Indexing with a list always gives a 2-D (n_tokens, n_components)
            # array, so the mean is well defined even for a single token.
            full_matrix[k] = word_embeddings[rows, :].mean(axis=0)

    return full_matrix

In [39]:
# dictionary of word embeddings
# Copies are stored, so the dictionary entries are separate arrays from A / S_T.
# The keys are reused as the suffix of the saved file names in the next cell.
emb = {'A': A.copy(), # context word embeddings
       'ST': S_T.copy()} # target word embeddings

In [40]:
# Build and save one tweet-embedding matrix per word-embedding variant
for key, embeddings in emb.items():
    # get tweet embeddings from word embeddings
    X = get_text_vectors(embeddings, vocabulary_dict, tweets['Tweet'])

    # one .npy file per variant, named after the dictionary key
    file_name = 'tweet_embed_{}.npy'.format(key)
    np.save(file_name, X)