# Build word2vec Models

Much of this code has been copied and adapted from [Laura Nelson's "measuring_intersectionality" GitHub](https://github.com/lknelson/measuring_intersectionality).

In [3]:
import os, string, warnings, glob, gensim
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from random import choices

# Import NLTK collocation packages.
from nltk.collocations import *
from nltk import bigrams, word_tokenize 
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Ignore warnings.
# NOTE: this silences every warning category for the rest of the session,
# including pandas/gensim deprecation notices.
warnings.simplefilter("ignore")

# Declare the project root directory.
# The MJP_DIR environment variable overrides the original default location so
# the notebook can run on another machine without editing this cell.
# Paths elsewhere in the notebook are built by concatenating onto abs_dir, so
# os.path.join(..., "") guarantees a trailing path separator.
abs_dir = os.path.join(
    os.environ.get("MJP_DIR", "/Users/williamquinn/Documents/DH/Python/MJP/"),
    "",
)

def fast_tokenize(text):
    """Lower-case ``text``, strip punctuation, and split it into tokens.

    Em dashes and newlines are turned into spaces so the words on either
    side stay separate. ASCII punctuation and curly quotation marks are
    deleted outright (so "well-known" becomes "wellknown"). The result is
    split on runs of whitespace.

    Returns a list of token strings; an input with no word characters
    yields an empty list.
    """
    # Characters to delete: ASCII punctuation plus curly single/double quotes.
    strip_chars = string.punctuation + '“' + '”' + '‘' + "’"

    # One translation table does both jobs in a single pass:
    # em dash / newline -> space, punctuation -> removed.
    table = str.maketrans('—\n', '  ', strip_chars)

    return text.lower().translate(table).split()

## Import & Normalize Texts

In [4]:
%%time

# Load the tab-separated document table and discard every row that has a
# missing value in any column.
df = pd.read_csv(abs_dir + 'Output/mjp_documents.txt', sep='\t') \
    .dropna()

# Uncomment to develop against a 2% random sample of the rows.
# df = df.sample(frac = 0.02)

# Lower text field.
df['text'] = df['text'].str.lower()

# Remove numbers & underscores from text field, which signal front matter and ads too strongly.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently turn this step into a no-op.
# r'[\d_]+' removes exactly the characters the old '\d*_*' pattern removed,
# without matching empty strings or relying on an invalid '\d' string escape.
df['text'] = df['text'].str.replace(r'[\d_]+', '', regex=True)

# Remove magazine titles from text fields.
# The text was lower-cased above, so the titles are lower-cased too; a title
# containing capitals could otherwise never match.
mag_titles = [t for t in df['magazine'].str.lower().unique()]
mag_titles = mag_titles + ['the freewoman', 'the new freewoman', 'the egoist']

# NOTE(review): titles are joined into a regex alternation unescaped, so a
# title containing regex metacharacters would be interpreted as a pattern.
# Digits and underscores were already stripped from the text above, so a title
# containing them cannot match here - confirm no such title exists.
df['text'] = df['text'].replace(r'|'.join(mag_titles),
                                ' ', regex = True)

# Split text string into list of words.
df['text'] = df['text'].str.split()

# Two-stage normalization of every token:
#   1. lemmatize with WordNet (called without a part-of-speech argument),
#   2. stem the lemma with the English Snowball stemmer. A stem need not be a
#      dictionary word; ignore_stopwords=True is NLTK's option for leaving
#      stopwords unstemmed.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)

def lemma_and_stem(list_of_words):
    """Return a new list holding the stemmed lemma of each input word.

    Each word is first passed through ``lemmatizer.lemmatize`` and the result
    is then passed through ``stemmer.stem``. Order and length of the input
    list are preserved; the input list itself is not modified.
    """
    reduced = []
    for word in list_of_words:
        lemma = lemmatizer.lemmatize(word)
        reduced.append(stemmer.stem(lemma))
    return reduced

# Normalize the token list of every document.
df['text'] = df['text'].apply(lemma_and_stem)

# Finish the document table in one chained pass. Steps, in order:
#   1. join each token list back into a single space-separated string;
#   2. drop rows that are exact duplicates of an earlier row;
#   3. parse 'date' as YYYY-MM-DD (values that fail to parse become NaT);
#   4. derive a 'year' column from the parsed date;
#   5. drop rows with any missing value (this removes the rows whose date
#      failed to parse);
#   6. prefix every column name with 'meta_';
#   7. title-case the magazine names.
df = (
    df
    .assign(text=lambda frame: frame['text'].str.join(' '))
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date'],
                                              format = '%Y-%m-%d',
                                              errors = 'coerce'))
    .assign(year=lambda frame: frame['date'].dt.year)
    .dropna()
    .add_prefix('meta_')
    .assign(meta_magazine=lambda frame: frame['meta_magazine'].str.title())
)

# Report the final (rows, columns) and preview the first rows.
print(df.shape)
df.head()

(7656, 6)
CPU times: user 3min 22s, sys: 2.4 s, total: 3min 25s
Wall time: 3min 26s


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_text,meta_date,meta_year
0,1,The Little Review,front,literatur drama music art margaret c anderson ...,1914-12-01,1914
1,2,The Little Review,advertisements,for the holiday vaudevill by carolin caffin an...,1914-12-01,1914
2,3,The Little Review,poetry,vol i decemb no poem richard aldington on a mo...,1914-12-01,1914
3,4,The Little Review,articles,a great pilgrimpagan georg soul shakespear in ...,1914-12-01,1914
4,5,The Little Review,poetry,suffici helen hoyt i wish no guardian angel i ...,1914-12-01,1914


## Create List of Sentences 

In [9]:
%%time

# Pool the sentences of every document into one flat list.
sentences = [
    sent
    for document in df['meta_text']
    for sent in sent_tokenize(document)
]

# Tokenize each sentence and keep only those that still contain tokens
# after punctuation removal.
words_by_sentence = [
    tokens for tokens in map(fast_tokenize, sentences) if tokens
]

# Train the word2vec model on the full corpus (sg = 1 selects skip-gram).
model = gensim.models.Word2Vec(
    words_by_sentence,
    vector_size = 150,
    window = 15,
    min_count = 10,
    sg = 1,
    alpha = 0.025,
    batch_words = 10000,
    workers = 3,
)

# Persist only the word vectors, in the plain-text word2vec format.
model.wv.save_word2vec_format(abs_dir + 'Word-Doc_Vectors/Models/mjp_w2v.txt')

CPU times: user 19min 49s, sys: 4.83 s, total: 19min 54s
Wall time: 6min 58s


## Build Multiple Models for Confidence Intervals

In [10]:
%%time

#create 40 random models for constructing confidence intervals
def gen_model(words_by_sentence, num):
    """
    Train one word2vec model and write its word vectors to disk.

    words_by_sentence: list of sentences, each a list of word tokens.
    num: integer used only to name the output file.

    The vectors are saved in word2vec text format to
    abs_dir + 'Word-Doc_Vectors/Models/model_<num>.txt'.
    Nothing is returned.
    """
    # Same hyperparameters as the full-corpus model trained earlier in the notebook.
    hyperparams = {
        'vector_size': 150,
        'window': 15,
        'min_count': 10,
        'sg': 1,
        'alpha': 0.025,
        'batch_words': 10000,
        'workers': 3,
    }
    bootstrap_model = gensim.models.Word2Vec(words_by_sentence, **hyperparams)

    out_path = abs_dir + 'Word-Doc_Vectors/Models/model_%d.txt' % num
    bootstrap_model.wv.save_word2vec_format(out_path)
    
#Number of sentences, for use in creating random sentences
num_sent = len(words_by_sentence)

# Number of bootstrap models to train, and the base seed for resampling.
# Model `num` is resampled with seed bootstrap_seed + num, so any single
# resample can be regenerated without re-running the whole loop.
n_bootstrap_models = 40
bootstrap_seed = 0

for num in range(n_bootstrap_models):
    print(num)

    # Extract a random sample of sentences with replacement, equal in size to
    # the total number of sentences in the full corpus. Sampling indices from
    # a seeded generator makes the resample reproducible (the previous
    # unseeded random.choices call was not).
    # NOTE(review): this fixes the resampled corpus only; gensim's
    # documentation says multi-worker training (workers = 3 in gen_model) is
    # not fully deterministic - confirm if exact vectors must be reproducible.
    rng = np.random.default_rng(bootstrap_seed + num)
    sample_idx = rng.integers(0, num_sent, size=num_sent)
    gen_model([words_by_sentence[i] for i in sample_idx], num)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
CPU times: user 13h 3min 58s, sys: 2min 33s, total: 13h 6min 32s
Wall time: 4h 27min 51s
