# Data Cleaning & Syntactic Dependencies

In [1]:
%%time

# Import necessary libraries.
import re, warnings, csv, sys, os, datetime, math, itertools, json, spacy, scipy.stats
import pandas as pd
import numpy as np
from operator import itemgetter
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Import NLTK collocation packages.
from nltk.collocations import *
from nltk import bigrams, word_tokenize 
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Import stopwords.
stop_words = set(stopwords.words('english'))

# Import language model.
nlp = spacy.load('en_core_web_sm')

# Ignore warnings.
warnings.simplefilter("ignore")

# Declare directory.
abs_dir = "/Users/williamquinn/Documents/DH/"

CPU times: user 9.47 s, sys: 3.5 s, total: 13 s
Wall time: 34.8 s


## Import & Normalize Data

In [8]:
%%time

df = pd.read_csv(abs_dir + 'Python/MJP/Output/mjp_documents.txt', sep='\t') \
    .dropna()



# df = df.sample(frac = 0.02)

# Lower text field.
df['text'] = df['text'].str.lower()

# Remove numbers & underscores from text field, which signal front matter and ads too strongly.
df['text'] = df['text'].str.replace('\d*_*', '')

# Remove magazine titles from text fields.
mag_titles = [t for t in df['magazine'].unique()]
mag_titles = mag_titles + ['the freewoman', 'the new freewoman', 'the egoist']

df['text'] = df['text'].replace(r'|'.join(mag_titles),
                                ' ', regex = True)

# Split text string into list of words.
df['text'] = df['text'].str.split()

# Lemmatizing reduces to a root synonym
# Stemming: need not be a dictionary word, removes prefix and affix based on few rules.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)

def lemma_and_stem(list_of_words):
    return [lemmatizer.lemmatize(w) for w in list_of_words] # Currently only lemmatizer.
#     return [stemmer.stem(lemmatizer.lemmatize(w)) for w in list_of_words]
  
df['text'] = df['text'].apply(lemma_and_stem)

# Join word list into single string.
df['text'] = df['text'].str.join(' ')

# Remove duplicate rows.
df = df.drop_duplicates()

# Convert 'date' column to date time.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')

# Create "year" column.
df['year'] = df['date'].dt.year

# Drop rows with na (rows that failed to convert to datetime or do not have kwic).
df = df.dropna()

# Re-name columns.
df = df.add_prefix('meta_')

# Change capitalization of 'magazine' column.
df['meta_magazine'] = df['meta_magazine'].str.title()

print (df.shape)
df.head()

(7656, 6)
CPU times: user 1min 19s, sys: 1.47 s, total: 1min 20s
Wall time: 1min 21s


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_text,meta_date,meta_year
0,1,The Little Review,front,literature drama music art margaret c anderson...,1914-12-01,1914
1,2,The Little Review,advertisements,for the holiday vaudeville by caroline caffin ...,1914-12-01,1914
2,3,The Little Review,poetry,vol i december no poem richard aldington on a ...,1914-12-01,1914
3,4,The Little Review,articles,a great pilgrimpagan george soule shakespeare ...,1914-12-01,1914
4,5,The Little Review,poetry,sufficience helen hoyt i wish no guardian ange...,1914-12-01,1914


## Get Syntactic Dependencies of Keywords

In [14]:
%%time

# "Men" does not lemmatize to "man"
print (lemmatizer.lemmatize('men'))

# Declare keywords to search for (filtering out non-keywords).
keywords = ['woman', 'women', #'she', 'her', 'herself', 'hers', 'girl', 'girls',
            'man', 'men', #'he', 'him', 'himself', 'his', 'boy', 'boys',
            'feminist', 'freewoman', 'bondwoman', 'spinster', 
            'ego', 'egoism', 'egoist']

keywords = list ( set(lemma_and_stem(keywords)) )

keywords

men
CPU times: user 361 µs, sys: 114 µs, total: 475 µs
Wall time: 424 µs


['freewoman',
 'man',
 'egoism',
 'ego',
 'men',
 'feminist',
 'woman',
 'spinster',
 'bondwoman',
 'egoist']

In [3]:
%%time

# Declare function to extract ngrams based on syntax.
# Stricter function (might be missing 'spinster' & 'ego').
def get_syntactic_dependencies(text):
    doc = nlp(text)
#     doc_length = len(doc)
    keyword_bigrams = []
    
#     Retrieve significant bigrams.
    for token in doc:
        # Only accept selected dependencies.
        if token.pos_ == 'NOUN' and token.dep_ in ['nsubj', 'dobj', 'compound']:
            for child in token.children:
                if child.pos_ in ['PROPN', 'NOUN', 'VERB']: # Only accept selected parts of speech.
                    
#                     Organize tuple order so that keyword is always first (helps with counting).
                    if token.lemma_ in keywords:
                        keyword_bigrams.append({token.lemma_: child.lemma_}) # use dict. to unnest later & keep dependencies in order.
#                         keyword_bigrams.append(token.lemma_)
#                         keyword_bigrams.append(child)
                    elif child.lemma_ in keywords:
                        keyword_bigrams.append({child.lemma_: token.lemma_}) # Maintain order
#                         keyword_bigrams.append(token)
#                         keyword_bigrams.append(child.lemma_)

                    else:
                        pass # Keyword isn't in bigram and not needed.
    
#     If noun_bigrams is not empty, return a list of significant bigrams and.
    if keyword_bigrams:
        return keyword_bigrams

# Get ngrams.
df['meta_ngrams'] = df['meta_text'].apply(get_syntactic_dependencies)
df = df.dropna()

# Clean up ngrams.
df['meta_ngrams'] = df['meta_ngrams'].astype('str')
df['meta_ngrams'] = df['meta_ngrams'].apply(lambda x: re.sub(r'[\[\]{}""\'\']', '', x))
df['meta_ngrams'] = df['meta_ngrams'].str.split(', ')

# Split (explode) list of dictionaries.
df = df.explode('meta_ngrams')

# Separate dictionary keys/values within ngrams column.
df['meta_ngrams'] = df['meta_ngrams'].str.split(': ')
df[['source','target']] = pd.DataFrame(df['meta_ngrams'].tolist(), index= df.index)

# Filter out self-references.
df = df.query('source != target')

# Create list by group (source).
df = df \
    .groupby(['meta_mjp_id', 'meta_magazine', 'meta_type', 'meta_date', 'meta_year', 'source'])['target'] \
    .apply(list) \
    .reset_index()

# # Convert date to unix epoch.
# df['unix'] = df['meta_date'].apply(lambda x: datetime.datetime.timestamp(x)*1000)

df.head()

CPU times: user 30min 58s, sys: 4min 18s, total: 35min 17s
Wall time: 36min 56s


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_date,meta_year,source,target
0,4,The Little Review,articles,1914-12-01,1914,man,[professor]
1,4,The Little Review,articles,1914-12-01,1914,woman,[age]
2,6,The Little Review,articles,1914-12-01,1914,man,[poem]
3,6,The Little Review,articles,1914-12-01,1914,woman,[ugli]
4,7,The Little Review,poetry,1914-12-01,1914,man,[poet]


## Separate Keywords from Targets

In [4]:
%%time

def remove_words(targetList, bannedList):
    return [w for w in targetList if w not in bannedList]

df['target'] = df.apply(lambda x: remove_words(x['target'], keywords), axis = 1)

df.head(4)

CPU times: user 90.1 ms, sys: 5.76 ms, total: 95.9 ms
Wall time: 101 ms


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_date,meta_year,source,target
0,4,The Little Review,articles,1914-12-01,1914,man,[professor]
1,4,The Little Review,articles,1914-12-01,1914,woman,[age]
2,6,The Little Review,articles,1914-12-01,1914,man,[poem]
3,6,The Little Review,articles,1914-12-01,1914,woman,[ugli]


## Save Dependencies

In [6]:
%%time

# Save targets as object (i.e., list)
df['target'] = df['target'].astype(object)

df.to_csv(abs_dir + 'Marsden_Magazines/DH2022_LanguageEvolution/Data_Outputs/syntactic-dep-vectors.csv',
                sep = '\t', index = False)

CPU times: user 78.2 ms, sys: 6.07 ms, total: 84.3 ms
Wall time: 90.9 ms
