## SemEval 2019 Task 4 - Preprocess raw article text

Jonathan Miller and Negar Adyaniyazdi, VCU, CMSC516, Fall 2018

This notebook follows the steps described in the guide linked below, with modifications:

https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

In [1]:
import pandas as pd
import numpy as np

Load article text data

In [2]:
# Relative path to the data directory.
DATA_PATH = '../data/'
# Sub-directory holding the interim CSV files read below.
DATA_INTERIM_PATH = DATA_PATH + 'interim/'

# Load the article text CSVs into DataFrames
# (presumably the training and validation splits -- confirm against the
# notebook that writes these files).
text_train = pd.read_csv(DATA_INTERIM_PATH + 'text_train.csv')
text_val = pd.read_csv(DATA_INTERIM_PATH + 'text_val.csv')

In [3]:
# Take the 'article_text' value of the row labelled 0 as a sample document.
example_text = text_train.loc[0,'article_text']
# Bare expression: displayed as the cell output when run in a notebook.
example_text

"After DeVos Announced Plans To Reexamine Title IX, Texas Attorney Tweets He Would 'Be Ok If [She] Was Sexually Assaulted' When explaining her decision to reevaluate Title IX guidelines as they pertain to sexual assault on college campuses, Secretary of Education Betsy DeVos said: ?Every survivor of sexual misconduct must be taken seriously. Every student accused of sexual misconduct must know that guilt is not predetermined.? \nThe Obama administration?s changes to Title IX have been criticized for, among other things, substantially lowering the burden of proof as it pertains to sexual assault, as well as denying elements of due process to the accused. \nHowever, many progressives are lashing out at DeVos because they hate her, and also rape culture and stuff. \nPerhaps the most grotesque attack came when Rob Ranco, a Texas attorney, tweeted Friday that ?I'm not wishing for it ? but I'd be ok if #BetsyDevos was sexually assaulted. #Sexual Assault #TitleIX.? \nThe Washington Times repo

Initialization

In [4]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from contractions import CONTRACTION_MAP
import unicodedata

In [5]:
# Load the 'en_core_web_md' spaCy model; `nlp` is called by lemmatize_text.
# NOTE(review): the parse/tag/entity keyword arguments look like legacy spaCy
# options -- confirm they are accepted by the installed spaCy version.
nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)
# Tokenizer used by remove_stopwords.
tokenizer = ToktokTokenizer()
# NLTK's English stopword list, with the negations 'no' and 'not' taken out
# so that remove_stopwords does not filter them from the text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

Define text normalization methods

In [6]:
# Remove accented characters
# Sómě Áccěntěd těxt -> Some Accented text

def remove_accented_chars(text):
    """Return *text* with accents stripped and all non-ASCII characters dropped.

    The text is NFKD-decomposed so that an accented letter becomes a base
    letter followed by combining marks; encoding to ASCII with errors ignored
    then discards the marks (and any other non-ASCII code point).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Expand contractions
# Y'all can't expand contractions I'd think -> You all cannot expand contractions I would think

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in *text* and strip any remaining apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. "can't") to its
            expansion (e.g. "cannot"). Keys are treated as literal text and
            matched case-insensitively.

    Returns:
        The text with every known contraction replaced by its expansion. The
        first character of each match is preserved so the original
        capitalisation of the word survives. All apostrophes left over after
        expansion are removed.
    """
    if contraction_mapping:
        # Regex alternation is leftmost-first, not longest-first: put longer
        # keys ahead of their prefixes so "can't've" is not consumed as
        # "can't" followed by a stray "'ve".
        ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
        # Keys are literal strings, so escape them before building the pattern.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Case-folded view of the mapping, used only as a last-resort lookup.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowered_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched
                # instead of failing on a None value.
                return match
            # Keep the first character of the original text (preserves case).
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)
    else:
        # An empty mapping has nothing to expand.
        expanded_text = text

    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Remove special characters, numeric removal optional
# Well this was fun! What do you think? 123#@! -> Well this was fun What do you think (with remove_digits=True)

def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When True, digits are removed as well.

    Returns:
        The cleaned string.
    """
    # The range must be A-Z: the former A-z also spanned the ASCII characters
    # between 'Z' and 'a' ([ \ ] ^ _ `), so those were never removed.
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

# Simple stemming
# My system keeps crashing his crashed yesterday, ours crashes daily ->
# My system keep crash hi crash yesterday, our crash daili

def simple_stemmer(text):
    """Stem each whitespace-separated word of *text* with NLTK's Porter stemmer.

    The stemmed words are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

# Lemmatization
# My system keeps crashing! his crashed yesterday, ours crashes daily ->
# My system keep crash ! his crash yesterday , ours crash daily

def lemmatize_text(text):
    """Replace every token of *text* with its lemma, joined by single spaces.

    The text is run through the module-level `nlp` pipeline. Tokens whose
    lemma is the placeholder '-PRON-' keep their original text instead.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        # '-PRON-' is a placeholder rather than a real word: keep the token text.
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

# Remove Stopwords
# The, and, if are stopwords, computer is not -> , , stopwords , computer not
# Note that 'no' and 'not' have been removed from the stopword list, so they are kept in the text

def remove_stopwords(text, is_lower_case=False):
    """Drop the tokens of *text* that appear in the module-level `stopword_list`.

    Args:
        text: The string to filter; it is split with the module-level
            `tokenizer`.
        is_lower_case: When True, tokens are compared against the stopword
            list as they are; otherwise each token is lowercased for the
            comparison only (the kept token retains its original case).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        comparison_form = token if is_lower_case else token.lower()
        if comparison_form not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

Combine normalization methods into consolidated corpus normalization function

In [7]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Apply the selected normalization steps to every document in *corpus*.

    Steps run in this fixed order, each one gated by its flag: accent removal,
    contraction expansion, lowercasing, newline collapsing (always), 
    lemmatization, special-character removal, whitespace collapsing (always),
    stopword removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Accepted for interface compatibility; no HTML
            stripping step is performed by this function.
        contraction_expansion: Expand contractions via expand_contractions.
        accented_char_removal: Strip accents via remove_accented_chars.
        text_lower_case: Lowercase each document.
        text_lemmatization: Lemmatize via lemmatize_text.
        special_char_removal: Remove special characters via
            remove_special_characters.
        stopword_removal: Remove stopwords via remove_stopwords.
        remove_digits: Passed to remove_special_characters; only has an
            effect when special_char_removal is True.

    Returns:
        A list with one normalized string per input document.
    """
    # Loop-invariant patterns, compiled once instead of once per document.
    # Only CR/LF are matched: the former class [\r|\n|\r\n] also matched a
    # literal '|' and silently turned pipes into spaces.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE: inside the class, "(-)" is a range from '(' to ')', so this matches
    # { . ( ) ! } only -- hyphens are not isolated.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # collapse runs of newlines into a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around these characters first so the words on
            # either side stay separated once the characters are removed
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus