# Notebook 10: Preprocessing Pipelines with Coreference Resolved Corpus

**Project: Data Triage of Transcribed Nixon Tapes** <br>
*Michelle Ballard and April Crompton* <br>
Loyola University Maryland Data Science Project 

**Goal: Evaluate preprocessing pipelines with pronouns resolved**  

## Import Statements

In [1]:
import os
import pickle
import pprint
import re
import sys
from collections import Counter, defaultdict


import numpy as np
import pandas as pd
import spacy
from collections import Counter, defaultdict
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker # for pipes 12, 13

snow_stemmer = SnowballStemmer(language='english')
nlp = spacy.load('en_core_web_sm')
spell = SpellChecker()

In [2]:
# unpickle preserved dataframes 

pkl_file = open('df_chunks_collab_coref.pkl', 'rb')

df_chunks_coref = pickle.load(pkl_file)
print("\n~~~df_chunks_coref~~~\n")
print(df_chunks_coref.info())


~~~df_chunks_coref~~~

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   corpus          141 non-null    object 
 1   speech_final    141 non-null    object 
 2   exhibit         141 non-null    object 
 3   orig_exhibit    141 non-null    object 
 4   Final_CoRef     141 non-null    object 
 5   Count_Replaced  141 non-null    int64  
 6   Count_Tokens    141 non-null    int64  
 7   Ratio_Replaced  141 non-null    float64
dtypes: float64(1), int64(2), object(5)
memory usage: 8.9+ KB
None


## Set corpus-wide functions and variables
This uses df_coref Final_CoRef:

In [3]:
df_corpus = df_chunks_coref[['corpus','exhibit','orig_exhibit','Final_CoRef']].copy()
df_corpus.rename(columns={'Final_CoRef':'speech_final'}, inplace=True)
corpus = df_corpus['speech_final']
id_corpus = df_corpus['exhibit']
index_corpus = df_corpus.index

In [4]:
# Flatten lists into one full list
# ref: http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html
def flatten(line):
    out = []
    for item in line:
        if isinstance(item, list):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out

In [5]:
# Flatten lists into a string
def stringitize(txtlst):
    string = flatten(txtlst)
    string = ' '.join(str(e.strip()) for e in string)
    return string

In [6]:
# Tokenize and define entities within the corpus
    
def sptokenize(txt):
    """Function tokenizes a string, returning a list of spacy tokens for further processing"""
    if isinstance(txt, str):
        doc = nlp(txt)
        sptoken = [t for t in doc]
        return sptoken
    else: 
        print("Function requires a string")
        return None

def tokenize(txt):
    """Function requests spacy tokens and produces a list of text tokens"""
    sptoken = sptokenize(txt)
    tokens = [t.text for t in sptoken]
    return tokens
        
def postagger(txt):
    """ Function requests spacy tokens, and produces a list of tuples of text tokens and pos tags"""
    sptoken = sptokenize(txt)
    tokens = [t.text for t in sptoken]
    pos = [t.tag_ for t in sptoken]
    pos_tagged = list(zip(tokens, pos))
    return pos_tagged

In [7]:
%%time
# create a single, flattened list of pos-tagged tokens within the full corpus
pos_tagged_flat = []
for i,r in enumerate(corpus):
    pos_tagged_flat.extend(postagger(r))
    print(id_corpus[i])

Part_1_exhibit_12
Part_2_exhibit_12
Part_3_exhibit_12
Part_4_exhibit_12
exhibit_13
exhibit_14
exhibit_15
exhibit_19
exhibit_24
exhibit_37
472-004_472-005_472-006
Part_1_051-001
Part_2_051-001
Part_1_472-021
Part_2_472-021
482-017_482-018
002-001_002-002
491-014
538-015
587-003
601-033
Part_1_697-029
Part_2_697-029
342-027
Part_1_741-002
Part_2_741-002
741-010
343-036
347-004
Part_1_779-002
Part_2_779-002
000-000_35d
393-013_393-014
394-021_395-001
854-017
855-010
856-004
Part_1_858-003
Part_2_858-003
862-004
Part_1_862-006
Part_2_862-006
864-004
Part_1_865-014
Part_2_865-014
866-003
Part_1_878-014
Part_2_878-014
Part_3_878-014
882-012
884-007
Part_1_885-007
Part_2_885-007
037-175_037-176
Part_1_886-008
Part_2_886-008
Part_3_886-008
Part_4_886-008
421-018
037-204_037-205
Part_1_422-020
Part_2_422-020
Part_1_422-033
Part_2_422-033
Part_3_422-033
Part_1_423-003
Part_2_423-003
Part_3_423-003
Part_4_423-003
890-019
044-158
Part_1_428-019
Part_2_428-019
Part_3_428-019
Part_4_428-019
896-004


In [146]:
# Review
print(pos_tagged_flat[0:0])

[]


## Build an automated-customized stopwords list

### Words to add: Top frequency terms
These will be added to the stopwords list (to remove from the text).  
They are all lowercase, ascii only, no punctuation.

In [9]:
# Function to add high frequency terms, single-letter words, and common interjections to the stopwords list

def more_stopwords(txtlst):
    # remove punctuation, ascii only, lowercase, counts
    tp = r'\b\w+\b' # includes one-letter words
    cv_i = CountVectorizer(strip_accents='ascii', lowercase=True, token_pattern=tp)
    all_cv_i = cv_i.fit_transform(txtlst) # count all words
    df_dtm = pd.DataFrame(all_cv_i.toarray(), columns=cv_i.get_feature_names_out()) # put the array of features and feature-counts into a DF
    df_dtm.set_index(index_corpus, inplace=True)# ensure index is equal to df_all index

    #   total number of unique words
    dtm_all_feat = df_dtm.shape[1]
    dtm_all_obs = df_dtm.shape[0]
    print("total features:",dtm_all_feat, "\ntotal observations:", dtm_all_obs)
    #   sum total wordcounts
    dtm_ttl = pd.DataFrame(df_dtm.sum(axis=0), columns=['counts']) #sum all values by column in the matrix
    #   pct of total wordcounts
    dtm_ttl['pct'] = dtm_ttl.counts.apply(lambda x: x/dtm_all_feat)
    #   min wordcounts across exhibits
    dtm_ttl['min'] = pd.DataFrame(df_dtm.min(axis=0))
    #   max wordcounts across exhibits
    dtm_ttl['max'] = pd.DataFrame(df_dtm.max(axis=0))

    # find 0.1% features occurring every thousand words or more often overall, ignore case
    top = 0.1
    top_dtm = dtm_ttl[dtm_ttl.pct > top]
    topthou = list(top_dtm.index)
    
    # find all words that are one letter, add them to the list
    toks = df_dtm.columns
    topthou.extend([i for i in toks if len(i)==1])
    
    # find all words that are interjections, add them to the list
    flat_toks = stringitize(toks)
    doc = nlp(flat_toks)
    for tok in doc:
        if tok.tag_=="UH":
            topthou.append(str(tok))
    
    # Ensure the list is unique:
    topthou = list(set(topthou))
    
    #display results#
    print('\nEnsure high frequency words are in the Stopwords list:',len(topthou))
    print(sorted(list(topthou)))
    
    # Sort and display DF
    pd.set_option('display.max_rows',10000)
    display(top_dtm.sort_values(by='pct', ascending=False))
    pd.reset_option('display.max_rows')
    
    return topthou

In [10]:
%%time
# Run the stopwords function on the entire corpus
add_to_stopwords = more_stopwords(corpus)

total features: 10480 
total observations: 141

Ensure high frequency words are in the Stopwords list: 159
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', 'a', 'about', 'ah', 'all', 'and', 'any', 'are', 'as', 'at', 'aw', 'b', 'be', 'because', 'bob', 'but', 'c', 'can', 'colson', 'could', 'd', 'dean', 'did', 'do', 'does', 'duh', 'e', 'eh', 'ehrlichman', 'er', 'f', 'for', 'from', 'g', 'gee', 'get', 'go', 'going', 'got', 'h', 'ha', 'had', 'hah', 'has', 'have', 'heck', 'heh', 'hello', 'here', 'hew', 'hey', 'hf', 'hi', 'hm', 'huh', 'hunt', 'i', 'if', 'in', 'is', 'it', 'j', 'jeez', 'john', 'just', 'k', 'know', 'l', 'lem', 'let', 'll', 'm', 'magruder', 'me', 'mean', 'mitchell', 'n', 'nah', 'no', 'not', 'now', 'now4', 'o', 'of', 'oh', 'ohara', 'on', 'one', 'oops', 'or', 'other', 'out', 'p', 'people', 'please', 'point', 'president', 'q', 'quack', 'quote', 'r', 're', 'right', 's', 'said', 'say', 'see', 'so', 'some', 't', 'that', 'the', 'then', 'there', 'they', 'thing', 'think', 'this', 't

Unnamed: 0,counts,pct,min,max
the,22821,2.177576,3,368
that,20628,1.968321,4,266
i,20374,1.944084,1,377
to,16302,1.555534,1,211
and,14156,1.350763,1,200
you,13326,1.271565,0,260
s,12358,1.179198,0,183
uh,9927,0.947233,0,293
it,9129,0.871088,1,123
a,8561,0.816889,0,146


CPU times: user 1.97 s, sys: 190 ms, total: 2.16 s
Wall time: 2.17 s


### Words to remove: Proper Nouns
This will be used to remove useful words from the stopwords list (to keep in the text).

In [11]:
def proper_nouns(txtlst):
    """Function to generate a dictionary of POS tags for each word in the vocabulary, 
    determine most likely POS for the word in this corpus, create a list of proper nouns 
    to remove from the stopwords list"""
    # Determine the most likely POS tag for each word
    #    assign all POS identified for each instance of each word in the corpus
    tkns_tagged = defaultdict(list)
    for t in txtlst:
        tkns_tagged[t[0]].append(t[1])
    #    count the number of times each POS occurs, and order by most common values
    tkns_tagged_count = {}
    for k,v in tkns_tagged.items():
        tkns_tagged_count[k] = Counter(v).most_common()
    # Most likely part of speech
    intj_pos = {k:'INTJ' for k,v in tkns_tagged_count.items() if 'INTJ' in list(zip(*v))[0]}
    pos_mostlikely = {k:v[0][0] for k,v in tkns_tagged_count.items()}
    
    # Dict of only Proper Nouns
    pos_nnp = {k:v for k,v in pos_mostlikely.items() if v in ['NNP','NNPS']}
    len(pos_nnp)

    # Clean: truly proper nouns will have more uppercase-first-letter instances than lowercase instances within the corpus
    # No words with punctuation except '.' are considered for these; punctuation must be removed for final stopwords review
    pos_not_nnp = []
    pos_nnp_sw = [] 
    for k in pos_nnp.keys():
        try: # if the count of the capitalized word is greater than the count of the lowercased word, it is most likely a proper noun
            if re.search(r'[^A-Za-z\.]|[\-]|\b\w\b|\.\w', k) != None: pos_not_nnp.append(k) # do not consider words with embedded punctuation or single-letter words or words with no uppercase
            elif tkns_tagged_count[k][0][1] <= tkns_tagged_count[k.lower()][0][1]: pos_not_nnp.append(k)
            elif k in intj_pos: pos_not_nnp.append(k) # words that are interjections are not proper nouns
            elif k.lower() in intj_pos: pos_not_nnp.append(k) # words that are interjections are not proper nouns
            else: pos_nnp_sw.append(k)
        except: pos_nnp_sw.append(k)

    # lowercase words to remove
    pos_nnp_not_stopwords = [] 
    for p in pos_nnp_sw:
        p = p.lower()
        pos_nnp_not_stopwords.append(p)
    pos_nnp_not_stopwords = list(set(pos_nnp_not_stopwords)) # Unique only This is the list of proper nouns to remove from the stopwords
    
    #display results#
    print('\nEnsure Proper Nouns are removed from the Stopwords list:',len(pos_nnp_not_stopwords))
    print(sorted(pos_nnp_not_stopwords))
    return pos_nnp_not_stopwords#, pos_mostlikely, pos_nnp

In [12]:
%%time
# Run the function to identify and validate proper nouns within the corpus
rmv_pnouns = proper_nouns(pos_tagged_flat) # This variable will also be used to validate Named Entities


Ensure Proper Nouns are removed from the Stopwords list: 1182
['aah', 'abbie', 'abc', 'abe', 'abzug', 'acapulco', 'aclu', 'acree', 'adam', 'adams', 'administration', 'administrator', 'adn', 'affeldt', 'africa', 'ag', 'agency', 'agnew', 'agronsky', 'ahh', 'al', 'alabama', 'alan', 'albert', 'alex', 'alexander', 'alexandria', 'allen', 'allentown', 'allright', 'almighty', 'alsop', 'ambassador', 'ambrose', 'amendment', 'america', 'americans', 'ampi', 'amy', 'anderson', 'andrews', 'andso', 'angeles', 'anna', 'anolla', 'anthony', 'appropriations', 'april', 'arabs', 'arc', 'arends', 'arhichman', 'arizona', 'arkansas', 'arlington', 'armstrongs', 'army', 'arnold', 'arthur', 'arts', 'ash', 'asi', 'assistant', 'assyrian', 'attorney', 'august', 'australia', 'bacon', 'bahamas', 'bailey', 'baker', 'banking', 'barbados', 'barbara', 'barker', 'baroody', 'barry', 'bart', 'baxter', 'bay', 'bayh', 'bayhs', 'baz', 'bazelon', 'bdb', 'beach', 'beard', 'bebe', 'becker', 'bedford', 'beg', 'belcher', 'bella', 

### Words to remove: Important Words per tfidf weight
Challenge: misspelled words get a higher weight when reviewed with this algorithm

In [13]:
# txtlst is a list of all words
def tfidf_words(txtlst, top=0.2, bottom=0.8):
    """Function to generate a dictionary of POS tags for each word in the vocabulary, 
    determine most likely POS for the word in this corpus, create a list of proper nouns 
    to remove from the stopwords list"""
    top = top # hold default of top 20% of important terms out of stop words list
    bottom = bottom # hold default of bottom 20% of important terms as infrequent terms to remove from the token list
    corpus = txtlst #speech_final
    # run vectorizer
    vectorizer = TfidfVectorizer()
    corpus = vectorizer.fit_transform(corpus)
    # sort features by weights
    wts_tfidf = (dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))
    wts_tfidf = sorted(wts_tfidf.items(), key=lambda item: item[1], reverse=True)
    # select top proportion
    top_wts_tfidf = wts_tfidf[:int(round(len(wts_tfidf)*top,0))]
    top_wts_tfidf = [t[0] for t in top_wts_tfidf] # This is the list to remove from stopwords
    # select bottom proportion
    bottom_wts_tfidf = wts_tfidf[int(round(len(wts_tfidf)*bottom)):]
    bottom_wts_tfidf = [b[0] for b in bottom_wts_tfidf] # This is the list to remove from the token list as Infrequnt Terms
    #display results#
    print("\nRemove top weighted terms from the Stopwords list:",len(top_wts_tfidf))
    print(top_wts_tfidf[:500])
    print("\n\nRemove bottom weighted terms from the Pipeline as Infrequent:",len(bottom_wts_tfidf))
    print(bottom_wts_tfidf[:500])  
    return top_wts_tfidf, bottom_wts_tfidf, wts_tfidf


In [14]:
rmv_tfidf = tfidf_words(corpus)[0] # these will be removed from the stopwords list so they remain in the analysis as important terms
infreq_terms = tfidf_words(corpus)[1] # these will be removed from the analysis as infrequent terms
allwts = tfidf_words(corpus)[2]


Remove top weighted terms from the Stopwords list: 2083
['01', '0l', '111nere', '11ouse', '11ust', '126', '128', '129', '12th', '131', '132', '133', '134', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '161', '162', '163', '164', '165', '168', '171', '175', '180', '1933', '1948', '1952', '1954', '1958', '1964', '1969', '1974', '1eak', '1etter', '1he', '1iat', '1ittle', '340', '3ordon', '400', '406', '440', '492', '498', '4et', '4o', '4th', '500', '505', '5l', '600', '6l', '703', '7et', '87a', 'a1most', 'a1ready', 'aad', 'aah', 'abandon', 'abbie', 'aberration', 'abort', 'aboug', 'abroad', 'absences', 'abso1utely', 'absolve', 'absolving', 'absuidity', 'absurdity', 'abuser', 'abzug', 'ac', 'academic', 'acapulco', 'acceded', 'accelerate', 'accent', 'accepts', 'accidentally', 'accommodate', 'accommodation', 'accomodated', 'accompained', 'accompanied', 'accompanying', 'a

In [15]:
# Terms defined as important
allwts[:len(rmv_tfidf)]

[('01', 5.2626798770413155),
 ('0l', 5.2626798770413155),
 ('111nere', 5.2626798770413155),
 ('11ouse', 5.2626798770413155),
 ('11ust', 5.2626798770413155),
 ('126', 5.2626798770413155),
 ('128', 5.2626798770413155),
 ('129', 5.2626798770413155),
 ('12th', 5.2626798770413155),
 ('131', 5.2626798770413155),
 ('132', 5.2626798770413155),
 ('133', 5.2626798770413155),
 ('134', 5.2626798770413155),
 ('136', 5.2626798770413155),
 ('137', 5.2626798770413155),
 ('138', 5.2626798770413155),
 ('139', 5.2626798770413155),
 ('140', 5.2626798770413155),
 ('141', 5.2626798770413155),
 ('142', 5.2626798770413155),
 ('143', 5.2626798770413155),
 ('144', 5.2626798770413155),
 ('145', 5.2626798770413155),
 ('146', 5.2626798770413155),
 ('147', 5.2626798770413155),
 ('148', 5.2626798770413155),
 ('149', 5.2626798770413155),
 ('150', 5.2626798770413155),
 ('151', 5.2626798770413155),
 ('152', 5.2626798770413155),
 ('153', 5.2626798770413155),
 ('154', 5.2626798770413155),
 ('155', 5.2626798770413155),
 (

In [16]:
# Terms defined as unimportant
allwts[-len(infreq_terms):-1]

[('welcome', 3.7586024802650413),
 ('worrying', 3.7586024802650413),
 ('yours', 3.7586024802650413),
 ('yourself', 3.7586024802650413),
 ('100', 3.653241964607215),
 ('60', 3.653241964607215),
 ('62', 3.653241964607215),
 ('69', 3.653241964607215),
 ('71', 3.653241964607215),
 ('accomplish', 3.653241964607215),
 ('active', 3.653241964607215),
 ('added', 3.653241964607215),
 ('administrative', 3.653241964607215),
 ('air', 3.653241964607215),
 ('amendment', 3.653241964607215),
 ('answering', 3.653241964607215),
 ('anti', 3.653241964607215),
 ('apparent', 3.653241964607215),
 ('approval', 3.653241964607215),
 ('approximately', 3.653241964607215),
 ('arrangement', 3.653241964607215),
 ('authorize', 3.653241964607215),
 ('authorized', 3.653241964607215),
 ('backed', 3.653241964607215),
 ('bastard', 3.653241964607215),
 ('belonged', 3.653241964607215),
 ('blowing', 3.653241964607215),
 ('bow', 3.653241964607215),
 ('boxes', 3.653241964607215),
 ('burden', 3.653241964607215),
 ('capacity', 3.

### Review and refine popular stopwords
Generate the corpus-custom stopwords list

In [17]:
# SciKitLearn Stopwords
pop_stop_words = list(ENGLISH_STOP_WORDS)+["nt"] #sklearn len 318
print("popular stop words:",len(pop_stop_words))
print(pop_stop_words)

# difference between stopwords and Top Thou words from corpus
freq_to_add = [t for t in add_to_stopwords if t not in pop_stop_words]
print("\nadd stopwords from this corpus: +",len(freq_to_add ))
print(freq_to_add)
# add them
cp_stop_words = pop_stop_words+freq_to_add
print("with added stop words:",len(cp_stop_words))

# # remove periods from the proper-nouns list
# nnp_to_rmv_nopunc = list(map(lambda x: x.replace(".",""),rmv_pnouns))
# difference between stopwords and proper nouns
nnp_to_rmv = [k.lower() for k in rmv_pnouns if k.lower() in cp_stop_words]
print("\nremove proper nouns in this corpus: -",len(nnp_to_rmv))
print(nnp_to_rmv)
# remove them
cp_stop_wordsalt = [k for k in cp_stop_words if k not in nnp_to_rmv]
print(len(cp_stop_wordsalt))
# Note: a few interjections will likely sneak back throug with 'remove proper nouns' 
# and that should be ok because they should be very infrequent, not causing a problem in the overall model.

# remove tfidf important terms
tfidf_to_rmv = [r for r in rmv_tfidf if r in cp_stop_wordsalt]
print("\nremove tfidf weighted terms in this corpus: -",len(tfidf_to_rmv))
print(tfidf_to_rmv)
# remove them
cp_stop_words = [r for r in cp_stop_wordsalt if r not in tfidf_to_rmv]
print(len(cp_stop_words))

# Add stemmed version of all stopwords so this step can occur after stemming
flat_toks = stringitize(cp_stop_words)
doc = nlp(flat_toks)
add_stemlem = []
for tok in doc:
    lem = tok.lemma_
    add_stemlem.append(lem)
    add_stemlem.append(snow_stemmer.stem(lem))
add_stemlem = [r for r in add_stemlem if r not in cp_stop_words]
print("\nadd lemmatized and stemmed versions of all stopwords: +",len(add_stemlem),"\n",add_stemlem)
cp_stop_words.extend(add_stemlem)


# Ensure it is a unique list
cp_stop_words = list(set(cp_stop_words ))

popular stop words: 319
['something', 'itself', 'everything', 'very', 'him', 'without', 'describe', 'already', 'call', 'moreover', 'system', 'most', 'no', 'am', 'thick', 'mostly', 'bill', 'de', 'off', 'through', 'themselves', 'over', 'several', 'thru', 'became', 'which', 'onto', 'around', 'via', 'afterwards', 'see', 'serious', 'few', 'amongst', 'for', 'whence', 'others', 'become', 'yourself', 'give', 'co', 'however', 'though', 'here', 'hereupon', 'name', 'against', 'me', 'amoungst', 'while', 'part', 'put', 'cant', 'along', 'between', 'her', 'has', 'thin', 'whose', 'except', 'in', 'there', 'where', 'ten', 'empty', 'whole', 'many', 'across', 'and', 'another', 'yet', 'towards', 'done', 'fire', 'seem', 'up', 'noone', 'should', 're', 'whereupon', 'we', 'hereafter', 'but', 'rather', 'that', 'so', 'eg', 'until', 'seeming', 'are', 'any', 'down', 'much', 'have', 'top', 'somehow', 'more', 'herself', 'whom', 'within', 'from', 'besides', 'may', 'five', 'each', 'full', 'everywhere', 'is', 'above', 

In [18]:
# stopwords output
print(len(cp_stop_words),"stopwords\n",sorted(cp_stop_words))


471 stopwords
 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'I', '_', 'a', 'about', 'abov', 'above', 'across', 'after', 'afterward', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alon', 'alone', 'along', 'alreadi', 'already', 'also', 'although', 'alway', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'ani', 'anoth', 'another', 'any', 'anyhow', 'anyon', 'anyone', 'anyth', 'anything', 'anyway', 'anywher', 'anywhere', 'are', 'around', 'as', 'at', 'aw', 'b', 'back', 'be', 'became', 'becaus', 'because', 'becom', 'become', 'becomes', 'becoming', 'been', 'befor', 'before', 'beforehand', 'behind', 'being', 'below', 'besid', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'c', 'call', 'can', 'cant', 'co', 'con', 'could', 'couldnt', 'cri', 'cry', 'd', 'describ', 'describe', 'detail', 'did', 'do', 'does', 'done', 'down', 'due', 'dure', 'during', 'e', 'each', 'eg', 'eh', 'eight', 'either', 'eleven', 'els', 'else', 'elsewher', 'else

## Identify Relevant Named Entities and N-Grams across the Corpus
Protect these from being impacted by stopwords.  
Relevant is defined as occurring more frequently than the average n-grams

### Find n-grams and named entities

In [19]:
%%time
ngrams = [] # collect ngrams
nents = [] # collect named entitites

# Identify NGrams
for i,r in enumerate(corpus):
    doc = nlp(r)
    for chunk in doc.noun_chunks:
        chktxt = chunk.text
        if len(chktxt.split())>=2:
            ngrams.append(re.sub('\b[^A-Za-z]+|\*+|\-+',"",(chktxt.strip().lower()),flags=re.MULTILINE)) #remove any punctuation at the front, which causes errors

# Identify Named Entities >=2 words
    for ent in doc.ents:
        etxt = ent.text
        if len(etxt.split())>=2:
            nents.append(etxt.strip())


CPU times: user 1min 14s, sys: 9.26 s, total: 1min 23s
Wall time: 1min 23s


### Expand and validate named entities

In [20]:
%%time

# Add Named Entities based on 2+ POS Proper Nouns
named_entity_pos = []
ne = []
for t in pos_tagged_flat:
    if t[1] in ['NNP','NNPS']:
        ne.append(t[0])
    elif len(ne)>=2:
        named_entity_pos.append(" ".join([tok for tok in ne]))
        ne = []
    else: ne = []
                                
nents = nents + named_entity_pos

CPU times: user 142 ms, sys: 1.98 ms, total: 144 ms
Wall time: 143 ms


In [21]:
# Validate terms are Proper Nouns based on the NNP remove-stop words evaluation list
#   do not consider words with embedded punctuation or words that have all-caps
named_entity = [n for n in nents if (
    (n.lower().split()[0] in rmv_pnouns and n.lower().split()[-1] in rmv_pnouns) and
    (re.search(r'[^A-Za-z\.\s]|[\-]|\b[A-Z]{2,}\b', n) == None))]
named_entity = list(set(named_entity)) # unique items
print(len(named_entity))

619


In [22]:
# Review
named_entity = sorted(named_entity, reverse=True) # These named entities will be preserved
named_entity

['white House',
 'justice Bob',
 'john Dean',
 'house house',
 'York City',
 'Yep February',
 'Wilbur Mills',
 'White Rat',
 'White House Press Secretary Ron Ziegler',
 'White House Police',
 'White House Hill',
 'White House Correspondents',
 'White House Aide Charles Colson',
 'White House',
 'Weicker Weicker',
 'Wbat Hoover',
 'Wayne the Wizard',
 'Watergate Haldeman',
 'Watergate Commission',
 'Washington Star',
 'Washington Post',
 'Warren Commission',
 'Walter Jenkins',
 'Wally Johnson',
 'Virgin Islands',
 'Vietnam Marine',
 'Viet Nam',
 'Vice President',
 'Vesco Grand Jury',
 'Vernon Acree',
 'Van Shunway',
 'Van Shumway',
 'United States Treasury',
 'United States Senator',
 'United States Senate',
 'United States Department',
 'United States',
 'Unite House',
 'Ube Hoover',
 'Tunney on Committee',
 'Tree Country Club',
 'Tony Lewis',
 'Tom Pappas',
 'Tom Huston',
 'Tom Dodd',
 'Tom Clark',
 'Tom Bishop',
 'Times for Times',
 'Thomas Lumbard',
 'Teddy Kennedy',
 'Ted Williams'

### Refine and validate n-grams
Reduce noise by considering stopwords and de-duplicating

In [23]:
# standardize the ngrams with lowercase and no words beginning with punctuation or including * or + which cause errors
ngrams = list(set(ngrams)) #ensure unique list
ngrams_cln = [re.sub('\b[^A-Za-z]+|\b[\*\-\.\"\']+|\w*\s?[\,\"\'\-]+\s?\w*', "", n).
              strip().lower() for n in ngrams]

# consider adding 'their' to this list if it will be re-run
# six_sw = [",","the", "this", "that", "those", "these", "a", "an", "another", "all", "any", "and", "as", "at", "her", "his", "ah", "um", "uh"] # these stopwords often occur at the beginning of an n-gram but do not add value

# if it starts with one of the stopwords, it needs to be more than 2 words long to count as an ngram, then remove the stopword
ngrams_clnr = []
for n in ngrams_cln:
    if n:
#         if n.split()[0] in six_sw:
        if n.split()[0] in cp_stop_words:
            if len(n.split())>2:
                ngrams_clnr.append(stringitize([n.split()[1:]]))
        elif len(n.split())>=2: ngrams_clnr.append(n)

# if n_grams already exist as named_entity, remove from n_grams list
ngrams_cln = [n for n in ngrams_clnr if n.title() not in named_entity]          

# count the frequency 
ngrams_count = Counter(ngrams_cln)
ngrams_count = dict(sorted(ngrams_count.items(), key = lambda x: x[1], reverse = True))

#identify ngrams with frequency > average and frequency > 1
freq_ngrams = [] # unique list of above average-used ngrams
n_grams = [] # unique list of ngrams used more than once
ave_ct = round(sum(ngrams_count.values())/len(ngrams_count))
for k,v in ngrams_count.items():
    if v>1: n_grams.append(k)
    if v>ave_ct: freq_ngrams.append(k)
        
print(len(n_grams)) # list of useful ngrams
print(len(freq_ngrams)) # list of relevant ngrams

705
705


In [24]:
# Review Sample
sorted(freq_ngrams[0:-1:50])

['agnew plane',
 'continuing blackmail',
 'executive privilege',
 'haldeman statement',
 'jewish seat',
 'many questions',
 'much knowledge',
 'potential criminal liability',
 'problem area',
 'second meeting',
 'second story job',
 'the reverse',
 'the whole thing',
 'ugly inferences',
 'whole watergate thing']

### Capture N-Grams and Named Entities within the text
identify which n-grams and named entities exist within each text block

In [25]:
# Set up Dataframe to capture pipelines
df_all_pipes = df_corpus[['corpus','exhibit','orig_exhibit','speech_final']].copy()

In [26]:
# Functions to add a pd.Series of a list of relevant N-Grams and Named Entities to the DataFrame
def nm_ent(txt):
    """reviews named_entity list and returns a list of matching items"""
    ne = []
    for n in named_entity:
        x = re.findall(n, txt, flags=re.IGNORECASE)
        if x: ne.append(n)
    ne = flatten(ne)
    return ne
        
def n_grm(txt):
    """reviews ngrams list and returns a list of matching items in a tuple of ngrams and frequent ngrams"""
    ng = []
    fng = []
    for n in n_grams:
        x = re.findall(n, txt, flags=re.IGNORECASE)
        if x: ng.append(n)
    for n in freq_ngrams:
        x = re.findall(n, txt, flags=re.IGNORECASE)
        if x: fng.append(n)
    ng = flatten(ng)
    fng = flatten(fng)
    return ng, fng

In [27]:
%%time
# Capture a list of Named Entities in the DF
df_all_pipes['named_ent'] = df_corpus['speech_final'].apply(lambda t: nm_ent(t))

CPU times: user 18.4 s, sys: 12.6 ms, total: 18.4 s
Wall time: 18.4 s


In [28]:
%%time
# Capture a list of N-Grams in the DF
df_all_pipes['all_ngrams'] = df_all_pipes['speech_final'].apply(lambda t: n_grm(t))

# Split N-Grams into All vs Frequent
df_all_pipes['freq_ngrams'] = df_all_pipes['all_ngrams'].apply(lambda x: x[1])
df_all_pipes['all_ngrams'] = df_all_pipes['all_ngrams'].apply(lambda x: x[0])

CPU times: user 43.6 s, sys: 29.5 ms, total: 43.6 s
Wall time: 43.6 s


In [29]:
# Create allgrams column with a combined list of frequent n_grams and named entities
df_all_pipes["named_ent"] =  df_all_pipes['named_ent']
df_all_pipes["allgrams"] = df_all_pipes['freq_ngrams']+df_all_pipes['named_ent']
# Flatten the list for ease of iteration
df_all_pipes["allgrams"] = df_all_pipes["allgrams"].apply(lambda x: list(set(flatten(x))))

display(df_all_pipes.head())


Unnamed: 0,corpus,exhibit,orig_exhibit,speech_final,named_ent,all_ngrams,freq_ngrams,allgrams
0,WG_Trial,Part_1_exhibit_12,exhibit_12,"John , sit down , sit down . Good morning . W...","[white House, York City, White House, United S...","[press conference, thousand dollars, other thi...","[press conference, thousand dollars, other thi...","[the hell, a lot, good man, York City, that so..."
1,WG_Trial,Part_2_exhibit_12,exhibit_12,"Sure . Uh , there 's no doubt about that . M...","[white House, White House, United States Senat...","[thousand dollars, other thing, other things, ...","[thousand dollars, other thing, other things, ...","[the hell, a lot, two years, that sort, waterg..."
2,WG_Trial,Part_3_exhibit_12,exhibit_12,"Uh , he John does n't go until Friday . Frida...","[white House, York City, White House, John Ehr...","[thousand dollars, other thing, white house st...","[thousand dollars, other thing, white house st...","[the hell, a lot, god damned thing, the troubl..."
3,WG_Trial,Part_4_exhibit_12,exhibit_12,"But , based on what information it would ? Fo...","[white House, White House, Sirica Delays Sente...","[executive privilege, other thing, national se...","[executive privilege, other thing, national se...","[damned thing, public statement, the hell, a l..."
4,WG_Trial,exhibit_13,exhibit_13,"Well , you go round and round and you come up...","[white House, White House, Supposing Mitchell,...","[executive privilege, damn thing, god damn thi...","[executive privilege, damn thing, god damn thi...","[the hell, a lot, Supposing Mitchell, that sor..."


### Set up for preprocessing pipelines
Create needed functions and datasets`

#### Set up text with N-Grams and Named Entities
Create text blocks with n-grams and named entities terms set up as a single token, terms separated with an underscore, keeping the word-set together in the pipelines

In [30]:
%%time  
# copy 'speech_final' to prepare for updates to this string
df_all_pipes["NG_text"] = df_all_pipes["speech_final"].copy()
df_all_pipes["NE_text"] = df_all_pipes["speech_final"].copy()

# Update NG_text with underscores where there are allgrams
# Update NE_text with underscores where there are named entities
rows = len(df_all_pipes)
for r in range(rows):
    for ng in df_all_pipes['allgrams'][r]:
        fng = re.sub(" ","_",ng)
        newtxt = re.sub(ng, fng, df_all_pipes.loc[r,'NG_text'], flags=re.IGNORECASE)
        df_all_pipes.update(pd.DataFrame({"NG_text": [newtxt]}, index = [r]))
    for ne in df_all_pipes['named_ent'][r]:
        fne = re.sub(" ","_",ne)
        newtxte = re.sub(ne, fne, df_all_pipes.loc[r,'NE_text'], flags=re.IGNORECASE)
        df_all_pipes.update(pd.DataFrame({"NE_text": [newtxte]}, index = [r]))

CPU times: user 18.4 s, sys: 92 ms, total: 18.5 s
Wall time: 18.6 s


In [31]:
# Review (ctrl-F "_" to highlight the underscores in the text blocks below)
print(df_all_pipes['NG_text'][80])
print("\n",df_all_pipes['NE_text'][80])

 to be repudiated by the committee , that meets Tuesday .  to , to wait ... Phone rings : Ya , Ya .  .  Very nice , sure would .  Oh great , great .  You 're going over to the ... Oh , you did .  Are n't you going to have dinner there ? Oh , it starts at ten of eight .  I see .  Well , we 're supposed to be there at eight forty five .  at eight forty five , OK . Well we 'll have a wonderful tine and I 'll see you there .  See you when you get back .  Bye . Maybe we should .  I think , I think we 've got to get bouncing .  Ah , that , there , there 's another_thing I 'm thinking of , Bob , is that there's ... I'm thinking of the fact that this  here now , may make the hearings a_hell of a_lot less interesting and also a_hell of alot ... they sure as hell --my wife about that . Sure . I hate to see that stuff keep getting obstructed by Watergate .  That is n't the problem we 're dealing with today .  All this_stuff 's developing on Watergate . This makes Watergate look a_lot worse than W

In [32]:
# Review (ctrl-F "_" to highlight the underscores in the text blocks below)
print(df_all_pipes['NG_text'][2])
print("\n",df_all_pipes['NE_text'][2])

 Uh , he John does n't go until Friday . Friday ... Well , in any event , could we do it . Thursday ?  This meeting : This meeting you ca n't do meeting today , can you ? I do n't think so .  I was suggesting a meeting with Mitchell ... Mitchell , Ehrlichman , DEAN and Bob ' that 's all .  Now , Mitchell has to be there because , uh , uh , Mitchell is seriously involved and , uh , we 're trying to keep , uh , we 've got to see how we , uh , hole we handle it from here on .  We are in the process of having to determine which way to go and , uh , John has thought it through , as well as John can .  I do , I do n't want Moore there on this occasion . No . You have n't told Moore all of this , have you ? Moore 's got , uh , by being with me , has more bits and pieces .  I 've had to give Moore Right . because Moore is making Right . judgments that , uh ... Well , the point is , once you get down to the PR , once you decide what you 're going to do , then we can let Moore know , and so fort

#### Set up Lemmatization & Stemming w POS
First Lemmatize with POS, then Stem.  
Punctuation is removed and text is lowercased for a cleaner result.

Spacy Lemmatizer and POS tag
Per https://devskrol.com/2021/11/28/spacy-stemming-vs-lemmatization/:
SpaCy is built for production use it’s pipelines are more trained and provides more accuracy than NLTK.

NLTK snowball stemmer is precise over large datasets
ref: https://www.geeksforgeeks.org/snowball-stemmer-nlp/
ref: https://www.youtube.com/watch?v=vwjZSNPKjws

Used together, these are effective in reducing noise  

However, stemming creates words that can seem nonsensical, so the stemmed word can be replaced most frequent lemmatized word to make topic review simpler.

In [33]:
# Capture tokens, pos, lem, stem on text
# first uses Spacy to tokenize, POS tag, and lemmatize
# then uses NLTK snowball stemmer
# from nltk.stem.snowball import SnowballStemmer

def lemmer_pos_stemmer(text):
    """Output is a list of tuples.  The tuples have 4 items each.
    Each tuple includes the original text token, the POS tag, the lemmatized word, and the stemmed word"""
    doc = nlp(text)
    tokenz = []
    postagz = []
    lemmaz = []
    stemz = []
    for token in doc:
        tokenz.append(token.text)
        postagz.append(token.tag_)
        lemmaz.append(token.lemma_)
        stemz.append(snow_stemmer.stem(token.lemma_))
    return tokenz, postagz, lemmaz, stemz

In [34]:
%%time
# run the function on the df, first on the N-Grams text and then the Named Entities text
df_all_pipes["token_ng"] = df_all_pipes["NG_text"].apply(lambda txt: lemmer_pos_stemmer(txt))
df_all_pipes["token_ne"] = df_all_pipes["NE_text"].apply(lambda txt: lemmer_pos_stemmer(txt))

CPU times: user 2min 36s, sys: 20.2 s, total: 2min 56s
Wall time: 2min 56s


In [35]:
# split the NG tuple
df_all_pipes["pos_tag_ng"] = df_all_pipes["token_ng"].apply(lambda x: x[1])
df_all_pipes["lemmatized_ng"] = df_all_pipes["token_ng"].apply(lambda x: x[2])
df_all_pipes["stemmed_ng"] = df_all_pipes["token_ng"].apply(lambda x: x[3])
df_all_pipes["token_ng"] = df_all_pipes["token_ng"].apply(lambda x: x[0])

# split the NE tuple
df_all_pipes["pos_tag_ne"] = df_all_pipes["token_ne"].apply(lambda x: x[1])
df_all_pipes["lemmatized_ne"] = df_all_pipes["token_ne"].apply(lambda x: x[2])
df_all_pipes["stemmed_ne"] = df_all_pipes["token_ne"].apply(lambda x: x[3])
df_all_pipes["token_ne"] = df_all_pipes["token_ne"].apply(lambda x: x[0])

df_all_pipes[['token_ng','pos_tag_ng', 'lemmatized_ng', 'stemmed_ng',
                  'token_ne','pos_tag_ne', 'lemmatized_ne', 'stemmed_ne']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   token_ng       141 non-null    object
 1   pos_tag_ng     141 non-null    object
 2   lemmatized_ng  141 non-null    object
 3   stemmed_ng     141 non-null    object
 4   token_ne       141 non-null    object
 5   pos_tag_ne     141 non-null    object
 6   lemmatized_ne  141 non-null    object
 7   stemmed_ne     141 non-null    object
dtypes: object(8)
memory usage: 8.9+ KB


In [36]:
%%time
# Replace the Stemmed word with the highest-frequency lemmatized word for ease of readability

#  Discover the highest frequency lemmatized word for each stemmed word
lemstemdiff = df_all_pipes[["lemmatized_ng","stemmed_ng"]].copy()
numrows = len(lemstemdiff["lemmatized_ng"])
alldifferences = []
for i in range(numrows):
    diffrow = []
    lrow = df_all_pipes["lemmatized_ng"][i]
    srow = df_all_pipes["stemmed_ng"][i]
    if lrow != srow: 
        for wi in range(len(lrow)):
            if lrow[wi].lower() != srow[wi]: diffrow.append((lrow[wi].lower(), srow[wi]))
    alldifferences.append(diffrow)

# Create a dictionary of the lemmatized words for each stemmed word
stem_dict = defaultdict(list)
for a in alldifferences:
    if len(a) == 0: continue #if there are no differences, skip this
    for b in a:
        # lowercase and remove punctuation
        stemmed_b = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",b[1]).lower() #####
        lemmed_b  = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",b[0]).lower() #####
        stem_dict[stemmed_b].append(lemmed_b) # create a list of the lemmatized words in the dict of each stemmed word
        
for k,v in stem_dict.items():
    stem_dict[k] = Counter(v) # count each lemmatized word associated with the stemmed word
    
# Map the most frequent lemmatized term to the stemmed term for ease of readability
def remaplem(txt):
    relemd = []
    for t in txt:
        t = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t).lower() #####
        s = (stem_dict[t])
        if len(s) > 0:
            l = list(s.keys())[0]
        else:
            l = t
        relemd.append(l)
    return relemd

CPU times: user 350 ms, sys: 7.53 ms, total: 358 ms
Wall time: 357 ms


In [37]:
# Create a column of stemmed words that have the most frequent lemmatized word mapped for readability
df_all_pipes["lemstemrelem_ng"] = df_all_pipes['token_ng'].apply(lambda x: remaplem(x))
df_all_pipes["lemstemrelem_ne"] = df_all_pipes['token_ne'].apply(lambda x: remaplem(x))

In [38]:
# CHECK:
#    All 'ng' and 'ne' columns should have the same length
#    The lemstemrelem_ng/ne should have the same number of terms as the stemmed_ng/ne

pd.set_option("max_colwidth", 800)

checkng = df_all_pipes.loc[
    [0,65,84],['token_ng','pos_tag_ng', 'lemmatized_ng', 'stemmed_ng', 'lemstemrelem_ng']]
print("Check NGrams, which should be the same length")
display(checkng)
for r in range(len(checkng)):
    for c in range(len(checkng.columns)):
        print(len(checkng.iloc[r,c]))

checkne = df_all_pipes.loc[
    [0,65,84],['token_ne','pos_tag_ne', 'lemmatized_ne', 'stemmed_ne', 'lemstemrelem_ne']]
print("\nCheck Named Entities, which should be the same length")
display(checkne)
for r in range(len(checkne)):
    for c in range(len(checkne.columns)):
        print(len(checkne.iloc[r,c]))
        
pd.reset_option("max_colwidth")

Check NGrams, which should be the same length


Unnamed: 0,token_ng,pos_tag_ng,lemmatized_ng,stemmed_ng,lemstemrelem_ng
0,"[ , John, ,, sit, down, ,, sit, down, ., Good, morning, ., Well, ,, what, is, the, Dean, summary, of, the, day, about, ?, John, caught, me, on, the_way, out, and, asked, me, about, why, Gray, was, holding, back, on, information, ,, if, that, was, under, instructions, from, us, ., , And, it, ,, uh, ,, it, was, and, it, was, n't, ., Uh, ,, it, was, instructions, proposed, by, the, Attorney_General, ,, consistent, with, Mr., press_conference, statement, that, no, further, raw_data, was, to, be, turned, over, to, the, ..., full_committee, ., ..., full_committee, ., Right, ., And, that, was, ...]","[_SP, NNP, ,, VB, RP, ,, VB, RP, ., JJ, NN, ., UH, ,, WP, VBZ, DT, NNP, NN, IN, DT, NN, IN, ., NNP, VBD, PRP, IN, NNP, RP, CC, VBD, PRP, IN, WRB, NNP, VBD, VBG, RP, IN, NN, ,, IN, DT, VBD, IN, NNS, IN, PRP, ., _SP, CC, PRP, ,, UH, ,, PRP, VBD, CC, PRP, VBD, RB, ., UH, ,, PRP, VBD, NNS, VBN, IN, DT, NNP, ,, JJ, IN, NNP, NNP, NN, IN, DT, JJ, NN, VBD, TO, VB, VBN, RP, IN, DT, :, NN, ., NFP, NN, ., UH, ., CC, DT, VBD, ...]","[ , John, ,, sit, down, ,, sit, down, ., good, morning, ., well, ,, what, be, the, Dean, summary, of, the, day, about, ?, John, catch, I, on, the_way, out, and, ask, I, about, why, Gray, be, hold, back, on, information, ,, if, that, be, under, instruction, from, we, ., , and, it, ,, uh, ,, it, be, and, it, be, not, ., uh, ,, it, be, instruction, propose, by, the, Attorney_General, ,, consistent, with, Mr., press_conference, statement, that, no, further, raw_data, be, to, be, turn, over, to, the, ..., full_committee, ., ..., full_committee, ., right, ., and, that, be, ...]","[ , john, ,, sit, down, ,, sit, down, ., good, morn, ., well, ,, what, be, the, dean, summari, of, the, day, about, ?, john, catch, i, on, the_way, out, and, ask, i, about, whi, gray, be, hold, back, on, inform, ,, if, that, be, under, instruct, from, we, ., , and, it, ,, uh, ,, it, be, and, it, be, not, ., uh, ,, it, be, instruct, propos, by, the, attorney_gener, ,, consist, with, mr., press_confer, statement, that, no, further, raw_data, be, to, be, turn, over, to, the, ..., full_committe, ., ..., full_committe, ., right, ., and, that, be, ...]","[ , johns, , sit, down, , sit, down, , goodness, morning, , well, , what, is, the, deans, summary, of, the, day, about, , johns, caught, me, on, the_way, out, and, asked, me, about, why, gray, was, holding, backing, on, information, , if, thats, was, underlying, instructions, from, us, , , and, its, , uhs, , its, was, and, its, was, nt, , uhs, , its, was, instructions, proposed, by, the, attorney_generals, , consistent, with, mr, press_conference, statement, thats, no, furtherance, raw_data, was, to, being, turned, overly, to, the, , full_committee, , , full_committee, , rightness, , and, thats, was, ...]"
65,"[ , This, story, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, watergate_thing, is, potentially, very, debilitating, around, ., , but, we, have, to, devote, a, large, part, of, our, time, to, keeping, people, busy, in, ,, uh, I, know, ., affirmative, kinds, of, because, thing, involves, people, we, know, ., Yeah, ., thing, involves, ,, frankly, ,, people, who, do, n't, , guilty, ., , This, and, that, ., Yeah, ., And, ,, and, ,, also, for, ,, you, ,, you, do, n't, want, anybody, guilty, ,, or, ,, it, is, n't, the_question, ., We, know, ...]","[_SP, DT, NN, CC, ,, UH, ,, DT, NN, ,, UH, ,, DT, ,, DT, NN, VBZ, RB, RB, VBG, RB, ., _SP, CC, PRP, VBP, TO, VB, DT, JJ, NN, IN, PRP$, NN, IN, VBG, NNS, JJ, RB, ,, UH, PRP, VBP, ., JJ, NNS, IN, IN, NN, VBZ, NNS, PRP, VBP, ., UH, ., NN, VBZ, ,, RB, ,, NNS, WP, VBP, RB, _SP, JJ, ., _SP, DT, CC, DT, ., UH, ., CC, ,, CC, ,, RB, IN, ,, PRP, ,, PRP, VBP, RB, VB, NN, JJ, ,, CC, ,, PRP, VBZ, RB, NN, ., PRP, VBP, ...]","[ , this, story, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, watergate_thing, be, potentially, very, debilitate, around, ., , but, we, have, to, devote, a, large, part, of, our, time, to, keep, people, busy, in, ,, uh, I, know, ., affirmative, kind, of, because, thing, involve, people, we, know, ., yeah, ., thing, involve, ,, frankly, ,, people, who, do, not, , guilty, ., , this, and, that, ., yeah, ., and, ,, and, ,, also, for, ,, you, ,, you, do, not, want, anybody, guilty, ,, or, ,, it, be, not, the_question, ., we, know, ...]","[ , this, stori, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, watergate_th, be, potenti, veri, debilit, around, ., , but, we, have, to, devot, a, larg, part, of, our, time, to, keep, peopl, busi, in, ,, uh, i, know, ., affirm, kind, of, becaus, thing, involv, peopl, we, know, ., yeah, ., thing, involv, ,, frank, ,, peopl, who, do, not, , guilti, ., , this, and, that, ., yeah, ., and, ,, and, ,, also, for, ,, you, ,, you, do, not, want, anybodi, guilti, ,, or, ,, it, be, not, the_quest, ., we, know, ...]","[ , this, story, and, , uhs, , this, one, , uhs, , this, , this, watergate_thing, is, potentially, very, debilitating, around, , , but, we, having, to, devote, a, large, partly, of, ours, timely, to, keeping, people, busy, in, , uhs, i, knowingly, , affirmative, kinds, of, because, thing, involves, people, we, knowingly, , yeah, , thing, involves, , frankly, , people, whos, doing, nt, , guilty, , , this, and, thats, , yeah, , and, , and, , also, for, , you, , you, doing, nt, wants, anybody, guilty, , or, , its, is, nt, the_question, , we, knowingly, ...]"
84,"[ , Okay, ,, be, right, there, ., , Okay, Buddy, ., , Yes, ,, please, ., , Haldeman, ,, please, ., , Thank, you, ., , Hello, ., , Mr._Haldeman, ,, here, you, are, ., , Hope, you, 're, enjoying, this, lovely, day, ., , Uh, ,, I, ', m, afraid, not, ., , Got, to, get, out, and, take, a, look, at, day, ., , It, really, looks, beautiful, outside, ., , Right, ,, you, 're, right, ,, uh, ,, are, you, working, on, haldeman_statement, ?, , Yeah, ,, and, I, ,, uh, ,, talked, to, Bill, ..., , ...]","[_SP, UH, ,, VB, RB, RB, ., _SP, NNP, NNP, ., _SP, UH, ,, UH, ., _SP, NNP, ,, UH, ., _SP, VBP, PRP, ., _SP, UH, ., _SP, NN, ,, RB, PRP, VBP, ., _SP, VBP, PRP, VBP, VBG, DT, JJ, NN, ., _SP, UH, ,, PRP, VBP, VBP, JJ, RB, ., _SP, VBP, TO, VB, RP, CC, VB, DT, NN, IN, NN, ., _SP, PRP, RB, VBZ, JJ, RB, ., _SP, UH, ,, PRP, VBP, JJ, ,, UH, ,, VBP, PRP, VBG, IN, NNP, ., _SP, UH, ,, CC, PRP, ,, UH, ,, VBD, IN, NNP, :, _SP, ...]","[ , okay, ,, be, right, there, ., , Okay, Buddy, ., , yes, ,, please, ., , Haldeman, ,, please, ., , thank, you, ., , hello, ., , mr._haldeman, ,, here, you, be, ., , hope, you, be, enjoy, this, lovely, day, ., , uh, ,, I, ', m, afraid, not, ., , got, to, get, out, and, take, a, look, at, day, ., , it, really, look, beautiful, outside, ., , right, ,, you, be, right, ,, uh, ,, be, you, work, on, haldeman_statement, ?, , yeah, ,, and, I, ,, uh, ,, talk, to, Bill, ..., , ...]","[ , okay, ,, be, right, there, ., , okay, buddi, ., , yes, ,, pleas, ., , haldeman, ,, pleas, ., , thank, you, ., , hello, ., , mr._haldeman, ,, here, you, be, ., , hope, you, be, enjoy, this, love, day, ., , uh, ,, i, ', m, afraid, not, ., , got, to, get, out, and, take, a, look, at, day, ., , it, realli, look, beauti, outsid, ., , right, ,, you, be, right, ,, uh, ,, be, you, work, on, haldeman_stat, ?, , yeah, ,, and, i, ,, uh, ,, talk, to, bill, ..., , ...]","[ , okay, , being, rightness, theres, , , okay, buddy, , , yes, , please, , , haldemans, , please, , , thanks, you, , , hello, , , mr_haldeman, , heres, you, are, , , hopefully, you, re, enjoying, this, lovely, day, , , uhs, , i, , m, afraid, not, , , got, to, gets, out, and, taking, a, looks, at, day, , , its, really, looks, beautiful, outside, , , rightness, , you, re, rightness, , uhs, , are, you, working, on, haldeman_statement, , , yeah, , and, i, , uhs, , talked, to, bill, , , ...]"


7203
7203
7203
7203
7203
7107
7107
7107
7107
7107
4596
4596
4596
4596
4596

Check Named Entities, which should be the same length


Unnamed: 0,token_ne,pos_tag_ne,lemmatized_ne,stemmed_ne,lemstemrelem_ne
0,"[ , John, ,, sit, down, ,, sit, down, ., Good, morning, ., Well, ,, what, is, the, Dean, summary, of, the, day, about, ?, John, caught, me, on, the, way, out, and, asked, me, about, why, Gray, was, holding, back, on, information, ,, if, that, was, under, instructions, from, us, ., , And, it, ,, uh, ,, it, was, and, it, was, n't, ., Uh, ,, it, was, instructions, proposed, by, the, Attorney_General, ,, consistent, with, Mr., press, conference, statement, that, no, further, raw, data, was, to, be, turned, over, to, the, ..., Full, committee, ., ..., full, committee, ., ...]","[_SP, NNP, ,, VB, RP, ,, VB, RP, ., JJ, NN, ., UH, ,, WP, VBZ, DT, NNP, NN, IN, DT, NN, IN, ., NNP, VBD, PRP, IN, DT, NN, RB, CC, VBD, PRP, IN, WRB, NNP, VBD, VBG, RP, IN, NN, ,, IN, DT, VBD, IN, NNS, IN, PRP, ., _SP, CC, PRP, ,, UH, ,, PRP, VBD, CC, PRP, VBD, RB, ., UH, ,, PRP, VBD, NNS, VBN, IN, DT, NNP, ,, JJ, IN, NNP, NNP, NN, NN, IN, DT, JJ, JJ, NNS, VBD, TO, VB, VBN, RP, IN, DT, :, JJ, NN, ., NFP, JJ, NN, ., ...]","[ , John, ,, sit, down, ,, sit, down, ., good, morning, ., well, ,, what, be, the, Dean, summary, of, the, day, about, ?, John, catch, I, on, the, way, out, and, ask, I, about, why, Gray, be, hold, back, on, information, ,, if, that, be, under, instruction, from, we, ., , and, it, ,, uh, ,, it, be, and, it, be, not, ., uh, ,, it, be, instruction, propose, by, the, Attorney_General, ,, consistent, with, Mr., press, conference, statement, that, no, further, raw, datum, be, to, be, turn, over, to, the, ..., full, committee, ., ..., full, committee, ., ...]","[ , john, ,, sit, down, ,, sit, down, ., good, morn, ., well, ,, what, be, the, dean, summari, of, the, day, about, ?, john, catch, i, on, the, way, out, and, ask, i, about, whi, gray, be, hold, back, on, inform, ,, if, that, be, under, instruct, from, we, ., , and, it, ,, uh, ,, it, be, and, it, be, not, ., uh, ,, it, be, instruct, propos, by, the, attorney_gener, ,, consist, with, mr., press, confer, statement, that, no, further, raw, datum, be, to, be, turn, over, to, the, ..., full, committe, ., ..., full, committe, ., ...]","[ , johns, , sit, down, , sit, down, , goodness, morning, , well, , what, is, the, deans, summary, of, the, day, about, , johns, caught, me, on, the, ways, out, and, asked, me, about, why, gray, was, holding, backing, on, information, , if, thats, was, underlying, instructions, from, us, , , and, its, , uhs, , its, was, and, its, was, nt, , uhs, , its, was, instructions, proposed, by, the, attorney_generals, , consistent, with, mr, press, conference, statement, thats, no, furtherance, raw, data, was, to, being, turned, overly, to, the, , full, committee, , , full, committee, , ...]"
65,"[ , This, story, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, Watergate, thing, is, potentially, very, debilitating, around, ., , but, we, have, to, devote, a, large, part, of, our, time, to, keeping, people, busy, in, ,, uh, I, know, ., affirmative, kinds, of, because, thing, involves, people, we, know, ., Yeah, ., thing, involves, ,, frankly, ,, people, who, do, n't, , guilty, ., , This, and, that, ., Yeah, ., And, ,, and, ,, also, for, ,, you, ,, you, do, n't, want, anybody, guilty, ,, or, ,, it, is, n't, the, question, ., ...]","[_SP, DT, NN, CC, ,, UH, ,, DT, NN, ,, UH, ,, DT, ,, DT, NNP, NN, VBZ, RB, RB, VBG, RB, ., _SP, CC, PRP, VBP, TO, VB, DT, JJ, NN, IN, PRP$, NN, IN, VBG, NNS, JJ, RB, ,, UH, PRP, VBP, ., JJ, NNS, IN, IN, NN, VBZ, NNS, PRP, VBP, ., UH, ., NN, VBZ, ,, RB, ,, NNS, WP, VBP, RB, _SP, JJ, ., _SP, DT, CC, DT, ., UH, ., CC, ,, CC, ,, RB, IN, ,, PRP, ,, PRP, VBP, RB, VB, NN, JJ, ,, CC, ,, PRP, VBZ, RB, DT, NN, ., ...]","[ , this, story, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, Watergate, thing, be, potentially, very, debilitate, around, ., , but, we, have, to, devote, a, large, part, of, our, time, to, keep, people, busy, in, ,, uh, I, know, ., affirmative, kind, of, because, thing, involve, people, we, know, ., yeah, ., thing, involve, ,, frankly, ,, people, who, do, not, , guilty, ., , this, and, that, ., yeah, ., and, ,, and, ,, also, for, ,, you, ,, you, do, not, want, anybody, guilty, ,, or, ,, it, be, not, the, question, ., ...]","[ , this, stori, and, ,, uh, ,, this, one, ,, uh, ,, this, ,, this, waterg, thing, be, potenti, veri, debilit, around, ., , but, we, have, to, devot, a, larg, part, of, our, time, to, keep, peopl, busi, in, ,, uh, i, know, ., affirm, kind, of, becaus, thing, involv, peopl, we, know, ., yeah, ., thing, involv, ,, frank, ,, peopl, who, do, not, , guilti, ., , this, and, that, ., yeah, ., and, ,, and, ,, also, for, ,, you, ,, you, do, not, want, anybodi, guilti, ,, or, ,, it, be, not, the, question, ., ...]","[ , this, story, and, , uhs, , this, one, , uhs, , this, , this, watergate, thing, is, potentially, very, debilitating, around, , , but, we, having, to, devote, a, large, partly, of, ours, timely, to, keeping, people, busy, in, , uhs, i, knowingly, , affirmative, kinds, of, because, thing, involves, people, we, knowingly, , yeah, , thing, involves, , frankly, , people, whos, doing, nt, , guilty, , , this, and, thats, , yeah, , and, , and, , also, for, , you, , you, doing, nt, wants, anybody, guilty, , or, , its, is, nt, the, questionable, , ...]"
84,"[ , Okay, ,, be, right, there, ., , Okay, Buddy, ., , Yes, ,, please, ., , Haldeman, ,, please, ., , Thank, you, ., , Hello, ., , Mr._Haldeman, ,, here, you, are, ., , Hope, you, 're, enjoying, this, lovely, day, ., , Uh, ,, I, ', m, afraid, not, ., , Got, to, get, out, and, take, a, look, at, day, ., , It, really, looks, beautiful, outside, ., , Right, ,, you, 're, right, ,, uh, ,, are, you, working, on, Haldeman, statement, ?, , Yeah, ,, and, I, ,, uh, ,, talked, to, Bill, ..., ...]","[_SP, UH, ,, VB, RB, RB, ., _SP, NNP, NNP, ., _SP, UH, ,, UH, ., _SP, NNP, ,, UH, ., _SP, VBP, PRP, ., _SP, UH, ., _SP, NN, ,, RB, PRP, VBP, ., _SP, VBP, PRP, VBP, VBG, DT, JJ, NN, ., _SP, UH, ,, PRP, VBP, VBP, JJ, RB, ., _SP, VBP, TO, VB, RP, CC, VB, DT, NN, IN, NN, ., _SP, PRP, RB, VBZ, JJ, RB, ., _SP, UH, ,, PRP, VBP, JJ, ,, UH, ,, VBP, PRP, VBG, IN, NNP, NN, ., _SP, UH, ,, CC, PRP, ,, UH, ,, VBD, IN, NNP, :, ...]","[ , okay, ,, be, right, there, ., , Okay, Buddy, ., , yes, ,, please, ., , Haldeman, ,, please, ., , thank, you, ., , hello, ., , mr._haldeman, ,, here, you, be, ., , hope, you, be, enjoy, this, lovely, day, ., , uh, ,, I, ', m, afraid, not, ., , got, to, get, out, and, take, a, look, at, day, ., , it, really, look, beautiful, outside, ., , right, ,, you, be, right, ,, uh, ,, be, you, work, on, Haldeman, statement, ?, , yeah, ,, and, I, ,, uh, ,, talk, to, Bill, ..., ...]","[ , okay, ,, be, right, there, ., , okay, buddi, ., , yes, ,, pleas, ., , haldeman, ,, pleas, ., , thank, you, ., , hello, ., , mr._haldeman, ,, here, you, be, ., , hope, you, be, enjoy, this, love, day, ., , uh, ,, i, ', m, afraid, not, ., , got, to, get, out, and, take, a, look, at, day, ., , it, realli, look, beauti, outsid, ., , right, ,, you, be, right, ,, uh, ,, be, you, work, on, haldeman, statement, ?, , yeah, ,, and, i, ,, uh, ,, talk, to, bill, ..., ...]","[ , okay, , being, rightness, theres, , , okay, buddy, , , yes, , please, , , haldemans, , please, , , thanks, you, , , hello, , , mr_haldeman, , heres, you, are, , , hopefully, you, re, enjoying, this, lovely, day, , , uhs, , i, , m, afraid, not, , , got, to, gets, out, and, taking, a, looks, at, day, , , its, really, looks, beautiful, outside, , , rightness, , you, re, rightness, , uhs, , are, you, working, on, haldemans, statement, , , yeah, , and, i, , uhs, , talked, to, bill, , ...]"


7283
7283
7283
7283
7283
7225
7225
7225
7225
7225
4679
4679
4679
4679
4679


In [39]:
def mapstem(txt):
    """ This function maps the second item in a tuple and returns a blank string 
    if the first item in the tuple is null, 
    It is used for mapping stemmed words without replacing stopwords that were deleted"""
    rtn = []
    for t in txt:
        t1 = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t[1]) ###### @Misha: Removes punctuation in the stemmed words
        if t[0].strip(): x = t1
        else: x = str("")
        rtn.append(x)
    return rtn

#### Set up stopwords removal

In [40]:
def rmv_sw(txt, stopwords=cp_stop_words):
    """ This function removes stop"""
    rtn = []
    for t in txt:
        if re.sub(r'[^A-Za-z0-9_]',"",t) in stopwords: t = str("")
        else: t = re.sub(r'\b\w\b',"",t) # remove any remaining single-letter words
        rtn.append(t)
    return rtn

#### Set up TF-IDF low weighted terms and noisy POS removal

In [41]:
# This removes the bottom 20% of terms, 0.8 implemented as default
infreq_terms = infreq_terms 
len(infreq_terms)

# This removes the bottom 40% of terms, doubling the noise removal
infreq_terms60 = tfidf_words(corpus,bottom=0.6)[1]
len(infreq_terms60)


Remove top weighted terms from the Stopwords list: 2083
['01', '0l', '111nere', '11ouse', '11ust', '126', '128', '129', '12th', '131', '132', '133', '134', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '161', '162', '163', '164', '165', '168', '171', '175', '180', '1933', '1948', '1952', '1954', '1958', '1964', '1969', '1974', '1eak', '1etter', '1he', '1iat', '1ittle', '340', '3ordon', '400', '406', '440', '492', '498', '4et', '4o', '4th', '500', '505', '5l', '600', '6l', '703', '7et', '87a', 'a1most', 'a1ready', 'aad', 'aah', 'abandon', 'abbie', 'aberration', 'abort', 'aboug', 'abroad', 'absences', 'abso1utely', 'absolve', 'absolving', 'absuidity', 'absurdity', 'abuser', 'abzug', 'ac', 'academic', 'acapulco', 'acceded', 'accelerate', 'accent', 'accepts', 'accidentally', 'accommodate', 'accommodation', 'accomodated', 'accompained', 'accompanied', 'accompanying', 'a

4166

In [42]:
# Review

print("perjury" in infreq_terms)
# Risk: This will remove some interesting words.

print("definite" in infreq_terms)
# Risk: And will keep some less interesting words

print("definite" in infreq_terms60)

print("kidnap" in infreq_terms60)

print(list((a,b) for (a,b) in allwts if a=="kidnap"))

True
False
False
False
[('kidnap', 5.2626798770413155)]


In [43]:
def rmv_low(txt, terms=infreq_terms):
    """ replaces low weighted terms with empty string; also used for eliminating nix POS"""
    rtn = []
    for t in txt:
        if t in terms: t = str("")
        else: t = t
        rtn.append(t)
    return rtn

In [44]:
def rmv_tflow(txt, terms):
    """replaces items _not_ in a dictionary or list of terms with an empty string."""
    rtn = []
    for t in txt:
        if re.sub(r'[^A-Za-z_]',"",t) not in terms: t = str("")
        rtn.append(t)
    return rtn

#### Set up noisy POS removal

In [45]:
nixpos = ['PRP$','WDT','WP$','IN','EX','WRB','CC','DT','UH','WP',
          'POS','TO','PRP','BES','HVS','MD']

def mappos(txt, terms=nixpos):
    """ return a blank string if t[1] is in the list of nix POS"""
    rtn = []
    for t in txt:
        if t[1] in terms: x = str("")
        else: x = t[0]
        rtn.append(x)
    return rtn

## Build Preprocessing Pipelines

After considering the PreText results, create pipelines that evaluate notable preprocessing options

**Summary of Pipelines**
- Pipeline   0
	Basline - Tokens in a flat list per transcript 
- Pipeline   1
	N-Grams (NG), Named Entities (NE) defined with an underscore 
- Pipeline   2
	NG, NE, lowercase (L), remove stopwords (SW) 
- Pipeline   3
	NG, NE, L, remove punctuation (P) 
- Pipeline  4
	NG, NE, P  
- Pipeline   5
	NG, NE, L, SW, P  
- Pipeline  6
	NG, NE, L, SW, P, lemmatize & stem (S)  
- Pipeline   7
	NG, NE, L, SW, P, S, remove numbers (N)  
- Pipeline   8
	NG, NE, L, SW, P, S, N, remove lowest 20% TF-IDF terms weighted across the corpus (I20)  
- Pipeline   9 
	NE, L, SW, P, S, N, remove lowest 40% TF-IDF terms weighted across the corpus (I40)  
- Pipeline 10
	NE, L, P, S, I40, remove all parts of speech except nouns, verbs, adjectives, and adverbs  (POS)
- Pipeline 11
	NE, L, SW, P, S, N, run TFIDF vectorizer on processed text and remove the lowest weighted 20% of processed terms  (I20p) 
- Pipeline 12
	NG, NE, L, P, S, N, POS, remove misspelled words occurring only once in the dataset (MS), run TFIDF vectorizer on processed text and remove the most frequent 10% of terms and lowest weighted 20% of processed terms  (10I20p)
- Pipeline 13
	NE, L, P, S, N, POS, MS, 10I20p

### Pipeline0:  Basline - Tokens in a flat list per transcript

In [46]:
%%time
# Tokenize Corpus per exhibit
df_all_pipes['Pipeline0'] = df_all_pipes['speech_final'].apply(lambda txt: tokenize(txt))

CPU times: user 1min 18s, sys: 10.6 s, total: 1min 29s
Wall time: 1min 29s


In [47]:
df_all_pipes.loc[45:55]

Unnamed: 0,corpus,exhibit,orig_exhibit,speech_final,named_ent,all_ngrams,freq_ngrams,allgrams,NG_text,NE_text,...,token_ne,pos_tag_ng,lemmatized_ng,stemmed_ng,pos_tag_ne,lemmatized_ne,stemmed_ne,lemstemrelem_ng,lemstemrelem_ne,Pipeline0
45,WSPF,866-003,866-003,"Good morning Mr. President . Hi , how are yo...","[white House, White House, United States Senat...","[other thing, white house people, goddamned th...","[other thing, white house people, goddamned th...","[the hell, a lot, other course, De Loach De, t...","Good morning Mr._President . Hi , how are yo...","Good morning Mr._President . Hi , how are yo...",...,"[ , Good, morning, Mr._President, ., , Hi, ,,...","[_SP, JJ, NN, NN, ., _SP, UH, ,, WRB, VBP, PRP...","[ , good, morning, mr._president, ., , hi, ,,...","[ , good, morn, mr._presid, ., , hi, ,, how, ...","[_SP, JJ, NN, NN, ., _SP, UH, ,, WRB, VBP, PRP...","[ , good, morning, mr._president, ., , hi, ,,...","[ , good, morn, mr._presid, ., , hi, ,, how, ...","[ , goodness, morning, mr_president, , , hi, ...","[ , goodness, morning, mr_president, , , hi, ...","[ , Good, morning, Mr., President, ., , Hi, ,..."
46,WSPF,Part_1_878-014,878-014,"Say , did you raise the question with the Pre...","[white House, White House, Walter Jenkins, Tom...","[executive privilege, thousand dollars, other ...","[executive privilege, thousand dollars, other ...","[the hell, Committee say Committee, a lot, wat...","Say , did you raise the_question with th._pre...","Say , did you raise the question with the Pre...",...,"[ , Say, ,, did, you, raise, the, question, wi...","[_SP, NNP, ,, VBD, PRP, VB, NN, IN, NN, RP, ,,...","[ , Say, ,, do, you, raise, the_question, with...","[ , say, ,, do, you, rais, the_quest, with, th...","[_SP, NNP, ,, VBD, PRP, VB, DT, NN, IN, DT, NN...","[ , Say, ,, do, you, raise, the, question, wit...","[ , say, ,, do, you, rais, the, question, with...","[ , saying, , did, you, raise, the_question, w...","[ , saying, , did, you, raise, the, questionab...","[ , Say, ,, did, you, raise, the, question, wi..."
47,WSPF,Part_2_878-014,878-014,So where do you come out ? -35- Gray 's alrea...,"[white House, White House, Senate Judiciary Co...","[damn thing, other thing, other things, the qu...","[damn thing, other thing, other things, the qu...","[the hell, a lot, specific knowledge, ball gam...",So where do you come out ? -35- Gray 's alrea...,So where do you come out ? -35- Gray 's alrea...,...,"[ , So, where, do, you, come, out, ?, -35-, Gr...","[_SP, RB, WRB, VBP, PRP, VB, RP, ., ,, NNP, VB...","[ , so, where, do, you, come, out, ?, -35-, Gr...","[ , so, where, do, you, come, out, ?, -35-, gr...","[_SP, RB, WRB, VBP, PRP, VB, RP, ., ,, NNP, VB...","[ , so, where, do, you, come, out, ?, -35-, Gr...","[ , so, where, do, you, come, out, ?, -35-, gr...","[ , so, where, doing, you, come, out, , 35, gr...","[ , so, where, doing, you, come, out, , 35, gr...","[ , So, where, do, you, come, out, ?, -35-, Gr..."
48,WSPF,Part_3_878-014,878-014,I did n't know that . That man watched that -...,"[white House, York City, White House, Nixon Ad...","[other thing, national security, the question,...","[other thing, national security, the question,...","[Dick Moore, the hell, a lot, the question, tw...",I did n't know that . That man watched that -...,I did n't know that . That man watched that -...,...,"[ , I, did, n't, know, that, ., That, man, wat...","[_SP, PRP, VBD, RB, VB, DT, ., DT, NN, VBD, IN...","[ , I, do, not, know, that, ., that, man, watc...","[ , i, do, not, know, that, ., that, man, watc...","[_SP, PRP, VBD, RB, VB, DT, ., DT, NN, VBD, IN...","[ , I, do, not, know, that, ., that, man, watc...","[ , i, do, not, know, that, ., that, man, watc...","[ , i, did, nt, knowingly, thats, , thats, man...","[ , i, did, nt, knowingly, thats, , thats, man...","[ , I, did, n't, know, that, ., That, man, wat..."
49,WSPF,882-012,882-012,"Well , I was wondering what I latest developm...","[white House, York City, Wilbur Mills, White H...","[damn thing, press conference, other thing, wh...","[damn thing, press conference, other thing, wh...","[the hell, a lot, good man, Wilbur Mills, two ...","Well , I was wondering what I latest developm...","Well , I was wondering what I latest developm...",...,"[ , Well, ,, I, was, wondering, what, I, lates...","[_SP, UH, ,, PRP, VBD, VBG, WP, PRP, JJS, NNS,...","[ , well, ,, I, be, wonder, what, I, late, dev...","[ , well, ,, i, be, wonder, what, i, late, dev...","[_SP, UH, ,, PRP, VBD, VBG, WP, PRP, JJS, NNS,...","[ , well, ,, I, be, wonder, what, I, late, dev...","[ , well, ,, i, be, wonder, what, i, late, dev...","[ , well, , i, was, wondering, what, i, latest...","[ , well, , i, was, wondering, what, i, latest...","[ , Well, ,, I, was, wondering, what, I, lates..."
50,WSPF,884-007,884-007,"What I think , what it really gets down to , ...","[white House, White House, Ervin Committee]","[damn thing, the question, the hell, much time...","[damn thing, the question, the hell, much time...","[the hell, whole point, Ervin Committee, a sta...","What I think , what it really gets down to , ...","What I think , what it really gets down to , ...",...,"[ , What, I, think, ,, what, it, really, gets,...","[_SP, WP, PRP, VBP, ,, WP, PRP, RB, VBZ, RP, I...","[ , what, I, think, ,, what, it, really, get, ...","[ , what, i, think, ,, what, it, realli, get, ...","[_SP, WP, PRP, VBP, ,, WP, PRP, RB, VBZ, RP, I...","[ , what, I, think, ,, what, it, really, get, ...","[ , what, i, think, ,, what, it, realli, get, ...","[ , what, i, thinking, , what, its, really, ge...","[ , what, i, thinking, , what, its, really, ge...","[ , What, I, think, ,, what, it, really, gets,..."
51,WSPF,Part_1_885-007,885-007,"Well , I 'm trying to figure out with Ehrlich...","[white House, john Dean, White House, Washingt...","[executive privilege, damn thing, other thing,...","[executive privilege, damn thing, other thing,...","[the hell, a lot, specific knowledge, other co...","Well , I 'm trying to figure out with Ehrlich...","Well , I 'm trying to figure out with Ehrlich...",...,"[ , Well, ,, I, ', m, trying, to, figure, out,...","[_SP, UH, ,, PRP, VBP, VBP, VBG, TO, VB, RP, I...","[ , well, ,, I, ', m, try, to, figure, out, wi...","[ , well, ,, i, ', m, tri, to, figur, out, wit...","[_SP, UH, ,, PRP, VBP, VBP, VBG, TO, VB, RP, I...","[ , well, ,, I, ', m, try, to, figure, out, wi...","[ , well, ,, i, ', m, tri, to, figur, out, wit...","[ , well, , i, , m, trying, to, figure, out, w...","[ , well, , i, , m, trying, to, figure, out, w...","[ , Well, ,, I, ', m, trying, to, figure, out,..."
52,WSPF,Part_2_885-007,885-007,"Already , Bob , that 's already put out anywa...",[Fred LaRue],"[other thing, the question, the hell, the way,...","[other thing, the question, the hell, the way,...","[press secretary, the hell, secret fund, other...","Already , Bob , that 's already put out anywa...","Already , Bob , that 's already put out anywa...",...,"[ , Already, ,, Bob, ,, that, 's, already, put...","[_SP, RB, ,, NNP, ,, DT, VBZ, RB, VBN, RP, RB,...","[ , already, ,, Bob, ,, that, be, already, put...","[ , alreadi, ,, bob, ,, that, be, alreadi, put...","[_SP, RB, ,, NNP, ,, DT, VBZ, RB, VBN, RP, RB,...","[ , already, ,, Bob, ,, that, be, already, put...","[ , alreadi, ,, bob, ,, that, be, alreadi, put...","[ , already, , bobs, , thats, s, already, putt...","[ , already, , bobs, , thats, s, already, putt...","[ , Already, ,, Bob, ,, that, 's, already, put..."
53,WSPF,037-175_037-176,037-175_037-176,"John Dean , please . Yes , Mr. President . ...","[john Dean, Thomas Lumbard, Pat Gray, Mr. Pres...","[executive privilege, other thing, two years, ...","[executive privilege, other thing, two years, ...","[big deal, Pat Gray, the hell, a lot, two year...","john_Dean , please . Yes , Mr._president . ...","john_Dean , please . Yes , Mr._President . ...",...,"[ , john_Dean, ,, please, ., , Yes, ,, Mr._Pr...","[_SP, FW, ,, UH, ., _SP, UH, ,, NNP, ., _SP, U...","[ , john_dean, ,, please, ., , yes, ,, Mr._pr...","[ , john_dean, ,, pleas, ., , yes, ,, mr._pre...","[_SP, FW, ,, UH, ., _SP, UH, ,, NN, ., _SP, UH...","[ , john_dean, ,, please, ., , yes, ,, mr._pr...","[ , john_dean, ,, pleas, ., , yes, ,, mr._pre...","[ , john_dean, , please, , , yes, , mr_presid...","[ , john_dean, , please, , , yes, , mr_presid...","[ , John, Dean, ,, please, ., , Yes, ,, Mr., ..."
54,WSPF,Part_1_886-008,886-008,"John , sit down , sit down . Good morning . W...","[white House, York City, White House, United S...","[press conference, thousand dollars, other thi...","[press conference, thousand dollars, other thi...","[the hell, Assistant U.S. Attorney, a lot, goo...","John , sit down , sit down . Good morning . W...","John , sit down , sit down . Good morning . W...",...,"[ , John, ,, sit, down, ,, sit, down, ., Good,...","[_SP, NNP, ,, VB, RP, ,, VB, RP, ., JJ, NN, .,...","[ , John, ,, sit, down, ,, sit, down, ., good,...","[ , john, ,, sit, down, ,, sit, down, ., good,...","[_SP, NNP, ,, VB, RP, ,, VB, RP, ., JJ, NN, .,...","[ , John, ,, sit, down, ,, sit, down, ., good,...","[ , john, ,, sit, down, ,, sit, down, ., good,...","[ , johns, , sit, down, , sit, down, , goodnes...","[ , johns, , sit, down, , sit, down, , goodnes...","[ , John, ,, sit, down, ,, sit, down, ., Good,..."


### Pipeline1: N-Grams (NG), Named Entities (NE) defined with an underscore 

In [48]:
df_all_pipes["Pipeline1"] = df_all_pipes["token_ng"].copy()

In [49]:
df_all_pipes[['speech_final','allgrams',"Pipeline1"]][80:84]

Unnamed: 0,speech_final,allgrams,Pipeline1
80,"to be repudiated by the committee , that meet...","[pretty big bag, a lot, the question, Grand Ju...","[ , to, be, repudiated, by, the, committee, ,,..."
81,"I just , uh , Bebe is over here overnight and...","[damned thing, the hell, other people, the que...","[ , I, just, ,, uh, ,, Bebe, is, over, here, o..."
82,Just wanted to see what John plans were tomor...,"[the hell, important thing, urgent call, a lot...","[ , Just, wanted, to, see, what, John, plans, ..."
83,Poor guy . Who all have you seen this mornin...,"[the hell, loose cannon, other people, same pl...","[ , poor_guy, ., , Who, all, have, you, seen,..."


### Pipeline2: (NG, NE, L, SW) Pipeline 1 + lowercase (L), remove stopwords (SW) 

In [50]:
%%time
df_all_pipes['lower'] = df_all_pipes['Pipeline1'].apply(lambda txt: [str(t).strip().lower() for t in txt])
df_all_pipes['Pipeline2'] = df_all_pipes['lower'].apply(lambda txt: rmv_sw(txt))


CPU times: user 3.8 s, sys: 35.1 ms, total: 3.83 s
Wall time: 3.86 s


In [51]:
# Test
# does the Pipeline have a stop word?
len(df_all_pipes[df_all_pipes.Pipeline2.apply(lambda x: "to" in [w for w in x])])
# Expect 0

0

In [52]:
print(len(df_all_pipes.loc[50,'Pipeline1']),'vs', len(df_all_pipes.loc[50,'lower']),'vs', len(df_all_pipes.loc[50,'Pipeline2']))
df_all_pipes.loc[50:, ['lower', 'Pipeline2']]

890 vs 890 vs 890


Unnamed: 0,lower,Pipeline2
50,"[, what, i, think, ,, what, it, really, gets, ...","[, , , , ,, , , really, gets, , , ,, basically..."
51,"[, well, ,, i, ', m, trying, to, figure, out, ...","[, , ,, , ', , trying, , figure, , , ehrlichma..."
52,"[, already, ,, bob, ,, that, 's, already, put,...","[, , ,, bob, ,, , , , , , , ., ,, , , , ., , ,..."
53,"[, john_dean, ,, please, ., , yes, ,, mr._pres...","[, john_dean, ,, , ., , , ,, mr._president, .,..."
54,"[, john, ,, sit, down, ,, sit, down, ., good, ...","[, john, ,, sit, , ,, sit, , ., good, morning,..."
...,...,...
136,"[, no, ,, i, ', m, not, going, to, bother, wit...","[, , ,, , ', , , , , bother, , , ., , , , , la..."
137,"[, the, ,, uh, ,, fbi, clips, ?, i, have, that...","[, , ,, , ,, fbi, clips, ?, , , , morning, ?, ..."
138,"[, good, which, is, very, good, ., , and, then...","[, good, , , , good, ., , , , , , , time, ., c..."
139,"[, yes, ,, please, ., , mr._haldeman, ,, pleas...","[, , ,, , ., , mr._haldeman, ,, , ., , thank, ..."


### Pipeline3: (NG, NE, L, P) Pipeline 1 + lowercase (L), remove punctuation (P) 

In [53]:
df_all_pipes['Pipeline3'] = df_all_pipes['lower'].apply(
    lambda txt: [re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t) for t in txt])

In [54]:
# Test
# does the Pipeline have a word with punctuation?
len(df_all_pipes[df_all_pipes.Pipeline3.apply(lambda x: "?" in [w for w in x])])
# Expect 0

0

In [55]:
print(len(df_all_pipes.loc[50,'Pipeline1']),'vs', len(df_all_pipes.loc[50,'Pipeline3']))
df_all_pipes.loc[50:100, ['Pipeline1', 'Pipeline3']]

890 vs 890


Unnamed: 0,Pipeline1,Pipeline3
50,"[ , What, I, think, ,, what, it, really, gets,...","[, what, i, think, , what, it, really, gets, d..."
51,"[ , Well, ,, I, ', m, trying, to, figure, out,...","[, well, , i, , m, trying, to, figure, out, wi..."
52,"[ , Already, ,, Bob, ,, that, 's, already, put...","[, already, , bob, , that, s, already, put, ou..."
53,"[ , john_Dean, ,, please, ., , Yes, ,, Mr._pr...","[, john_dean, , please, , , yes, , mr_presiden..."
54,"[ , John, ,, sit, down, ,, sit, down, ., Good,...","[, john, , sit, down, , sit, down, , good, mor..."
55,"[ , But, they, know, ., But, they, know, ., ,...","[, but, they, know, , but, they, know, , , uh,..."
56,"[ , Uh, ,, he, John, does, n't, go, until, Fri...","[, uh, , he, john, does, nt, go, until, friday..."
57,"[ , But, ,, based, on, what, information, it, ...","[, but, , based, on, what, information, it, wo..."
58,"[ , to, this, morning, ., , Well, ,, you, go,...","[, to, this, morning, , , well, , you, go, rou..."
59,"[ , Yes, ?, , Mr._Colson, ,, please, ., , Th...","[, yes, , , mr_colson, , please, , , thank, yo..."


### Pipeline4: (NG, NE, P) Pipeline 1 + remove punctuation (P)

In [56]:
df_all_pipes['Pipeline4'] = df_all_pipes['Pipeline1'].apply(
    lambda txt: [re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t) for t in txt])

In [57]:
# Test
# does the Pipeline have a word with punctuation?
len(df_all_pipes[df_all_pipes.Pipeline4.apply(lambda x: "p.m." in [w for w in x])])
# Expect 0

0

In [58]:
print(len(df_all_pipes.loc[55,'Pipeline1']),'vs', len(df_all_pipes.loc[55,'Pipeline4']))
df_all_pipes.loc[50:100, ['Pipeline1', 'Pipeline4']]

7501 vs 7501


Unnamed: 0,Pipeline1,Pipeline4
50,"[ , What, I, think, ,, what, it, really, gets,...","[ , What, I, think, , what, it, really, gets, ..."
51,"[ , Well, ,, I, ', m, trying, to, figure, out,...","[ , Well, , I, , m, trying, to, figure, out, w..."
52,"[ , Already, ,, Bob, ,, that, 's, already, put...","[ , Already, , Bob, , that, s, already, put, o..."
53,"[ , john_Dean, ,, please, ., , Yes, ,, Mr._pr...","[ , john_Dean, , please, , , Yes, , Mr_presid..."
54,"[ , John, ,, sit, down, ,, sit, down, ., Good,...","[ , John, , sit, down, , sit, down, , Good, mo..."
55,"[ , But, they, know, ., But, they, know, ., ,...","[ , But, they, know, , But, they, know, , , U..."
56,"[ , Uh, ,, he, John, does, n't, go, until, Fri...","[ , Uh, , he, John, does, nt, go, until, Frida..."
57,"[ , But, ,, based, on, what, information, it, ...","[ , But, , based, on, what, information, it, w..."
58,"[ , to, this, morning, ., , Well, ,, you, go,...","[ , to, this, morning, , , Well, , you, go, r..."
59,"[ , Yes, ?, , Mr._Colson, ,, please, ., , Th...","[ , Yes, , , Mr_Colson, , please, , , Thank,..."


### Pipeline5: (NG, NE, L, SW, P) Pipeline 2 + remove stop-words (SW), remove punctuation (P) 

In [59]:
df_all_pipes['Pipeline5'] = df_all_pipes['Pipeline2'].apply(
    lambda txt: [re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t) for t in txt])

In [60]:
# Test
# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline5.apply(lambda x: "-" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline5.apply(lambda x: "to" in [w for w in x])]))
# Expect all 0's

0
0


In [61]:
# Review & Test 
check2 = df_all_pipes.loc[[77, 80, 84],['Pipeline2', 'Pipeline5']]
display(check2)
for r in range(len(check2)):
    for c in range(len(check2.columns)):
        print(len(check2.iloc[r,c]))

# Expect all lengths the same for each example

Unnamed: 0,Pipeline2,Pipeline5
77,"[, , , , come, , :30, ,, , , , better, , ., , ...","[, , , , come, , 30, , , , , better, , , , , ,..."
80,"[, , , repudiated, , , committee, ,, , meets, ...","[, , , repudiated, , , committee, , , meets, t..."
84,"[, okay, ,, , , , ., , okay, buddy, ., , , ,, ...","[, okay, , , , , , , okay, buddy, , , , , , , ..."


7077
7077
4037
4037
4596
4596


### Pipeline6: (NG, NE, L, SW, P, S) Pipeline 5 + lemmatize & stem (S)

In [62]:
# bring stemmed and pipeline5 words together
df_all_pipes['Pipeline6'] = list(zip(df_all_pipes.Pipeline5, df_all_pipes.stemmed_ng))

In [63]:
# create a tuple of mapped stemmed word to the pipeline word
df_all_pipes['Pipeline6'] = df_all_pipes['Pipeline6'].apply(lambda x: list(zip(x[0],x[1])))

# replace the tuple with the stemmed word if the pipeline word exists
df_all_pipes['Pipeline6'] = df_all_pipes['Pipeline6'].apply(lambda x: mapstem(x))

In [64]:
check3 = df_all_pipes.loc[[59,80,82,84],['Pipeline1','Pipeline5', 'Pipeline6']]
display(check3)
for r in range(len(check3)):
    for c in range(len(check3.columns)):
        print(len(check3.iloc[r,c]))

Unnamed: 0,Pipeline1,Pipeline5,Pipeline6
59,"[ , Yes, ?, , Mr._Colson, ,, please, ., , Th...","[, , , , mr_colson, , , , , thank, , , mr_pres...","[, , , , mr_colson, , , , , thank, , , mr_pres..."
80,"[ , to, be, repudiated, by, the, committee, ,,...","[, , , repudiated, , , committee, , , meets, t...","[, , , repudi, , , committe, , , meet, tuesday..."
82,"[ , Just, wanted, to, see, what, John, plans, ...","[, , wanted, , , , john, plans, , tomorrow, , ...","[, , want, , , , john, plan, , tomorrow, , , ,..."
84,"[ , Okay, ,, be, right, there, ., , Okay, Bud...","[, okay, , , , , , , okay, buddy, , , , , , , ...","[, okay, , , , , , , okay, buddi, , , , , , , ..."


3729
3729
3729
4037
4037
4037
7536
7536
7536
4596
4596
4596


In [65]:
# Remove Stop-words again after stemming, which introduced some noise
df_all_pipes['Pipeline6'] = df_all_pipes['Pipeline6'].apply(lambda txt: rmv_sw(txt))

In [66]:
# Test
# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline6.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline6.apply(lambda x: "to" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline6.apply(lambda x: "be" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline6.apply(lambda x: "went" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline6.apply(lambda x: "goes" in [w for w in x])]))

# Expect all 0's

0
0
0
0
0


In [67]:
# Test
print(len(df_all_pipes.loc[85,'Pipeline6']),'vs', len(df_all_pipes.loc[85,'stemmed_ng']),'vs', len(df_all_pipes.loc[85,'Pipeline5']))
# Expect all same

4452 vs 4452 vs 4452


### Pipeline7: (NG, NE, L, SW, P, S, N) Pipeline 6 + remove numbers (N)  


In [68]:
df_all_pipes['Pipeline7'] = df_all_pipes['Pipeline6'].apply(
    lambda txt: [re.sub(r'[0-9]',"",t) for t in txt])

In [69]:
print(len(df_all_pipes.loc[50,'Pipeline7']),'vs', len(df_all_pipes.loc[50,'Pipeline6']))
df_all_pipes.loc[[70, 50], ['Pipeline7','Pipeline6']]

890 vs 890


Unnamed: 0,Pipeline7,Pipeline6
70,"[, , , , , , , , , , waterg, , mr_presid, , , ...","[, , , , , , , , , , waterg, , mr_presid, , , ..."
50,"[, , , , , , , realli, , , , , basic, , , , , ...","[, , , , , , , realli, , , , , basic, , , , , ..."


In [70]:
# Test
# does the Pipeline have a word with numbers?
print(len(df_all_pipes[df_all_pipes.Pipeline7.apply(lambda x: "350" in [w for w in x])]))
# Expect 0

0


### Pipeline8: (NG, NE, L, SW, P, S, N, I20) Pipeline 7 + remove lowest 20% TF-IDF terms weighted across the corpus (I20)  

In [71]:
# Remove low-weighted tfidf terms
df_all_pipes['Pipeline8'] = df_all_pipes['Pipeline7'].apply(lambda x: rmv_low(x))

In [72]:
# Test
# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline8.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline8.apply(lambda x: "to" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline8.apply(lambda x: "be" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline8.apply(lambda x: "went" in [w for w in x])]))

# does the Pipeline have a low frequency/stopword?
print(len(df_all_pipes[df_all_pipes.Pipeline8.apply(lambda x: "goes" in [w for w in x])]))

# Expect all 0's

0
0
0
0
0


In [73]:
# Review
print(len(df_all_pipes.loc[50,'Pipeline8']),'vs', len(df_all_pipes.loc[50,'Pipeline7']))
# Expect equal lengths

df_all_pipes.loc[50:100, ['Pipeline8','Pipeline7','Pipeline5']]

890 vs 890


Unnamed: 0,Pipeline8,Pipeline7,Pipeline5
50,"[, , , , , , , realli, , , , , , , , , , , , r...","[, , , , , , , realli, , , , , basic, , , , , ...","[, , , , , , , really, gets, , , , basically, ..."
51,"[, , , , , , tri, , figur, , , , , , strategi,...","[, , , , , , tri, , figur, , , ehrlichman, , ,...","[, , , , , , trying, , figure, , , ehrlichman,..."
52,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , bob, , , , , , , , , , , , , , , , , , ...","[, , , bob, , , , , , , , , , , , , , , , , , ..."
53,"[, john_dean, , , , , , , mr_presid, , , , , ,...","[, john_dean, , , , , , , mr_presid, , , , , ,...","[, john_dean, , , , , , , mr_president, , , , ..."
54,"[, , , , , , , , , , morn, , , , , , , , summa...","[, john, , sit, , , sit, , , good, morn, , , ,...","[, john, , sit, , , sit, , , good, morning, , ..."
55,"[, , , , , , , , , , , , , , , , , , , , awful...","[, , , , , , , , , , , , , , , , , , , , awful...","[, , , , , , , , , , , , , , , , , , , , awful..."
56,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , john, , , , , friday, , friday, , , ,...","[, , , , john, , , , , friday, , friday, , , ,..."
57,"[, , , , , , inform, , , , , exampl, , , , , ,...","[, , , base, , , inform, , , , , exampl, , , h...","[, , , based, , , information, , , , , example..."
58,"[, , , morn, , , , , , , round, , round, , , ,...","[, , , morn, , , , , , , round, , round, , , c...","[, , , morning, , , , , , , round, , round, , ..."
59,"[, , , , mr_colson, , , , , , , , mr_presid, ,...","[, , , , mr_colson, , , , , thank, , , mr_pres...","[, , , , mr_colson, , , , , thank, , , mr_pres..."


### Pipeline9: (NE, L, SW, P, S, N, I40) Include Named Entities (NE; no other N-Grams), lowercase (L), remove stop-words (SW), remove punctuation (P), lemmatize and stem (S), remove numbers (N), remove lowest 40% TF-IDF terms weighted across the corpus (I40)  

In [74]:
# Lowercase from Named Entities tokenized column
df_all_pipes['Pipeline9'] = df_all_pipes['token_ne'].apply(lambda txt: [str(t).strip().lower() for t in txt])

In [75]:
%%time
# Remove 40% low-weighted tfidf terms
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(lambda x: rmv_low(x, infreq_terms60))

CPU times: user 35.5 s, sys: 179 ms, total: 35.7 s
Wall time: 35.9 s


In [76]:
# Then apply Lem/Stem from _named_entity_ column
# bring stemmed and pipeline words together
df_all_pipes['Pipeline9'] = list(zip(df_all_pipes.Pipeline9, df_all_pipes.stemmed_ne))
# create a tuple of mapped stemmed word to the pipeline word
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(lambda x: list(zip(x[0],x[1])))
# replace the tuple with the stemmed word if the pipeline word exists
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(lambda x: mapstem(x))

In [77]:
# Remove Stopwords
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(lambda txt: rmv_sw(txt))

In [78]:
# Remove Punctuation
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(
    lambda txt: [re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t) for t in txt])

In [79]:
# Remove Numbers
df_all_pipes['Pipeline9'] = df_all_pipes['Pipeline9'].apply(
    lambda txt: [re.sub(r'[0-9]',"",t) for t in txt])

In [80]:
# Test

# does the Pipeline have a low weighted term?
print(len(df_all_pipes[df_all_pipes.Pipeline9.apply(lambda x: "definite" in [w for w in x])]))

# does the Pipeline have a particular stop word?
print(len(df_all_pipes[df_all_pipes.Pipeline9.apply(lambda x: "to" in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline9.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a word with numbers?
print(len(df_all_pipes[df_all_pipes.Pipeline9.apply(lambda x: "1961" in [w for w in x])]))

# Expect all 0's

print(len(df_all_pipes.loc[50,'Pipeline9']),'vs', len(df_all_pipes.loc[50,'stemmed_ne']))

# expect numbers to be equal

0
0
0
0
903 vs 903


### Pipeline10: (NE, L, POS, P, S, I40) Include Named Entities (NE; no other N-Grams), lowercase (L), remove noisy parts-of-speech (POS), remove punctuation (P), lemmatize and stem (S), (keep numbers,) remove lowest 40% TF-IDF terms weighted across the corpus (I40)  

In [81]:
# Lowercase from Named Entities tokenized column
df_all_pipes['Pipeline10'] = df_all_pipes['token_ne'].apply(lambda txt: [str(t).strip().lower() for t in txt])

In [82]:
# Remove Punctuation
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(
    lambda txt: [re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",t) for t in txt])

In [83]:
%%time
# Remove low-weighted tfidf terms
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(lambda x: rmv_low(x, infreq_terms60))

CPU times: user 34.6 s, sys: 122 ms, total: 34.7 s
Wall time: 34.9 s


In [84]:
# Then apply Lem/Stem from _named_entity_ column
# bring stemmed and pipeline words together
df_all_pipes['Pipeline10'] = list(zip(df_all_pipes.Pipeline10, df_all_pipes.stemmed_ne))
# create a tuple of mapped stemmed word to the pipeline word
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(lambda x: list(zip(x[0],x[1])))
# replace the tuple with the stemmed word if the pipeline word exists
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(lambda x: mapstem(x))

In [85]:
# Then remove less informative POS from pos_tag_ne column
# bring POS and pipeline words together
df_all_pipes['Pipeline10'] = list(zip(df_all_pipes.Pipeline10, df_all_pipes.pos_tag_ne))
# create a tuple of mapped stemmed word to the pipeline word
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(lambda x: list(zip(x[0],x[1])))
# replace the tuple with the stemmed word if the pipeline word exists
df_all_pipes['Pipeline10'] = df_all_pipes['Pipeline10'].apply(lambda x: mappos(x))

In [86]:
# Test

# does the Pipeline have a low weighted term?
print(len(df_all_pipes[df_all_pipes.Pipeline10.apply(lambda x: "definite" in [w for w in x])]))

# does the Pipeline have a particular stop word?
print(len(df_all_pipes[df_all_pipes.Pipeline10.apply(lambda x: "um" in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline10.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a word with numbers?
print(len(df_all_pipes[df_all_pipes.Pipeline10.apply(lambda x: "1961" in [w for w in x])]))

# Expect 0's except the last, which should be 2

print(len(df_all_pipes.loc[50,'Pipeline10']),'vs', len(df_all_pipes.loc[50,'stemmed_ne']))

# expect numbers to be equal

0
0
0
2
903 vs 903


In [87]:
# Review

print(len(df_all_pipes.loc[50,'Pipeline10']),'vs', len(df_all_pipes.loc[50,'Pipeline9']))
# Expect equal lengths

df_all_pipes.loc[80:100, ['Pipeline10','Pipeline9','Pipeline6']]

903 vs 903


Unnamed: 0,Pipeline10,Pipeline9,Pipeline6
80,"[, , , repudi, , , , , , , , , , , , , , , , ,...","[, , , repudi, , , , , , , , , , , , , , , , ,...","[, , , repudi, , , committe, , , meet, tuesday..."
81,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , bebe, , , , overnight, , , , tell..."
82,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , john, plan, , tomorrow, , , , , ,..."
83,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, poor_guy, , , , , , , , , morn, , , , , , ,..."
84,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, okay, , , , , , , okay, buddi, , , , , , , ..."
85,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, john, , , , dean, , , , , , , , , , , dean,..."
86,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , john, , , , , , good, morn, , , good, mor..."
87,"[, , , , , , , , , be, , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , case, , , , difficult, case, ..."
88,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , scenario, work, , pretti, , , , , , , , j..."
89,"[, , , , , , , , , , , , , , , , , , , , , sta...","[, , , , , , , , , , , , , , , , , , , , , sta...","[, , , , , , thought, , , , , , , , stage, , d..."


### Pipeline11: (NE, L, SW, P, SL, N, I20p) Include Named Entities (NE; no other N-Grams), lowercase (L), remove stop-words (SW), remove punctuation (P), lemmatize and stem then re-map the most common term for readability (SL), remove numbers (N), run TFIDF vectorizer on processed text and remove the lowest weighted 20% of processed terms  (I20p) 

In [88]:
# Lowercase from Named Entities tokenized column
df_all_pipes['Pipeline11'] = df_all_pipes['token_ne'].apply(lambda txt: [str(t).strip().lower() for t in txt])

In [89]:
# Then apply Lem/Stem from _named_entity_ column
# bring stemmed and pipeline words together
df_all_pipes['Pipeline11'] = list(zip(df_all_pipes.Pipeline11, df_all_pipes.stemmed_ne))
# create a tuple of mapped stemmed word to the pipeline word
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(lambda x: list(zip(x[0],x[1])))
# replace the tuple with the stemmed word if the pipeline word exists
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(lambda x: mapstem(x))

In [90]:
# Replace the stemmed term with the most frequent lemmatized term for ease of readability
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(lambda x: remaplem(x))

In [91]:
# Remove Numbers
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(
    lambda txt: [re.sub(r'[0-9]',"",t) for t in txt])

In [92]:
# Remove Punctuation: Remove after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(
    lambda txt: [re.sub(r"(\'s)|(\'t)|[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]","",t) for t in txt])

In [93]:
# Remove Stopwords: Remove after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(lambda txt: rmv_sw(txt))

Run TFIDF on the terms after lem/stem  
This should place a more appropriate weight on each of the terms when compared with only other relevant terms

In [94]:
# Create a string of each pipeline exhibit to evaluate with tfidf vectorizer
exhibits = [e for e in id_corpus]
pipetext = []
for e in exhibits:
    pipetext.append(stringitize(df_all_pipes.Pipeline11[df_all_pipes["exhibit"] == e]))

pipetext = pd.Series(pipetext, index = exhibits, name = 'pipe')
len(pipetext)

141

In [95]:
# Run tfidf vectorizer on just the Pipeline terms
# This enables better weighting on stemmed terms and more appropriately scaled weights on the remaining terms
infreq_terms11 = tfidf_words(pipetext)[1] 


Remove top weighted terms from the Stopwords list: 1425
['aad', 'aah', 'abbie_hoffman', 'aberration', 'abort', 'aboug', 'abouthunt', 'aboutsaid', 'aboutthe', 'abroad', 'absoutely', 'absuidity', 'ac', 'academic', 'acapulco', 'accede', 'accelerate', 'accent', 'accidentally', 'accomodate', 'accompaine', 'ache', 'aclu', 'acne', 'acre', 'activist', 'adduce', 'adhere', 'admissbible', 'admonition', 'adn', 'adopt', 'adrenalin', 'adulation', 'adult', 'adventure', 'adversary', 'advised', 'aegis', 'affront', 'afore', 'africa', 'aft', 'afterthe', 'aggrevate', 'agonize', 'agreeable', 'agressively', 'agri', 'agronsky', 'ahe', 'ahsure', 'ahwas', 'ai', 'aiid', 'ail', 'airfield', 'al_hubbard', 'ala', 'alabama', 'alan_wallace', 'alarm', 'alexander', 'alexandria', 'alibi', 'allentown', 'allentown_boys_club', 'alligator', 'allude', 'allve', 'alsop', 'alw', 'amass', 'ambassador', 'ambassadorial', 'ambiguous', 'ambrose', 'amen', 'amicable', 'ammendment', 'amo', 'amost', 'ampi', 'analyst', 'anand', 'andrews

In [96]:
# Review
print("cancerous" in infreq_terms11)
# Risk: removes some interesting words

print("prostitutes" in infreq_terms11)
# but keeps some interesting words

True
False


In [97]:
%%time
# Remove low-weighted tfidf terms
df_all_pipes['Pipeline11'] = df_all_pipes['Pipeline11'].apply(lambda x: rmv_low(x, infreq_terms11))

CPU times: user 10.5 s, sys: 20.6 ms, total: 10.5 s
Wall time: 10.5 s


In [98]:
# Test

# does the Pipeline have a low weighted term?
print(len(df_all_pipes[df_all_pipes.Pipeline11.apply(lambda x: "dig" in [w for w in x])]))

# does the Pipeline have a particular stop word?
print(len(df_all_pipes[df_all_pipes.Pipeline11.apply(lambda x: "um" in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline11.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline11.apply(lambda x: "1961" in [w for w in x])]))

# Expect all 0's

print(len(df_all_pipes.loc[50,'Pipeline11']),'vs', len(df_all_pipes.loc[50,'stemmed_ne']))

# expect numbers to be equal

0
0
0
0
903 vs 903


### Pipeline12: (NG, NE, L, POS, MS, P, SL, N, 10I20p) Include Named Entities and N-Grams (NG, NE), lowercase (L), remove noisy parts-of-speech (POS), remove misspelled words occurring only once in the dataset (MS), remove punctuation (P), lemmatize and stem then re-map the most common term for readability (SL), run TFIDF vectorizer on processed text and remove the most frequent 10% of terms and lowest weighted 20% of processed terms  (10I20p)

In [99]:
# Create Pipe 12 from lem-stem-re-lemmatized NGrams column
# Lowercase 
# Remove Numbers and Punctuation: Remove after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline12'] = df_all_pipes["lemstemrelem_ng"].apply(
    lambda txt: [t.lower() for t in txt]).apply(
    lambda txt: [re.sub(r"(\'s)|(\'t)|[^A-Za-z_\s]|[\-\[\]\.\,\'\"\*]","",t) for t in txt]) # No numbers or punctuation

In [100]:
%%time
# Create a corpus with identified N-Grams
vcorpus= df_all_pipes['NG_text'].copy()

# Create a list of noisy words across the full corpus to eliminate
nixpostok = []
alltoks = defaultdict(int)
for i,c in enumerate(vcorpus):
    doc = nlp(c) # evaluate each exhibit with SpaCy
    #add tokens to the tokens count and to the noisy tokens list in all lowercase.
    for tok in doc:
        t = re.sub(r'[^A-Za-z_\s]|[\-\[\]\.\,\'\"\*]',"",tok.text).strip().lower()
        if len(t)>0: alltoks[t] += 1 # count every token
        if len(t)<2: nixpostok.append(t) # any word less than 2 characters is noise
        if tok.tag_ in nixpos: nixpostok.append(t) # any word with the noisy pos tag is noise

# Create a list of likely misspelled words across the corpus
lowfreq = [k for k,v in alltoks.items() if v<2 and "_" not in k] # do not remove N-Grams
badspell = list(spell.unknown(lowfreq))

# Combine noisy tokens and misspelled words into a list to eliminate from the corpus
nixtok = set(nixpostok + badspell) # Ensure unique

CPU times: user 1min 18s, sys: 10.4 s, total: 1min 29s
Wall time: 1min 29s


In [101]:
# Remove misspelled and noisy tokens (nixtok) after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline12'] = df_all_pipes['Pipeline12'].apply(lambda txt: rmv_low(txt, nixtok))

In [102]:
# Run the TF-IDF vectorizer on the remaining terms (to apply weights more evenly on these terms)
# Remove 20% lowest weighted TFIDF terms and any other terms that are in 90% of the documents

#    Create a string of each pipeline exhibit to evaluate with tfidf vectorizer
string12 = df_all_pipes['Pipeline12'].apply(lambda row: stringitize(row))

#    Run TFIDF Vectorizer to eliminate the most frequent terms and assign a weight to all terms
vectorizer = TfidfVectorizer(max_df = 0.90) #ignore terms that appear in more than 90% of the documents
vectorizedcorpus = vectorizer.fit_transform(string12)

#    Create a tuple of the feature name and the weight
wts_tfidf = (dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))

#    Calculate lowest 20% of tfidf values
lower20 = np.percentile(list(wts_tfidf.values()), 20)
print(lower20)
upper20 = np.percentile(list(wts_tfidf.values()), 80)
print(upper20)

#    Remove lower20 terms from the dictionary
wts_tfidf = {k:v for k,v in wts_tfidf.items() if v>lower20}

3.8763855159214247
5.2626798770413155


In [103]:
# Remove lowest weighted overly-frequent terms
df_all_pipes['Pipeline12'] = df_all_pipes['Pipeline12'].apply(lambda txt: rmv_tflow(txt, wts_tfidf))

In [104]:
# Test

# does the Pipeline have a low weighted term?
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "net" in [w for w in x])]))

# does the Pipeline have a particular stop word?
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "um" in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a word with numbers?
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "1961" in [w for w in x])]))

# Expect all 0's

0
0
0
0


In [105]:
# Test
# Ensure very relevant terms remain in the corpus
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "kidnapping" in [w for w in x])]))
print(len(df_all_pipes[df_all_pipes.Pipeline12.apply(lambda x: "prostitutes" in [w for w in x])]))
# Expect 4, 2

4
2


In [106]:
# Review
print(len(df_all_pipes.loc[50,'Pipeline8']),'vs', len(df_all_pipes.loc[50,'Pipeline12']))
# expect numbers are equal

df_all_pipes.loc[50:100, ['Pipeline8','Pipeline11','Pipeline12']]

890 vs 890


Unnamed: 0,Pipeline8,Pipeline11,Pipeline12
50,"[, , , , , , , realli, , , , , , , , , , , , r...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
51,"[, , , , , , tri, , figur, , , , , , strategi,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
52,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
53,"[, john_dean, , , , , , , mr_presid, , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
54,"[, , , , , , , , , , morn, , , , , , , , summa...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
55,"[, , , , , , , , , , , , , , , , , , , , awful...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
56,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
57,"[, , , , , , inform, , , , , exampl, , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
58,"[, , , morn, , , , , , , round, , round, , , ,...","[, , , , , , , , , , round, , round, , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
59,"[, , , , mr_colson, , , , , , , , mr_presid, ,...","[, , , , mr_colson, , , , , , , , , , , , , , ...","[, , , , mr_colson, , , , , , , , , , , , , , ..."


### Pipeline13: (NE, L, POS, MS, P, SL, N, 10I20p) Include Named Entities (NE; no other N-Grams), lowercase (L), remove noisy parts-of-speech (POS), remove misspelled words occurring only once in the dataset (MS), remove punctuation (P), lemmatize and stem then re-map the most common term for readability (SL), run TFIDF vectorizer on processed text and remove the most frequent 10% of terms and lowest weighted 20% of processed terms  (10I20p)

In [107]:
# Create Pipe 13 from lem-stem-re-lemmatized Named Entities column
# Lowercase 
# Remove Numbers and Punctuation: Remove after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline13'] = df_all_pipes["lemstemrelem_ne"].apply(
    lambda txt: [t.lower() for t in txt]).apply(
    lambda txt: [re.sub(r"(\'s)|(\'t)|[^A-Za-z_\s]|[\-\[\]\.\,\'\"\*]","",t) for t in txt])

In [108]:
%%time
# Create a corpus with identified N-Entities
vcorpus= df_all_pipes['NE_text'].copy()

# Create a list of noisy words across the full corpus to eliminate
nixpostok = []
alltoks = defaultdict(int)
for i,c in enumerate(vcorpus):
    doc = nlp(c) # evaluate each exhibit with SpaCy
    #add tokens to the tokens count and to the noisy tokens list in all lowercase.
    for tok in doc:
        t = re.sub(r'[^A-Za-z_\s]|[\-\[\]\.\,\'\"\*]',"",tok.text).strip().lower()
        if len(t)>0: alltoks[t] += 1 # count every token
        if len(t)<2: nixpostok.append(t) # any word less than 2 characters is noise
        if tok.tag_ in nixpos: nixpostok.append(t) # any word with the noisy pos tag is noise

# Create a list of likely misspelled words across the corpus
lowfreq = [k for k,v in alltoks.items() if v<2 and "_" not in k] # do not remove Named Entities
badspell = list(spell.unknown(lowfreq))

# Combine noisy tokens and misspelled words into a list to eliminate from the corpus
nixtok = set(nixpostok + badspell) # Ensure unique

CPU times: user 1min 14s, sys: 9.56 s, total: 1min 23s
Wall time: 1min 24s


In [109]:
# Remove misspelled and noisy tokens (nixtok) after lem/stem to capture all versions of the terms
df_all_pipes['Pipeline13'] = df_all_pipes['Pipeline13'].apply(lambda txt: rmv_low(txt, nixtok))

In [110]:
# Remove remaining 20% lowest weighted TFIDF terms and any other terms that are in 90% of the documents

#    Re-stringitize the corpus for evaluation by TFIDF vectorizer
string12 = df_all_pipes['Pipeline13'].apply(lambda row: stringitize(row))

#    Run TFIDF Vectorizer to eliminate the most frequent terms and assign a weight to all terms
vectorizer = TfidfVectorizer(max_df = 0.90) #ignore terms that appear in more than 90% of the documents
vectorizedcorpus = vectorizer.fit_transform(string12)

#    Create a tuple of the feature name and the weight
wts_tfidf = (dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))

#    Calculate lowest 20% of tfidf values
lower20 = np.percentile(list(wts_tfidf.values()), 20)
print(lower20)
upper20 = np.percentile(list(wts_tfidf.values()), 80)
print(upper20)

#    Remove lower20 terms from the dictionary
wts_tfidf = {k:v for k,v in wts_tfidf.items() if v>lower20}

3.7586024802650413
5.2626798770413155


In [111]:
# Remove lowest weighted and overly-frequent terms
df_all_pipes['Pipeline13'] = df_all_pipes['Pipeline13'].apply(lambda txt: rmv_tflow(txt, wts_tfidf))

In [112]:
# Test

# does the Pipeline have a low weighted term?
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "net" in [w for w in x])]))

# does the Pipeline have a particular stop word?
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "um" in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "p.m." in [w for w in x])]))

# does the Pipeline have a word with punctuation?
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "1961" in [w for w in x])]))

# Expect all 0's

0
0
0
0


In [113]:
# Test
# Ensure very relevant terms remain in the corpus
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "kidnapping" in [w for w in x])]))
print(len(df_all_pipes[df_all_pipes.Pipeline13.apply(lambda x: "prostitutes" in [w for w in x])]))
# Expect 4, 2

4
2


In [114]:
# Review
print(len(df_all_pipes.loc[50,'Pipeline10']),'vs', len(df_all_pipes.loc[50,'Pipeline13']))
# expect numbers are equal

df_all_pipes.loc[50:100, ['Pipeline10','Pipeline11','Pipeline13']]

903 vs 903


Unnamed: 0,Pipeline10,Pipeline11,Pipeline13
50,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
51,"[, , , , , m, , , , , , , , , , , , , , , , , ...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
52,"[, , , , , , be, , , , , , , , be, , , , s, , ...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
53,"[, john_dean, , , , , , , mr_presid, , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
54,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
55,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
56,"[, , , , , , not, , , , , , , , , , , , , , , ...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
57,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."
58,"[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , round, , round, , , , , ,...","[, , , , , , , , , , round, , round, , , , , ,..."
59,"[, , , , mr_colson, , , , , , , , mr_presid, ,...","[, , , , mr_colson, , , , , , , , , , , , , , ...","[, , , , mr_colson, , , , , , , , , , , , , , ..."


## Store Pipelines

### Pickle the indexed DataFrames

In [115]:
# pickle the indexed dataframe to evaluate as needed
output = open('pipelines_w_index_coref.pkl', 'wb')
sys.setrecursionlimit(100000)

pickle.dump(df_all_pipes, output)

output.close()

In [116]:
# Hold a copied version of the indexed pipelines
df_all_pipe_windex = df_all_pipes.copy()

### Remove blanks from the pipelines
strip all nulls out of the list of tokens

In [117]:
def delnulls(line):
    """ This function eliminates any nulls from a list"""
    if isinstance(line, list):
        clnline = [x.strip() for x in line if x.strip()]
        if len(clnline) == 0:
            clnline = None
    else: clnline = line
    return clnline

Pipeline 0 and Pipelin 1 have no blanks.

In [118]:
df_all_pipes.Pipeline2 = df_all_pipes.Pipeline2.apply(lambda line: delnulls(line))

In [119]:
print(len(df_all_pipes.Pipeline2[50]))
print(len(df_all_pipe_windex.Pipeline2[50]))

353
890


In [120]:
df_all_pipes.Pipeline3 = df_all_pipes.Pipeline3.apply(lambda line: delnulls(line))

In [121]:
print(len(df_all_pipes.Pipeline3[50]))
print(len(df_all_pipe_windex.Pipeline3[50]))

677
890


In [122]:
df_all_pipes.Pipeline4 = df_all_pipes.Pipeline4.apply(lambda line: delnulls(line))

In [123]:
print(len(df_all_pipes.Pipeline4[50]))
print(len(df_all_pipe_windex.Pipeline4[50]))

677
890


In [124]:
df_all_pipes.Pipeline5 = df_all_pipes.Pipeline5.apply(lambda line: delnulls(line))

In [125]:
print(len(df_all_pipes.Pipeline5[50]))
print(len(df_all_pipe_windex.Pipeline5[50]))

191
890


In [126]:
df_all_pipes.Pipeline6 = df_all_pipes.Pipeline6.apply(lambda line: delnulls(line))

In [127]:
print(len(df_all_pipes.Pipeline6[50]))
print(len(df_all_pipe_windex.Pipeline6[50]))

166
890


In [128]:
df_all_pipes.Pipeline7 = df_all_pipes.Pipeline7.apply(lambda line: delnulls(line))

In [129]:
print(len(df_all_pipes.Pipeline7[50]))
print(len(df_all_pipe_windex.Pipeline7[50]))

166
890


In [130]:
df_all_pipes.Pipeline8 = df_all_pipes.Pipeline8.apply(lambda line: delnulls(line))

In [131]:
print(len(df_all_pipes.Pipeline8[50]))
print(len(df_all_pipe_windex.Pipeline8[50]))

67
890


In [132]:
df_all_pipes.Pipeline9 = df_all_pipes.Pipeline9.apply(lambda line: delnulls(line))

In [133]:
print(len(df_all_pipes.Pipeline9[50]))
print(len(df_all_pipe_windex.Pipeline9[50]))

7
903


In [134]:
df_all_pipes.Pipeline10 = df_all_pipes.Pipeline10.apply(lambda line: delnulls(line))

In [135]:
print(len(df_all_pipes.Pipeline10[50]))
print(len(df_all_pipe_windex.Pipeline10[50]))

39
903


In [136]:
df_all_pipes.Pipeline11 = df_all_pipes.Pipeline11.apply(lambda line: delnulls(line))

In [137]:
print(len(df_all_pipes.Pipeline11[50]))
print(len(df_all_pipe_windex.Pipeline11[50]))

19
903


In [138]:
df_all_pipes.Pipeline12 = df_all_pipes.Pipeline12.apply(lambda line: delnulls(line))

In [139]:
print(len(df_all_pipes.Pipeline12[50]))
print(len(df_all_pipe_windex.Pipeline12[50]))

21
890


In [140]:
df_all_pipes.Pipeline13 = df_all_pipes.Pipeline13.apply(lambda line: delnulls(line))

In [141]:
print(len(df_all_pipes.Pipeline13[50]))
print(len(df_all_pipe_windex.Pipeline13[50]))

20
903


In [142]:
print(df_all_pipes.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   corpus           141 non-null    object
 1   exhibit          141 non-null    object
 2   orig_exhibit     141 non-null    object
 3   speech_final     141 non-null    object
 4   named_ent        141 non-null    object
 5   all_ngrams       141 non-null    object
 6   freq_ngrams      141 non-null    object
 7   allgrams         141 non-null    object
 8   NG_text          141 non-null    object
 9   NE_text          141 non-null    object
 10  token_ng         141 non-null    object
 11  token_ne         141 non-null    object
 12  pos_tag_ng       141 non-null    object
 13  lemmatized_ng    141 non-null    object
 14  stemmed_ng       141 non-null    object
 15  pos_tag_ne       141 non-null    object
 16  lemmatized_ne    141 non-null    object
 17  stemmed_ne       141 non-null    ob

### Pickle the clean DataFrames

In [143]:
# pickle initial dataframe to avoid processing datagrooming
output = open('pipelines_coref.pkl', 'wb')
sys.setrecursionlimit(100000)

pickle.dump(df_all_pipes, output)

output.close()

### Pickle import block:

In [144]:
import pprint, pickle
import pandas as pd

# unpickle preserved dataframe for further analysis
pkl_file = open('pipelines_coref.pkl', 'rb')

df_all_pipe = pickle.load(pkl_file)
print(df_all_pipe.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   corpus           141 non-null    object
 1   exhibit          141 non-null    object
 2   orig_exhibit     141 non-null    object
 3   speech_final     141 non-null    object
 4   named_ent        141 non-null    object
 5   all_ngrams       141 non-null    object
 6   freq_ngrams      141 non-null    object
 7   allgrams         141 non-null    object
 8   NG_text          141 non-null    object
 9   NE_text          141 non-null    object
 10  token_ng         141 non-null    object
 11  token_ne         141 non-null    object
 12  pos_tag_ng       141 non-null    object
 13  lemmatized_ng    141 non-null    object
 14  stemmed_ng       141 non-null    object
 15  pos_tag_ne       141 non-null    object
 16  lemmatized_ne    141 non-null    object
 17  stemmed_ne       141 non-null    ob

In [149]:
df_all_pipe.Pipeline13[0]

['proposed',
 'unwilling',
 'instruction',
 'stubborn',
 'stubborn',
 'bullheaded',
 'forthright',
 'conscious',
 'harming',
 'sam',
 'sam',
 'unwise',
 'criticizing',
 'liberty',
 'tradition',
 'tradition',
 'unravel',
 'compounding',
 'grows',
 'geometrically',
 'compounds',
 'perjuring',
 'themself',
 'peripheral',
 'items',
 'instruction',
 'jack_caulfield',
 'jack',
 'jack',
 'bodyguard',
 'pres',
 'york_city',
 'policeman',
 'jack',
 'jack',
 'transferred',
 'jack',
 'infiltration',
 'informa',
 'tion',
 'jack',
 'jack',
 'consensus',
 'caulfield',
 'retrospect',
 'jack',
 'incredibly',
 'cautious',
 'rejecting',
 'jack',
 'jack',
 'jack',
 'jack',
 'jack',
 'jack',
 'tracked',
 'jack',
 'jack',
 'partially',
 'jack',
 'eve',
 'treasury',
 'jack',
 'jack',
 'jack',
 'codes',
 'kidnapping',
 'prostitutes',
 'weaken',
 'opposition',
 'mugging',
 'teams',
 'persons',
 'puffing',
 'laughing',
 'ing',
 'boards',
 'realistic',
 'tail',
 'lasted',
 'kidnapping',
 'united_stateswhere',
 