# Notebook 14: Discourse Analysis - Text Summarization

**Project: Data Triage of Transcribed Nixon Tapes** <br>
*Michelle Ballard and April Crompton* <br>
Loyola University Maryland Data Science Project 

## Import Statements

In [35]:
import pprint, pickle
import pandas as pd
from statistics import mean, stdev

import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict, Counter
snow_stemmer = SnowballStemmer(language='english')

from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker
spell = SpellChecker()
import re
import spacy
nlp = spacy.load('en_core_web_sm')
spell = SpellChecker()

In [36]:
# From 03: Cleaned_Combined
# unpickle preserved dataframes
pkl_file = open('cleaned_combined.pkl', 'rb')

df_all_combined = pickle.load(pkl_file)
df_speakers_combined = pickle.load(pkl_file)

df_chunks_combined = pickle.load(pkl_file)
print("\n~~~df_chunks_combined~~~\n")
print(df_chunks_combined.info())


~~~df_chunks_combined~~~

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   corpus        141 non-null    object
 1   orig_exhibit  141 non-null    object
 2   exhibit       141 non-null    object
 3   speech_final  141 non-null    object
 4   abstract      56 non-null     object
dtypes: object(5)
memory usage: 5.6+ KB
None


## Gather and clean the corpus

In [37]:
corpus = df_chunks_combined["speech_final"]
idx = df_chunks_combined['exhibit']
abstracts = df_chunks_combined['abstract']

In [38]:
vcorpus = corpus.copy()
# Run TFIDF Vectorizer to eliminate the most frequent terms and assign a weight to all terms
vectorizer = TfidfVectorizer(max_df = 0.90) #ignore terms that appear in more than 90% of the documents
vectorizedcorpus = vectorizer.fit_transform(vcorpus)
# Create a tuple of the feature name and the weight
wts_tfidf = (dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))

In [39]:
# Check
print(wts_tfidf["prostitutes"])
print(wts_tfidf["kidnapping"])
print(wts_tfidf["rights"])
print(wts_tfidf["perjury"])
print(wts_tfidf["cancer"])

4.857214768933151
4.34638914516716
2.5885312276147867
2.242254990896953
2.864784604242945


In [40]:
%%time
# Create a list of noisy words to eliminate
nixpos = ['PRP$','WDT','WP$','IN','EX','WRB','CC','DT','UH','WP','POS','TO','PRP','BES','HVS','MD']
nixpostok = []
alltoks = defaultdict(int)
for i,c in enumerate(corpus):
    doc = nlp(c)
    for tok in doc:
        t = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",tok.text).strip().lower()
        if len(t)>0: alltoks[t] += 1
        if len(t)<2: nixpostok.append(t)
        if tok.tag_ in nixpos: nixpostok.append(t)

CPU times: user 1min 14s, sys: 9.8 s, total: 1min 24s
Wall time: 1min 24s


In [41]:
%%time
# Create a list of misspelled words across the corpus
lowfreq = [k for k,v in alltoks.items() if v<2]
badspell = list(spell.unknown(lowfreq))

# Combine noisy tokens and misspelled words into a list to eliminate from the corpus
nixtok = set(nixpostok + badspell) # Ensure unique

CPU times: user 34.7 ms, sys: 1.44 ms, total: 36.2 ms
Wall time: 35.9 ms


In [42]:
# Check words to nix
print(len(badspell), len(nixtok))

nixtok

916 1380


{'',
 'bar',
 'telligible',
 'wentover',
 'byrce',
 'behind',
 'caseand',
 'zielger',
 'cooparate',
 'buzhar',
 'thc',
 'along',
 'while',
 'campaian',
 '136',
 'interrog',
 'undereverybody',
 'whitcover',
 'donti',
 'mm',
 'proprietorship',
 'yknow',
 'circleem',
 'investigatior',
 'd',
 'herewith',
 'said',
 'strikebusters',
 'agressively',
 'sayshunt',
 'within',
 'satified',
 'deana',
 'carrin',
 'you',
 'nothing',
 'taq',
 'doan',
 'chuckll',
 'bisarre',
 'c',
 'might',
 'strachrin',
 'spose',
 '57',
 'ervin',
 'strongview',
 'every',
 'upon',
 '143',
 'beforeuh',
 'sonof',
 'can',
 '62',
 'itleaked',
 'obstru',
 'malfactions',
 'idoht',
 'prelim',
 'cerned',
 'his',
 '103',
 'enrlichman',
 'critized',
 'itmaybe',
 'lfor',
 'sure',
 'consolidations',
 'governmemt',
 'ehrlich14',
 'supportsthe',
 'wed',
 'god',
 '67',
 'reqard',
 'nt',
 'kruschev',
 'per',
 'was1',
 'mishatched',
 'ingression',
 'discu',
 'invar',
 'liddyll',
 'mcphee',
 'beepin',
 'expansible',
 'specificthing',
 

In [43]:
%%time
# Eliminate nixtok from terms identified by dictionary of tfidf weights
besterms = wts_tfidf.copy()
nixed = []
for n in nixtok:
    try: 
        print(n,":",besterms[n])
        nix = besterms.pop(n)
        nixed.append((n,nix))
    except: continue

print(len(nixed))
print(len(wts_tfidf))
print(len(besterms))

bar : 3.7586024802650413
telligible : 5.2626798770413155
byrce : 5.2626798770413155
behind : 3.01138807843482
zielger : 5.2626798770413155
cooparate : 5.2626798770413155
buzhar : 5.2626798770413155
thc : 5.2626798770413155
along : 2.0438040521731144
while : 1.812692331209728
campaian : 5.2626798770413155
136 : 5.2626798770413155
interrog : 5.2626798770413155
whitcover : 5.2626798770413155
mm : 4.34638914516716
proprietorship : 5.2626798770413155
investigatior : 5.2626798770413155
herewith : 5.2626798770413155
strikebusters : 5.2626798770413155
agressively : 5.2626798770413155
within : 2.400478996111847
satified : 5.2626798770413155
deana : 5.2626798770413155
carrin : 5.2626798770413155
nothing : 1.2553466918088445
taq : 5.2626798770413155
doan : 5.2626798770413155
bisarre : 5.2626798770413155
might : 1.264479175372117
strachrin : 5.2626798770413155
spose : 5.2626798770413155
57 : 4.1640675883732055
ervin : 1.6931471805599454
every : 1.4899189389466772
upon : 2.6977305195797787
143 : 5.

In [79]:
def _create_dictionary_table(text_string, bstwrds=besterms) -> dict:
    #reducing words to their root form
    text_string = text_string.lower()
    stem = snow_stemmer
    words = nlp(text_string)

    #creating dictionary for the word frequency table
    frequency_table = defaultdict(float)
    
    for wd in words:
        # Ensure the stemmed, lemmatized, and original version of the pipeline word is accounted for
        wl = wd.lemma_
        ws = stem.stem(wl) #### Changed stemmer to snowball and based on spaCy lemmatized word
        wd = re.sub(r'[^A-Za-z0-9_\s]|[\-\[\]\.\,\'\"\*]',"",wd.text).strip().lower()
        wd, wl, ws = str(wd), str(wl), str(ws)
        if wd in bstwrds:
            blist = []
            try: blist.append(bstwrds[wd])
            except: None
            try: blist.append(bstwrds[wl])
            except: None
            try: blist.append(bstwrds[ws])
            except: None
            best_score = max(blist)
            try:
                if frequency_table[wd] < best_score: frequency_table[wd] = best_score
            except: frequency_table[wd] = best_score
    return frequency_table


def _calculate_sentence_scores(sentences, frequency_table) -> dict:   
    #algorithm for scoring a sentence by its words
    sentence_weight = defaultdict(float)

    for sentence in sentences:
        sent_lst = word_tokenize(sentence.lower())
        sentence_wordcount = len(sent_lst)
        sentence_wordcount_without_pipewrds = 0
        for word_weight in frequency_table:
            if word_weight in sent_lst:
                sentence_wordcount_without_pipewrds += 1
                sentence_weight[sentence[:15]] += frequency_table[word_weight]
        try: 
            if sentence_wordcount_without_pipewrds == 0: sentence_weight[sentence[:15]] = 0
            elif  sentence_wordcount_without_pipewrds == 1: sentence_weight[sentence[:15]] = sentence_weight[sentence[:15]] / 2 # if a sentence only has 1 weighted word, cut that in half
            else:
                sentence_weight[sentence[:15]] = sentence_weight[sentence[:15]] / sentence_wordcount_without_pipewrds
        except: sentence_weight[sentence[:15]] = 0
    
    return sentence_weight

def _calculate_average_score(sentence_weight):
    sum_values = []
    for entry in sentence_weight:
        sum_values.append(float(sentence_weight[entry]))     
    try:
        average_score = mean(sum_values)
        std = stdev(sum_values)
    except:
        average_score = 0
        std = 0

    return average_score, std

def _get_article_summary(sentences, sentence_weight, threshold):
    # number of sentences in the document:
    sentence_counter = len(sentence_weight)
    print("number of sentences in document:", sentence_counter)
    # summary should be approx 10% of the sentences in the document
    if int(sentence_counter/10)>50:
        num_sum = 50
    else: num_sum = int(sentence_counter/10)
    
    # identify the IQR; max threshold is outliers above 1.5 IQR which are probably funny words
    sentence_wts = np.array(list(sentence_weight.values()))
    upper_quartile = np.percentile(sentence_wts, 75)
    lower_quartile = np.percentile(sentence_wts, 25)
    thresholdmax = upper_quartile + ((upper_quartile - lower_quartile) * 1.5)
    num_sum = num_sum + len(sentence_wts[sentence_wts>thresholdmax]) # allows length to include outliers, which will be eliminated by thresholdmax, but holds the place of the index
    
    # identify the sentences to include in the summary
    sentence_wrds = np.array(list(sentence_weight.keys()))

    # indices of the highest of sentence weight values:
    top = sorted(np.argsort(sentence_wts)[-num_sum:])
    top_wrds = list(sentence_wrds[top])
    article_summary = ''
    sentence_counter = 0
    for sentence in sentences:
        if sentence[:15] in top_wrds and sentence_weight[sentence[:15]] <= thresholdmax:
#             print('\n',str(sentence_weight[sentence[:15]]),sentence) # For review
            article_summary += " " + sentence
            sentence_counter += 1
    print("number of sentences in summary:",sentence_counter)

    return article_summary


def _run_article_summary(article, bstwrds=besterms):
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, threshold)

    return article_summary


In [80]:
## Small test For Review

for i in range(0,1):
    print("\n~~Summarized",idx[i],"~~")
    print(_run_article_summary(corpus[i]))
    print("\n~~vs. Abstract",idx[i],"~~")
    print(abstracts[i])
# for i in range(24,26):
#     print("\n~~Summarized",idx[i],"~~")
#     print(_run_article_summary(corpus[i]))
#     print("\n~~vs. Abstract",idx[i],"~~")
#     print(abstracts[i])


~~Summarized Part_1_exhibit_12 ~~
number of sentences in document: 416
number of sentences in summary: 44
 He has been totally unwilling all along to take any guidance, any instruction. He's just quite stubborn and -- he's quite stubborn; also he isn't very smart. I don't think that anyone is criticizing... Well, let's say -- ...  your position on it. The American Civil Liberty Union is against it. It's growing daily. And that is just--and there is no assurance-- That it won't bust. I said, "Jack, come up with a plan that, you know, is a normal infiltration, I mean, you know, buying informa- tion from secretaries and all that sort of thing." I said, "Well, I don't really know as I'm the man, but if you want me there I'll be happy to." All in codes, and involved black bag operations, kidnapping, providing prostitutes, uh, to weaken the opposition, bugging, uh, mugging teams. And--  Uh, Mitchell, Mitchell just virtually sat there puffing and laughing. And so then he was told to go back 

## Create summaries

In [81]:
for i in range(len(corpus)):
    print("\n~~Summarized",idx[i],"~~")
    print(_run_article_summary(corpus[i]))
    print("\n~~vs. Abstract",idx[i],"~~")
    print(abstracts[i])


~~Summarized Part_1_exhibit_12 ~~
number of sentences in document: 416
number of sentences in summary: 44
 He has been totally unwilling all along to take any guidance, any instruction. He's just quite stubborn and -- he's quite stubborn; also he isn't very smart. I don't think that anyone is criticizing... Well, let's say -- ...  your position on it. The American Civil Liberty Union is against it. It's growing daily. And that is just--and there is no assurance-- That it won't bust. I said, "Jack, come up with a plan that, you know, is a normal infiltration, I mean, you know, buying informa- tion from secretaries and all that sort of thing." I said, "Well, I don't really know as I'm the man, but if you want me there I'll be happy to." All in codes, and involved black bag operations, kidnapping, providing prostitutes, uh, to weaken the opposition, bugging, uh, mugging teams. And--  Uh, Mitchell, Mitchell just virtually sat there puffing and laughing. And so then he was told to go back 