# Notebook 02: Obtain, Store, and Clean the Watergate Special Prosecution Force Transcripts

**Project: Data Triage of Transcribed Nixon Tapes** <br>
*Michelle Ballard and April Crompton* <br>
Loyola University Maryland Data Science Project 

## Import statements

In [1]:
import pickle
import pprint
import re
import sys
from collections import Counter, defaultdict
from datetime import date, datetime, time, timedelta # to capture date and time metadata

import pandas as pd
import requests
from bs4 import BeautifulSoup

import pdfplumber

## Obtain Watergate Special Prosecution Force Data


### Retrieve page content from the Nixon Library website

In [2]:
# Fetch the WSPF transcript index page. Any failure stops the notebook here,
# so later cells never run against a missing (or stale, from an earlier run) `page`.
response = requests.get(
    "https://www.nixonlibrary.gov/watergate-special-prosecution-force-transcripts",
    timeout=30,  # do not hang the kernel indefinitely on an unresponsive server
)
if response.status_code != requests.codes.ALL_OK:
    raise RuntimeError(
        "Nixon Library page request failed with HTTP status {}".format(response.status_code)
    )

# Process the page
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is installed;
# consider pinning one once the parsed output has been confirmed against it.
page = BeautifulSoup(response.content)

print("Files Last Updated:", (date.today()))

Files Last Updated: 2022-04-16


In [3]:
# Pull out the two kinds of elements the later cells work from
all_h3 = page.find_all(name="h3")  # <h3> headings; each one opens a dated section of exhibit metadata
all_pdfs = page.find_all(name="a", attrs={"href": True})  # every anchor carrying an href (transcript links among them)

### Process exhibit metadata 

#### Acquire metadata per exhibit from the web page

In [4]:
# Determine metadata categories
# ref: rtphokie https://stackoverflow.com/questions/11647348/find-next-siblings-until-a-certain-one-using-beautifulsoup

# collect the bold (<strong>) labels found between each h3 heading and the next one
category_titles = []
for heading in all_h3:  # each dated section is in h3; metadata falls within dated sections
    for sib in heading.next_siblings:
        if sib.name == 'h3':
            break
        try:
            strong_tags = sib.find_all("strong")
        except AttributeError:
            # bare text nodes between tags cannot be searched; skip them
            continue
        category_titles.extend(t.text.strip() for t in strong_tags if len(t.text) > 1)

cat_dict = Counter(category_titles)  # counter dict of all categories
if not cat_dict:
    raise ValueError("No <strong> metadata labels were found; the page layout may have changed")
idxln = max(cat_dict.values())  # number of records to expect (count of the most frequent label)
category_titles = [k for k, v in cat_dict.items() if v > 1]  # drop bold text that occurs only once
print("Categories:", category_titles)
print("Number of records:", idxln)

Categories: ['Cassette Number / Minutes:', 'Conversation Number:', 'Location:', 'Participants:']
Number of records: 88


In [5]:
# create a blank dataframe for metadata: one row per expected record,
# one column per category plus the section date and the exhibit id
meta_columns = [*category_titles, "txtdate", "exhibit_number"]
df_meta = pd.DataFrame(index=list(range(idxln)), columns=meta_columns)

In [6]:
# Populate the Dataframe with Metadata
#
# Walks every h3 section and each node that follows it (up to the next h3). For each node it
# looks for the category labels in the node text and writes the text found after a label into
# the matching df_meta column. `it` is the next free row; it advances each time the Location
# label is found, so the code relies on Location appearing once per record.

it = 0
expatt = re.compile(r".+\.pdf")  # greedy: everything on a line up to the last ".pdf"

for i in all_h3: # each dated section is in h3; metadata falls within dated sections
    k = i.text[8:].strip() # capture date; skips the first 8 characters of the heading (presumably a fixed prefix - TODO confirm)
    for sib in i.next_siblings:
        if sib.name == 'h3': # 'h3' denotes a new dated section
            break
        else:
            # identify which exhibit is in the current segment
            exhib = re.findall(expatt, str(sib))
            if len(exhib)>0:
                exhibit = exhib[-1].split('>')[-1][:-4] # use the name between > and .pdf
            # NOTE: when this node has no .pdf match, `exhibit` keeps the value from an earlier node
            row = it # update the DF row to put the new info into
            for n in range(len(category_titles)): # see which column heading in the DF to update
                try:
                    # the label is used as a regex pattern; .span()[1] is the offset just past the label
                    ts = re.search(category_titles[n],sib.text).span()[1] # find the start of the value to update
                    # If this is category index 2 (the Location column in the printed category order),
                    #  Populate the TextDate and exhibit number on the current row
                    #  Prepare for the next row in the dataframe
                    if n==2 and re.search(category_titles[n],sib.text).span()[1]>0:
                        df_meta.update(pd.DataFrame({"txtdate": [k],
                                                     "exhibit_number": exhibit}, index = [row]))
                        it +=1 # Move to the next dataframe row each time the Location label (index 2) is found
                    try:
                        # the value ends where the next category label begins
                        te = re.search(category_titles[n+1],sib.text).span()[0] # find the end of the value to update
                    except: te=None # no following label (or this is the last category): take the rest of the text
                except: continue # label not present in this node (or node has no usable text): try the next category
                # write the value for this category; ts+1 skips one character after the label, tabs are removed
                df_meta.update(pd.DataFrame({category_titles[n]: [sib.text[ts+1:te].strip().replace(u'\t',"")]}, index=[row]))
                

In [7]:
# rename columns for best practices: trimmed, no colons, snake_case, lower case
new_names = {}
for col in df_meta.columns:
    new_names[col] = col.strip().replace(':','').replace(' ','_').lower()
df_meta.rename(columns=new_names, inplace=True)

In [8]:
# Check for duplicates (prints True when any exhibit number occurs more than once)
print(df_meta['exhibit_number'].duplicated().any())

# Drop records without Transcripts
no_transcript = df_meta['cassette_number_/_minutes'].str.contains("Not")
df_meta.drop(index=df_meta[no_transcript].index, inplace=True)

# re-check for duplicates
print(df_meta['exhibit_number'].duplicated().any())

True
False


In [9]:
# Use the exhibit number as the row index
df_meta.set_index('exhibit_number', inplace=True)

In [10]:
# Review the df_meta DataFrame
# option_context raises the row limit for this display only and restores the previous
# setting afterwards, even if display() fails.
with pd.option_context("display.max_rows", 1000):
    display(df_meta)

Unnamed: 0_level_0,cassette_number_/_minutes,conversation_number,location,participants,txtdate
exhibit_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
472-004_472-005_472-006,E - 25 (3 minutes),472-004,White House Oval Office,President Nixon; Stephen B. Bull,"March 23, 1971"
051-001,E - 26 (49 minutes) [Concludes on E-27],051-001,Cabinet Room,President Nixon; Leaders of the Dairy Industry,"March 23, 1971"
472-021,E - 28 (31 minutes),472-021,White House Oval Office,President Nixon; John B. Connally; John D. Ehr...,"March 23, 1971"
482-017_482-018,E - 29 (31 minutes),482-017,White House Oval Office,"President Nixon; John D, Ehrlichman; George P....","April 19, 1971"
002-001_002-002,E - 30 (1 minute),002-001,White House Telephone,President Nixon; Richard G. Kleindienst,"April 19, 1971"
485-004,E - 31 (3 minutes),485-004,White House Oval Office,President Nixon; John N. Mitchell,"April 21, 1971"
491-014,E - 32 (28 minutes),491-014,White House Oval Office,"President Nixon; H. R. (""Bob"") Haldeman","May 5, 1971"
538-015,E - 33 (19 minutes),538-015,White House Oval Office,President Nixon; John N. Mitchell; John D. Ehr...,"July 6, 1971"
545-003,E - 34 (12 minutes),545-003,White House Oval Office,"President Nixon; John D. Ehrlichman; Egil (""Bu...","July 24, 1971"
587-003,E - 35 (9 minutes),587-003,White House Oval Office,President Nixon; John N. Mitchell; John D. Ehr...,"October 8, 1971"


#### Perform initial data grooming on exhibit metadata

In [11]:
# Parse the Cassette number and Minutes, update dataframe
def cnm(text):
    """Split a 'Cassette Number / Minutes' value into cassette labels and minutes.

    Every run of 1-3 digits in ``text`` is collected in order. The first number is
    taken as the cassette, the second as the running time in minutes, and any
    further numbers as additional cassettes (e.g. "[Concludes on E-27]").
    Assumes no value exceeds 999.

    Returns:
        (eresult, mresult): a list of "E-<n>" cassette labels and the minutes as a string.

    Raises:
        IndexError: if ``text`` contains fewer than two numbers.
    """
    numbers = re.findall(r'\d{1,3}', text)  # raw string: '\d' is not a valid str escape
    # slicing past the end yields nothing, so no length check is needed for the extras
    eresult = ["E-" + num for num in [numbers[0], *numbers[2:]]]
    mresult = numbers[1]
    return eresult, mresult

# Parse every row once, then split the (cassette labels, minutes) pairs into two columns
dfmmeta = df_meta['cassette_number_/_minutes'].apply(cnm)
df_meta['cassette_number'] = dfmmeta.apply(lambda pair: pair[0])
df_meta['minutes'] = dfmmeta.apply(lambda pair: pair[1])

In [12]:
# Make Participants a list
def part(text):
    """Turn a ';'-separated participants string into a list of names.

    Non-breaking spaces (\\xa0) are removed as before. Each name is then stripped of
    surrounding whitespace, and empty entries (e.g. from a trailing ';') are dropped.

    Returns:
        list of str, in the order the names appear in ``text``.
    """
    names = text.replace('\xa0', '').split(';')
    return [name.strip() for name in names if name.strip()]

# Replace each participants string with its list of names
df_meta['participants'] = df_meta['participants'].apply(part)

In [13]:
# Add Date field with datetime value of text date
def fixdate(txtdate):
    """Convert a text date such as "March 23, 1971" to a pandas datetime.

    Returns:
        The value produced by ``pd.to_datetime``, or None when the text cannot be
        parsed as a date.
    """
    try:
        return pd.to_datetime(txtdate)
    except (ValueError, TypeError, OverflowError):
        # only parsing failures are treated as "no date"; anything else should surface
        return None

# Parse every text date into the new 'date' column
df_meta['date'] = df_meta['txtdate'].apply(fixdate)

In [14]:
# Manage Exceptions:

#  Dictabelt recordings do not show date or conversation number.
#  Use exhibit ID for conversation, and pdf doc dates. select 1st of the month for 11/1972

# hand-entered values, keyed by the exhibit index of the two dictabelt records
dictabelt_fixes = pd.DataFrame(
    {
        'conversation_number': ['35d', '37d'],
        'date': [pd.to_datetime("11/01/1972"), pd.to_datetime("04/18/1973")],
    },
    index=['000-000_35d', '000-000_37d'],
)
df_meta.update(dictabelt_fixes, overwrite=True)

In [15]:
# review full DataFrame
df_meta.info()
# option_context raises the row limit for this display only and restores the previous
# setting afterwards, even if display() fails.
with pd.option_context("display.max_rows", 1000):
    display(df_meta)

<class 'pandas.core.frame.DataFrame'>
Index: 85 entries, 472-004_472-005_472-006 to 039-083
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   cassette_number_/_minutes  85 non-null     object        
 1   conversation_number        85 non-null     object        
 2   location                   85 non-null     object        
 3   participants               85 non-null     object        
 4   txtdate                    85 non-null     object        
 5   cassette_number            85 non-null     object        
 6   minutes                    85 non-null     object        
 7   date                       85 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 6.0+ KB


Unnamed: 0_level_0,cassette_number_/_minutes,conversation_number,location,participants,txtdate,cassette_number,minutes,date
exhibit_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
472-004_472-005_472-006,E - 25 (3 minutes),472-004,White House Oval Office,"[President Nixon, Stephen B. Bull]","March 23, 1971",[E-25],3,1971-03-23
051-001,E - 26 (49 minutes) [Concludes on E-27],051-001,Cabinet Room,"[President Nixon, Leaders of the Dairy Industry]","March 23, 1971","[E-26, E-27]",49,1971-03-23
472-021,E - 28 (31 minutes),472-021,White House Oval Office,"[President Nixon, John B. Connally, John D. Eh...","March 23, 1971",[E-28],31,1971-03-23
482-017_482-018,E - 29 (31 minutes),482-017,White House Oval Office,"[President Nixon, John D, Ehrlichman, George P...","April 19, 1971",[E-29],31,1971-04-19
002-001_002-002,E - 30 (1 minute),002-001,White House Telephone,"[President Nixon, Richard G. Kleindienst]","April 19, 1971",[E-30],1,1971-04-19
485-004,E - 31 (3 minutes),485-004,White House Oval Office,"[President Nixon, John N. Mitchell]","April 21, 1971",[E-31],3,1971-04-21
491-014,E - 32 (28 minutes),491-014,White House Oval Office,"[President Nixon, H. R. (""Bob"") Haldeman]","May 5, 1971",[E-32],28,1971-05-05
538-015,E - 33 (19 minutes),538-015,White House Oval Office,"[President Nixon, John N. Mitchell, John D. Eh...","July 6, 1971",[E-33],19,1971-07-06
545-003,E - 34 (12 minutes),545-003,White House Oval Office,"[President Nixon, John D. Ehrlichman, Egil (""B...","July 24, 1971",[E-34],12,1971-07-24
587-003,E - 35 (9 minutes),587-003,White House Oval Office,"[President Nixon, John N. Mitchell, John D. Eh...","October 8, 1971",[E-35],9,1971-10-08


### Process Transcript Data

#### Download transcripts per exhibit

In [16]:
# collect the pdf file links to the annotated transcripts:
# keep the href of every anchor whose link text mentions 'pdf'
pdf_urls = [link['href'] for link in all_pdfs if "pdf" in str(link.string)]

#### Extract Text from downloaded PDFs

In [17]:
### From provided url, writes pdf file into the local directory
def download_file(url):
    """Download ``url`` into the current working directory.

    The local filename is the last path segment of the URL.

    Returns:
        The local filename the content was written to.

    Raises:
        requests.HTTPError: if the server answers with an error status. Nothing is
            written in that case, so an error page is never saved as a .pdf.
    """
    # generate a filename to store the pdf in the local directory, based on the URL
    local_pdf_filename = url.split('/')[-1]

    with requests.get(url, timeout=60) as r:
        r.raise_for_status()  # check before opening the file so no empty/garbage file is left behind
        # the pdf must be written in binary mode ('wb')
        with open(local_pdf_filename, 'wb') as f:
            f.write(r.content)

    return local_pdf_filename

## Thank you: https://stackoverflow.com/questions/64911851/cant-open-a-pdf-file-using-pdfplumber-open
# requests library and pdfplumber library must be imported

In [18]:
### From provided local filename, extracts text using pdfplumber
def extractText(local_pdf_filename):
    """Extract the text of every page of a local PDF.

    Each page's text is appended as a one-element list. If a page fails to extract,
    a message is printed, the string "error on page <n>" (1-based) is inserted at
    the front of the result, and processing continues with the next page.

    Returns:
        (title, fulltext): the filename without its last four characters (".pdf"),
        and the list of per-page results described above.
    """
    fulltext = []

    with pdfplumber.open(local_pdf_filename) as pdf:
        for n, pdf_page in enumerate(pdf.pages):
            try:
                fulltext.append([pdf_page.extract_text()])
            except Exception:
                # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the loop
                print("Encountered Error!")
                fulltext.insert(0, "error on page" + " " + str(n+1))

    title = local_pdf_filename[:-4]

    return title, fulltext

In [19]:
# Process the files
# Download every transcript pdf, extract its text, and keep the result
# in a dictionary keyed by the file's title (filename without extension)

text_dict = {}

for pdf_url in pdf_urls:
    local_name = download_file(pdf_url)
    title, pages_text = extractText(local_name)
    print(local_name, len(pages_text))  # progress: file name and number of extracted entries
    text_dict[title] = pages_text

472-004_472-005_472-006.pdf 4
051-001.pdf 32
472-021.pdf 43
482-017_482-018.pdf 34
002-001_002-002.pdf 3
485-004.pdf 4
491-014.pdf 27
538-015.pdf 16
545-003.pdf 7
587-003.pdf 9
601-033.pdf 22
697-015.pdf 22
697-029.pdf 42
342-027.pdf 17
741-002.pdf 39
741-010.pdf 3
343-036.pdf 13
347-004.pdf 3
779-002.pdf 44
000-000_35d.pdf 18
393-013_393-014.pdf 18
394-021_395-001.pdf 15
854-017.pdf 11
855-010.pdf 21
856-004.pdf 16
858-003.pdf 28
862-004.pdf 18
862-006.pdf 33
864-004.pdf 22
865-014.pdf 49
866-003.pdf 21
872-001.pdf 2
878-014.pdf 77
882-012.pdf 27
884-007.pdf 4
885-007.pdf 27
037-175_037-176.pdf 18
886-008.pdf 127
421-018.pdf 39
037-204_037-205.pdf 12
422-020.pdf 59
422-033.pdf 105
423-003.pdf 85
890-019.pdf 13
044-158.pdf 18
428-019.pdf 99
896-004.pdf 13
896-005.pdf 57
428-028.pdf 52
038-034.pdf 12
038-037.pdf 30
896-006.pdf 22
038-042_038-043.pdf 22
897-003.pdf 16
897-004.pdf 56
897-009.pdf 12
897-011.pdf 23
427-005_427-006.pdf 24
427-010.pdf 27
898-006.pdf 8
898-012.pdf 97
898-023_8

In [20]:
# Number of transcripts stored in text_dict (one entry per distinct file title)
len(text_dict)

85

In [21]:
# Place the transcripts in a dataframe: one row per exhibit title,
# a single column holding that exhibit's extracted text
df_text = pd.DataFrame([text_dict], index=['full_extracted_text']).T
df_text

Unnamed: 0,full_extracted_text
472-004_472-005_472-006,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...
051-001,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...
472-021,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON, JO..."
482-017_482-018,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...
002-001_002-002,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...
...,...
905-008,[[TRANSCRIPT OF A MEETING BETWEEN THE\nPRESIDE...
431-009,[[TRANSCRIPT OF A RECORDING OF A MEETING AMONG...
442-001-069,[[*** DRAFT ***\nTRANSCRIPT PREPARED BY THE ...
039-080_039-081,[[TRANSCRIPT OF A RECORDING OF A TELEPHONE\nCO...


### Create the df_all dataframe containing all relevant metadata and transcript data

In [22]:
# Join the text and meta DataFrames

# check shapes
print("before: ",df_meta.shape, '\n\t', df_text.shape)

# inner join on the shared exhibit index; rows missing from either frame are dropped
df_all = df_text.join(df_meta, how="inner")

# verify shape
print("after:  ",df_all.shape)

df_all.head()

before:  (85, 8) 
	 (85, 1)
after:   (85, 9)


Unnamed: 0,full_extracted_text,cassette_number_/_minutes,conversation_number,location,participants,txtdate,cassette_number,minutes,date
472-004_472-005_472-006,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,E - 25 (3 minutes),472-004,White House Oval Office,"[President Nixon, Stephen B. Bull]","March 23, 1971",[E-25],3,1971-03-23
051-001,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,E - 26 (49 minutes) [Concludes on E-27],051-001,Cabinet Room,"[President Nixon, Leaders of the Dairy Industry]","March 23, 1971","[E-26, E-27]",49,1971-03-23
472-021,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON, JO...",E - 28 (31 minutes),472-021,White House Oval Office,"[President Nixon, John B. Connally, John D. Eh...","March 23, 1971",[E-28],31,1971-03-23
482-017_482-018,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,E - 29 (31 minutes),482-017,White House Oval Office,"[President Nixon, John D, Ehrlichman, George P...","April 19, 1971",[E-29],31,1971-04-19
002-001_002-002,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,E - 30 (1 minute),002-001,White House Telephone,"[President Nixon, Richard G. Kleindienst]","April 19, 1971",[E-30],1,1971-04-19


In [23]:
# Manage Exception
# Exhibit 485-004 has no header and the speaker names are not listed and do not contain ':'
# Exhibit 872-001, 697-015, 545-003 speaker names are not listed and do not contain ':'
# remove these anomalies

anomalous_exhibits = ['485-004', '872-001', '697-015', '545-003']
df_all.drop(index=anomalous_exhibits, inplace=True)

In [24]:
# Summarise df_all after the anomalous exhibits were dropped (row count, columns, dtypes)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 472-004_472-005_472-006 to 039-083
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   full_extracted_text        81 non-null     object        
 1   cassette_number_/_minutes  81 non-null     object        
 2   conversation_number        81 non-null     object        
 3   location                   81 non-null     object        
 4   participants               81 non-null     object        
 5   txtdate                    81 non-null     object        
 6   cassette_number            81 non-null     object        
 7   minutes                    81 non-null     object        
 8   date                       81 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(8)
memory usage: 6.3+ KB


## Clean Data

### Parse text to isolate speech and metadata within the documents

In [25]:
def flatten(line):
    """Return a flat list of the items in ``line``, expanding nested lists and tuples.

    Lists and tuples are expanded recursively, in order; every other item
    (strings included) is kept as-is.
    """
    out = []
    for item in line:
        out += flatten(item) if isinstance(item, (list, tuple)) else [item]
    return out

# copied from - ref: http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

In [26]:
# Flatten lists
flat_list = df_all.iloc[:,0].apply(flatten)

# Then join each exhibit's flattened items into one space-separated string
flat_text = flat_list.apply(lambda pieces: " ".join(str(piece) for piece in pieces))

df_all["flattened"] = flat_text
#ref: https://stackoverflow.com/questions/12453580/how-to-concatenate-items-in-a-list-to-a-single-string

In [27]:
# functions for initial grooming

# returns all text contained in [] or () for visual examination prior to removing
def examine_drops(text):
    """List the text found inside each bracketed or parenthesised span of ``text``.

    A span opens with '[' or '(' and closes at the first following ']' or ')'
    on the same line ('.' does not match a newline here).

    Returns:
        list of str: the inner text of every span, in order of appearance.
    """
    # brackets are used instead of parentheses most of the time
    regex = re.compile(r".*?[\[\(](.*?)[\]\)]")  # raw string: '\[' etc. are not valid str escapes
    return regex.findall(text)

# removes new line characters and all () / [] including text
def transcript_initialgroom(text):
    """Normalise whitespace and strip asides from transcript text.

    Steps, in order:
      1. every whitespace character (newlines and tabs included) becomes a single space;
      2. every backslash is removed;
      3. every span opening with '[' or '(' and closing at the first following
         ']' or ')' is removed, brackets included.

    Returns:
        The groomed string.
    """
    text = re.sub(r"\s", " ", text)
    text = text.replace("\\", "")  # one pass removes single and doubled backslashes alike
    text = re.sub(r"[\[\(](.*?)[\]\)]", "", text)  # raw string: '\[' etc. are not valid str escapes
    return text

# removes speaker labels such as "PRESIDENT:" or "Colson:" from the text
def drop_speakers(text):
    """Delete every speaker label from ``text``.

    A label is a capital letter followed by at least one more letter (any case)
    and then a colon, which covers both upper-case and mixed-case names.
    """
    speaker_label = re.compile(r'([A-Z][A-Za-z]+\b:)')
    return speaker_label.sub("", text)

def drop_repeating(text):
    """Remove repeated date lines from transcript text.

    Wherever an upper-case month name appears as a whole word, everything from that
    word to the end of its line is deleted. The line break itself is kept.
    Matching is case-sensitive, so "May" in ordinary speech is left alone.
    """
    # The former trailing "(?!/n)" lookahead (a typo for a newline check) could never
    # fail after the greedy ".*", so it is omitted; the matches are unchanged.
    text = re.sub(r'\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\b.*', "", text)
    return text

# extracts speech header, and splits into two panda series objects
def extract_speechheader(exhibit_speech):
    """Split a transcript into its header and the speech that follows.

    The split point is the start of the first speaker label, looked for with
    three patterns in order of decreasing strictness:
      1. an upper-case NAME: followed by whitespace and text containing a
         lower-case letter within its first three characters;
      2. any upper-case NAME:;
      3. any capitalised Name: (upper or mixed case).
    The first pattern that matches anywhere in the text decides the split.
    If none matches, the header is empty and the whole text is the speech.

    Returns:
        pd.Series of two strings: [header, speech].
    """
    speaker_patterns = (
        r'(\b[A-Z]{2,}\b:\s+(..[a-z]|.[a-z]))',  # the NAME: is uppercase and followed by a word with lowercase letters
        r'(\b[A-Z]{2,}\b:)',                     # the NAME: is uppercase
        r'([A-Z][A-Za-z]+\b:)',                  # the first colon following any capitalised set of letters
    )
    headerendspan = 0  # default: no header found
    for patt in speaker_patterns:
        found = re.search(patt, exhibit_speech)
        if found is not None:
            headerendspan = found.start()
            break
    header = exhibit_speech[:headerendspan]
    speech = exhibit_speech[headerendspan:]
    return pd.Series([header, speech])


In [28]:
# extracts and returns start and end time from the speech header
def timehdr(text):
    """Find the FROM-TO times in the header text and return them with their difference.

    Assumes the start time is the first time in the text and the end time is the last.
    Only the end time's AM/PM marker is read; the start time's marker is inferred
    from the two hour values.

    Returns:
        (sdtime, edtime, diff): start and end as pandas Timedeltas measured from
        midnight, and end minus start. If anything goes wrong while locating or
        parsing the times, start and end are both 00:00:00 (so diff is zero).
    """
    # a time is: digits, ':', two word characters, then whitespace or one of A/a/P/p
    patt = re.compile(r'([0-9]+:\w\w[\sAaPp])') # ref Joey https://stackoverflow.com/questions/11347868/regex-to-get-last-word-from-sentence-of-words-separated-by
    iter = patt.finditer(text)
    wrds = [i for m in iter for i in m.span()] # flat list of every match's start and end offset
    try:
        # Find the start and end times in the text
        s,e = min(wrds), max(wrds)+3 # from the first match to 3 characters past the last one (room for the trailing AM/PM marker)
        # drop '.' and '*', turn '-' into a space, upper-case, and split into tokens
        times = text[s:e].replace(".","").replace("*","").replace("-"," ").strip().upper().split()
        s_time = re.sub("[A-Z]","1",times[0]) # assumes start time is first in the list; any letters will be replaced with 1
        e_merid = times[-1][0]+"M" # assumes AM or PM are the final element in the list
        e_time = re.sub("[A-Z]","1",times[-2]) # assumes end time is second to last in the list; any letters will be replaced with 1

        # determine AM or PM
        st = int(s_time.split(":")[0]) # find the hour of the start time
        et = int(e_time.split(":")[0]) # find the hour of the end time
        if st==12: s_merid=e_merid
        elif st>et or et==12:
            # the hour wrapped past 12 (or the end is in the 12 o'clock hour): start is on the other side of noon/midnight
            if e_merid=="PM": s_merid="AM"
            elif st>et and e_merid=="AM": s_merid="PM"
            # NOTE: when neither inner branch applies, s_merid is never assigned; the
            # resulting error on the next line is swallowed by the except below.
        else: s_merid=e_merid
        times = [s_time+s_merid,e_time+e_merid]
        
        # calculate date-time: parse as 12-hour clock times, then express as offsets from midnight
        sdtime = pd.to_timedelta(str(datetime.strptime(times[0], '%I:%M%p').time())) # start time
        edtime = pd.to_timedelta(str(datetime.strptime(times[1], '%I:%M%p').time())) # end time
    except: sdtime = edtime = pd.to_timedelta('00:00:00') # any failure (no times found, unparseable tokens, ...) falls back to midnight
    diff = edtime-sdtime # time difference between start/end
    return sdtime, edtime, diff

In [29]:
# Process Transcript parsing and grooming functions
# collect the bracketed / parenthesised asides for visual examination
df_all['dropped'] = df_all['flattened'].apply(examine_drops)

# split each flattened transcript into its header and the speech that follows it
df_all[['speech_header', 'speech']] = df_all['flattened'].apply(extract_speechheader)

# remove the repeating date lines from the speech
df_all['speech_flat_norepeat'] = df_all['speech'].apply(drop_repeating)

# apply initial grooming to the de-repeated speech (whitespace, backslashes, bracketed asides)
df_all['speech_groomed'] = df_all['speech_flat_norepeat'].apply(transcript_initialgroom)

# apply initial grooming to the speech header
df_all['speech_header'] = df_all['speech_header'].apply(transcript_initialgroom)

# apply drop speakers to groomed speech
df_all['speech_final'] = df_all['speech_groomed'].apply(drop_speakers)

In [30]:
# run the function to get time from speech header data
dfsh = df_all['speech_header'].copy()
timer = dfsh.apply(timehdr)  # one (start, end, diff) tuple per exhibit

# spread the tuples over three named columns, keeping the exhibit index
dfsh = pd.DataFrame(
    timer.tolist(),
    columns=['start_time', 'end_time', 'time_diff'],
    index=timer.index,
)

# add start, end, diff columns to df_all, then combine each offset with the exhibit date
df_all = pd.concat([df_all, dfsh], axis=1)
df_all['start_dtime'] = df_all['date'] + df_all['start_time']
df_all['end_dtime'] = df_all['date'] + df_all['end_time']

# ref: jezrael https://stackoverflow.com/questions/57847521/summing-two-datetime-columns
# ref: cs95 https://stackoverflow.com/questions/53402584/how-to-convert-a-series-of-tuples-into-a-pandas-dataframe 

### Finalize the dataframe columns

In [31]:
# Select df_all columns to carry forward, in their final order
final_columns = [
    'conversation_number', 'cassette_number',
    'location', 'participants', 'minutes', 'txtdate',
    'date', 'start_dtime', 'end_dtime', 'time_diff', 'full_extracted_text',
    'dropped', 'speech_header', 'speech_flat_norepeat', 'speech_final',
]
df_all = df_all[final_columns]

In [32]:
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in display(), which rendered a stray "None".
df_all.info()
# option_context raises the row limit for this display only and restores the previous
# setting afterwards, even if display() fails.
with pd.option_context("display.max_rows", 1000):
    display(df_all)

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 472-004_472-005_472-006 to 039-083
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   conversation_number   81 non-null     object         
 1   cassette_number       81 non-null     object         
 2   location              81 non-null     object         
 3   participants          81 non-null     object         
 4   minutes               81 non-null     object         
 5   txtdate               81 non-null     object         
 6   date                  81 non-null     datetime64[ns] 
 7   start_dtime           81 non-null     datetime64[ns] 
 8   end_dtime             81 non-null     datetime64[ns] 
 9   time_diff             81 non-null     timedelta64[ns]
 10  full_extracted_text   81 non-null     object         
 11  dropped               81 non-null     object         
 12  speech_header         81 non-null     object

None

Unnamed: 0,conversation_number,cassette_number,location,participants,minutes,txtdate,date,start_dtime,end_dtime,time_diff,full_extracted_text,dropped,speech_header,speech_flat_norepeat,speech_final
472-004_472-005_472-006,472-004,[E-25],White House Oval Office,"[President Nixon, Stephen B. Bull]",3,"March 23, 1971",1971-03-23,1971-03-23 10:16:00,1971-03-23 10:19:00,0 days 00:03:00,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,"[Picks up telephone., Hangs up., Telephone buz...",TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUIRY...,PRESIDENT: [Picks up telephone.] Secretary Con...,"Secretary Connally please. Mr. President,..."
051-001,051-001,"[E-26, E-27]",Cabinet Room,"[President Nixon, Leaders of the Dairy Industry]",49,"March 23, 1971",1971-03-23,1971-03-23 10:35:00,1971-03-23 11:25:00,0 days 00:50:00,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,"[Laughter, unintelligible, Laughter, Unintelli...",TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUIRY...,PRESIDENT: Let me get around to say hello to e...,Let me get around to say hello to everybody. ...
472-021,472-021,[E-28],White House Oval Office,"[President Nixon, John B. Connally, John D. Eh...",31,"March 23, 1971",1971-03-23,1971-03-23 17:05:00,1971-03-23 17:38:00,0 days 00:33:00,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON, JO...","[coughing, unintelligible, unintelligible, Sev...","MEETING AMONG PRESIDENT RICHARD M. NIXON, JOHN...","PRESIDENT: Hi, Phil, how are you?\nCAMPBELL: M...","Hi, Phil, how are you? Mr. President. Sorry..."
482-017_482-018,482-017,[E-29],White House Oval Office,"[President Nixon, John D, Ehrlichman, George P...",31,"April 19, 1971",1971-04-19,1971-04-19 15:03:00,1971-04-19 15:34:00,0 days 00:31:00,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,"[Picks up telephone., To telephone operator, u...",TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUIRY...,PRESIDENT: Kleindienst is in this? [Picks up ...,Kleindienst is in this? Yes. Dick Kleind...
002-001_002-002,002-001,[E-30],White House Telephone,"[President Nixon, Richard G. Kleindienst]",1,"April 19, 1971",1971-04-19,1971-04-19 15:04:00,1971-04-19 15:09:00,0 days 00:05:00,[[TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUI...,"[Laughs, Unintelligible, Laughs, unintelligibl...",TRANSCRIPT PREPARED BY THE IMPEACHMENT INQUIRY...,PRESIDENT: Dick Kleindienst.\nSECRETARY: Thank...,"Dick Kleindienst. Thank you, Mr. President. ..."
491-014,491-014,[E-32],White House Oval Office,"[President Nixon, H. R. (""Bob"") Haldeman]",28,"May 5, 1971",1971-05-05,1971-05-05 09:55:00,1971-05-05 09:55:00,0 days 00:00:00,[[TRANSCRIPT OF A RECORDING OF A MEETING BETWE...,"[Unintelligible, tape noise, Laughs, Tape Nois...",TRANSCRIPT OF A RECORDING OF A MEETING BETWEEN...,PRESIDENT: Picked up them all right.\nHALDEMAN...,Picked up them all right. Yesterday. How's ...
538-015,538-015,[E-33],White House Oval Office,"[President Nixon, John N. Mitchell, John D. Eh...",19,"July 6, 1971",1971-07-06,1971-07-06 00:00:00,1971-07-06 00:00:00,0 days 00:00:00,[[TRANSCRIPT OF A RECORDING OF A PORTION OF A\...,"[unintelligible, unintelligible, Unintelligibl...",TRANSCRIPT OF A RECORDING OF A PORTION OF A ME...,"NIXON: Uh, I wanted to, uh, check with you bef...","Uh, I wanted to, uh, check with you before yo..."
587-003,587-003,[E-35],White House Oval Office,"[President Nixon, John N. Mitchell, John D. Eh...",9,"October 8, 1971",1971-10-08,1971-10-08 10:04:00,1971-10-08 10:46:00,0 days 00:42:00,[[TRANSCRIPT OF A RECORDING OF A PORTION OF A\...,"[unintelligible, unintelligible, unintelligibl...",TRANSCRIPT OF A RECORDING OF A PORTION OF A ME...,"MITCHELL: Mr. President, two other quick thing...","Mr. President, two other quick things if you ..."
601-033,601-033,[E-36],White House Oval Office,"[President Nixon, John D. Ehrlichman, Stephen ...",26,"October 25, 1971",1971-10-25,1971-10-25 12:35:00,1971-10-25 14:05:00,0 days 01:30:00,[[TRANSCRIPT OF A RECORDING OF A\nMEETING IN T...,"[unintelligible, unintelligible, Unintelligibl...",TRANSCRIPT OF A RECORDING OF A MEETING IN THE ...,"EHRLICHMAN: A lot of, uh\nPRESIDENT: Yeah comp...","A lot of, uh Yeah complicated matters heavy..."
697-029,697-029,[E-38],White House Oval Office,"[President Nixon, H. R. (""Bob"") Haldeman, Char...",46,"March 30, 1972",1972-03-30,1972-03-30 13:30:00,1972-03-30 14:30:00,0 days 01:00:00,[[Transcript of a Recording of a Meeting betwe...,"[Unintelligible, Unintelligible, unintelligibl...",Transcript of a Recording of a Meeting between...,"Colson: ...I don't know, maybe the, maybe the ...","...I don't know, maybe the, maybe the ITT thi..."


## Store Data

### Pickle the cleaned dataframes 

In [33]:
# pickle initial dataframe to avoid processing datagrooming
# NOTE(review): the raised recursion limit is kept from the original cell; it applies to the
# whole kernel session - confirm whether pickling this dataframe actually needs it.
sys.setrecursionlimit(100000)

# the with-block closes the file even if pickle.dump raises
with open('cleaned_WSPF.pkl', 'wb') as output:
    pickle.dump(df_all, output)

### Pickle Import block
Copy this cell to any notebook to retrieve and unpickle the cleaned dataframe

In [34]:
import pprint, pickle
import pandas as pd

# unpickle preserved dataframes to continue EDA and Statistical Analysis
# NOTE: unpickling can execute code stored in the file - only load a pickle this project produced.
# the with-block closes the file once the dataframe is loaded
with open('cleaned_WSPF.pkl', 'rb') as pkl_file:
    df_all_wspf = pickle.load(pkl_file)

print("\n~~~df_all_wspf~~~\n")
df_all_wspf.info()  # info() prints its own summary; wrapping it in print() added a stray "None"

# Expected Results:

# Index: 81 entries, 472-004_472-005_472-006 to 039-083
# Data columns (total 15 columns):
#  #   Column                Non-Null Count  Dtype          
# ---  ------                --------------  -----          
#  0   conversation_number   81 non-null     object         
#  1   cassette_number       81 non-null     object         
#  2   location              81 non-null     object         
#  3   participants          81 non-null     object         
#  4   minutes               81 non-null     object         
#  5   txtdate               81 non-null     object         
#  6   date                  81 non-null     datetime64[ns] 
#  7   start_dtime           81 non-null     datetime64[ns] 
#  8   end_dtime             81 non-null     datetime64[ns] 
#  9   time_diff             81 non-null     timedelta64[ns]
#  10  full_extracted_text   81 non-null     object         
#  11  dropped               81 non-null     object         
#  12  speech_header         81 non-null     object         
#  13  speech_flat_norepeat  81 non-null     object         
#  14  speech_final          81 non-null     object         
# dtypes: datetime64[ns](3), object(11), timedelta64[ns](1)


~~~df_all_wspf~~~

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 472-004_472-005_472-006 to 039-083
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   conversation_number   81 non-null     object         
 1   cassette_number       81 non-null     object         
 2   location              81 non-null     object         
 3   participants          81 non-null     object         
 4   minutes               81 non-null     object         
 5   txtdate               81 non-null     object         
 6   date                  81 non-null     datetime64[ns] 
 7   start_dtime           81 non-null     datetime64[ns] 
 8   end_dtime             81 non-null     datetime64[ns] 
 9   time_diff             81 non-null     timedelta64[ns]
 10  full_extracted_text   81 non-null     object         
 11  dropped               81 non-null     object         
 12  speech_header         81