# Notebook 01: Obtain, Store, and Clean the Watergate Trial Transcripts

**Project: Data Triage of Transcribed Nixon Tapes** <br>
*Michelle Ballard and April Crompton* <br>
Loyola University Maryland Data Science Project 

## Import statements

In [1]:
import pickle
import pprint
import re
import sys
from collections import Counter, defaultdict
from datetime import date, datetime, time, timedelta # to capture date and time metadata

import pandas as pd
import requests
from bs4 import BeautifulSoup

import pdfplumber

## Obtain Watergate Trial Data

### Retrieve page content from the Nixon Library website

In [2]:
response=requests.get("https://www.nixonlibrary.gov/watergate-trial-tapes")
if response.status_code == requests.codes.ALL_OK:
    # Process the page
    page = BeautifulSoup(response.content)
else: print(response.status_code)

print("Files Last Updated:", (date.today()))

Files Last Updated: 2022-04-16


In [3]:
# Store data relevant to each exhibit 
all_h2 = page.find_all("h2")  # exhibit metadata
all_pdfs = page.find_all("a", href=True)  # exhibit transcripts

### Process exhibit metadata 

#### Acquire metadata per exhibit from the web page

In [4]:
# Determine metadata categories

# collect the metadata categories within each section, create a dataframe with those categories occurring more than once
category_titles = []
for i in all_h2:
    txt = str(i.string)
    if "DATE" in txt: # Each transcript is contained in a section initially identified by date
        uls = i.findNext('ul') # Metadta exists within the ULs under each dated H2
        category_titles.extend([t.text for t in uls.findAll('strong')]) # store text for each instance of a <strong> item in a single flat list
category_titles = [k for k, v in Counter(category_titles).items() if v>1]  # this removed single occurences of strong text

category_titles

['Cassette Number / Minutes:',
 'Conversation Number:',
 'Location:',
 'Exhibit Number:',
 'Abstract:',
 'Participants:']

In [5]:
# Populate the Dataframe with Metadata

df_meta = pd.DataFrame(columns=(*category_titles,"TxtDate:"), index=([i for i in range(35)])) #Create a blank DF with column titles
it = 0
stript = u'\t'

for i in all_h2:
    txt = str(i.string)
    if "DATE" in txt: # Each transcript is contained in a section initially identified by date
        k = i.text[8:] # capture the date in text format, less the preceding "DATE:" characters
        uls = i.findNext('ul') # each UL within each H2 contains the metatdata
        lis = uls.findAll('li') # each li within each UL contains the metadata
        for l in lis: # review each li
            row = it # update the DF row to put the new info into
            for n in range(len(category_titles)): # see which column heading in the DF to update
                try:
                    ts = re.search(category_titles[n],l.text).span()[1] # find the start of the value to update
                    # If the 0th column is the current column,
                    #  Populate the Text Date on the current row
                    #  Go to the next row in the dataframe each time the 0'th column is identified
                    if n == 0 and re.search(category_titles[n],l.text).span()[1]>0:
                        df_meta.update(pd.DataFrame({"TxtDate:": [k]}, index = [row]))
                        it +=1 # Iterate the next row in the dataframe each time the 0'th column is identified
                    try:
                        te = re.search(category_titles[n+1],l.text).span()[0] # find the end of the value to update
                    except: te=None
                except: continue
                df_meta.update(pd.DataFrame({category_titles[n]: [l.text[ts+1:te].strip().replace(u'\t',"")]}, index=[row]))

In [6]:
df_meta.info()
df_meta.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Cassette Number / Minutes:  35 non-null     object
 1   Conversation Number:        35 non-null     object
 2   Location:                   35 non-null     object
 3   Exhibit Number:             35 non-null     object
 4   Abstract:                   35 non-null     object
 5   Participants:               35 non-null     object
 6   TxtDate:                    35 non-null     object
dtypes: object(7)
memory usage: 2.2+ KB


Unnamed: 0,Cassette Number / Minutes:,Conversation Number:,Location:,Exhibit Number:,Abstract:,Participants:,TxtDate:
30,E - 22 Segment 3 (7 minutes) | \n\n\n\nFile...,430-4,Old Executive Office Building Office,"Exhibit 31 – U.S. v. John N. Mitchell, et al.",Topics discussed include: implications of Dean...,President Nixon \nH.R.Haldeman \nJohn D. Ehrli...,"Wednesday, April 25, 1973"
31,E - 23 Segment 1 (42 minutes) | \n\n\nFile\...,430-22,Old Executive Office Building Office,"Exhibit 32 – U.S. v. John N. Mitchell, et al.",Topics discussed include: listening to the Tap...,President Nixon \nH.R.Haldeman,"Wednesday, April 25, 1973"
32,E - 24 Segment 1 (9 minutes) |\n\n\nFile\n\n...,38-150,Old Executive Office Building Office,"Exhibit 33 – U.S. v. John N. Mitchell, et al.",Subjects covered include: discussion of the po...,President Nixon \nH.R.Haldeman,"Wednesday, April 25, 1973"
33,E - 24 Segment 2 (18 minutes) | \n\n\n\nFil...,,[Dictabelt Recording],"Exhibit 35 – U.S. v. John N. Mitchell, et al.",Topics include: Colson's need to avoid specifi...,Charles W. Colson \nE. Howard Hunt \nH.R.Haldeman,Unknown
34,E - 24 Segment 3 (11 minutes) | \n\n\n\nFil...,,[Dictabelt Recording],"Exhibit 37 – U.S. v. John N. Mitchell, et al.",Topics discussed include Kalmbach's upcoming t...,John D. Ehrlichman \nHerbert W. Kalmbach \nH.R...,"Thursday, April 19, 1973"


#### Perform data grooming on exhibit metadata

In [7]:
# Parse the relevant info in df_meta and remove the unicode characters
df_meta['Cassette Number / Minutes:'] = df_meta['Cassette Number / Minutes:'].apply(
    lambda x: re.split('\(|\)|File|\.pdf', x.replace(u'\n',"").replace(u'\xa0',"").strip()))

In [8]:
# Add relevant columns
df_meta['Cassette Number:'] = df_meta['Cassette Number / Minutes:'].apply(lambda x: x[0].strip())
df_meta['Minutes:'] = df_meta['Cassette Number / Minutes:'].apply(lambda x: int(x[1][:-7].strip()))
df_meta['Exhibit:'] = df_meta['Cassette Number / Minutes:'].apply(lambda x: x[3].strip())

In [9]:
# Manage exceptions
e11 = df_meta.iloc[7,0] # exhibit 11 has 2 descriptions with minutes timed; combine
e12 = df_meta.iloc[8:11].copy() # exhibit 12 has 3 descriptions with minutes timed; combine
e18 = df_meta.iloc[16:19].copy() # exhibit 18 has 3 descriptions with minutes timed; combine
e35 = 'Wednesday, November 1, 1972'# exhibit 35 has an 'unknown' date; update to November 1972 per pdf header, use 1st day

df_meta.update(pd.DataFrame({'Cassette Number / Minutes:':[e11],
                           'Cassette Number:': e11[0].strip()+ e11[2].strip(),
                           'Minutes:': int(e11[1][:-7].strip())+int(e11[3][:-7].strip()),
                           'Exhibit:': e11[5].strip(),},
                           index=[7]))

df_meta.update(pd.DataFrame({'Cassette Number:': [e12['Cassette Number:']],
                           'Minutes:': sum(e12['Minutes:'])}
                           , index=[8]))

df_meta.update(pd.DataFrame({'Cassette Number:': [e18['Cassette Number:']],
                           'Minutes:': sum(e18['Minutes:'])}
                           , index=[16]))

df_meta.update(pd.DataFrame({'TxtDate:': e35}
                           , index=[33]))

#drop duplicate rows
df_meta = df_meta.drop(index=[9,10,17,18])
              
#drop unused column
df_meta = df_meta.drop(columns=['Cassette Number / Minutes:'])

In [10]:
# Set Row Index
df_meta.set_index(['Exhibit:'], inplace=True)

In [11]:
# Clean up remaining fields
df_meta['Abstract:'] = df_meta['Abstract:'].apply(lambda x: x.replace(u'\n',"").replace(u'\xa0',"").strip())
df_meta['Participants:'] = df_meta['Participants:'].apply(lambda x: x.replace(u'\xa0',"").strip(). splitlines())

In [12]:
# Add Date field with datetime value of text date
def fixdate(txtdate):
    try: dt = datetime.strptime(txtdate, '%A, %B %d, %Y') 
    except: dt=None
    return dt

df_meta['Date:'] = df_meta['TxtDate:'].apply(lambda x: fixdate(x))

In [13]:
df_meta.info()
df_meta # review full DataFrame
# there should be 31 records.

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, connally_exhibit_1 to exhibit_37
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Conversation Number:  31 non-null     object        
 1   Location:             31 non-null     object        
 2   Exhibit Number:       31 non-null     object        
 3   Abstract:             31 non-null     object        
 4   Participants:         31 non-null     object        
 5   TxtDate:              31 non-null     object        
 6   Cassette Number:      31 non-null     object        
 7   Minutes:              31 non-null     float64       
 8   Date:                 31 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 2.4+ KB


Unnamed: 0_level_0,Conversation Number:,Location:,Exhibit Number:,Abstract:,Participants:,TxtDate:,Cassette Number:,Minutes:,Date:
Exhibit:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
connally_exhibit_1,472-21,White House Oval Office,Exhibit 1 – U.S. v. John B. Connally,A discussion of the Associated Milk Producers ...,"[President Nixon, John D. Ehrlichman, John B. ...","Tuesday, March 23, 1971",E - 1 Segment 1,30.0,1971-03-23
exhibit_01,741-2,White House Oval Office,"Exhibit 1 – U.S. v. John B. Connally, et al.","""TheSmoking Gun"" conversation: Haldeman and Ni...","[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 1,8.0,1972-06-23
exhibit_02,741-10,White House Oval Office,"Exhibit 2 – U.S. v. John N. Mitchell, et al.",A discussion of how Hunt's involvement in the ...,"[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 2,4.0,1972-06-23
exhibit_03,343-36,Old Executive Office Building Office,"Exhibit 3 – U.S. v. John N. Mitchell, et al.",Haldeman discusses his meeting with Vernon Wal...,"[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 3,6.0,1972-06-23
exhibit_04,779-2,White House Oval Office,"Exhibit 4 – U.S. v. John N. Mitchell, et al.",A discussion of press treatment of the break-i...,"[President Nixon, H.R.Haldeman, John W. Dean III]","Friday, September 15, 1972",E - 3 Segment 1,34.0,1972-09-15
exhibit_05,394-21,Old Executive Office Building Office,"Exhibit 5 – U.S. v. John N. Mitchell, et al.",The President and Colson discuss the possible ...,"[President Nixon, Charles W. Colson]","Monday, January 08, 1973",E - 4 Segment 1,7.0,1973-01-08
exhibit_10,882-12,White House Oval Office,"Exhibit 10 – U.S. v. John N. Mitchell, et al.","A discussion between the President, Dean, and ...","[President Nixon, H.R.Haldeman, John W. Dean III]","Saturday, March 17, 1973",E - 4 Segment 2,21.0,1973-03-17
exhibit_11,885-7,White House Oval Office,"Exhibit 11 – U.S. v. John N. Mitchell, et al.",This conversation between the President and Ha...,"[President Nixon, H.R.Haldeman]","Tuesday, March 20, 1973",E - 5 Segment 1and E - 6 Segment 1,47.0,1973-03-20
exhibit_12,886-8,White House Oval Office,"Exhibit 12 – U.S. v. John N. Mitchell, et al.\...","The ""Cancer on the Presidency"" conversation: D...","[President Nixon, H.R.Haldeman, John W. Dean III]","Wednesday, March 21, 1973",8 E - 7 Segment 1 9 E - 8 Segment 1 10...,83.0,1973-03-21
exhibit_13,421-18,Old Executive Office Building Office,"Exhibit 13 – U.S. v. John N. Mitchell, et al.",Dean advises the President about Hunt's involv...,"[President Nixon, H.R.Haldeman, John W. Dean I...","Wednesday, March 21, 1973",E - 10 Segment 1,36.0,1973-03-21


### Process Transcript Data

#### Download transcripts per exhibit

In [14]:
# collect the pdf file links to the annotated transcripts
pdf_urls = []
for i in all_pdfs:
    txt = str(i.string)
    if "pdf" in txt: # Identify links with 'pdf'
        pdf_urls.append(i['href'])

#### Extract Text from downloaded PDFs

In [15]:
### From provided url, writes pdf file into the local directory
def download_file(url):

    # generate a filename to store the pdf in the local directory, based on the URL
    local_pdf_filename = url.split('/')[-1]
    
    # write the pdf file to the local directory, must be written in binary mode ('wb')
    with requests.get(url) as r:
        with open(local_pdf_filename, 'wb') as f:
            f.write(r.content) 
            
    return local_pdf_filename

## Thank you: https://stackoverflow.com/questions/64911851/cant-open-a-pdf-file-using-pdfplumber-open
# requests library and pdfplumber library must be imported

In [16]:
### From provided local filename, extracts text using pdfplumber
def extractText(local_pdf_filename):

    # extract the text
    with pdfplumber.open(local_pdf_filename) as pdf:
        pages = pdf.pages # stores all pages
        fulltext = []
        pagetext = []
            
        for n in range(len(pages)):
            try:
                pagetext = pages[n].extract_text()
                fulltext.append([pagetext])
            except:
                print("Encountered Error!")
                errorstring = ("error on page"+" "+ str(n+1))
                fulltext.insert(0,errorstring)
                continue
    
    title = local_pdf_filename[:-4]
    
    return title, fulltext

In [17]:
# Process the files
# store each fulltext dataset in a dictionary identified by the name of the file

## Create a dictionary to store the outputs
text_dict = {}

# remove exhibit_10 pdf from list, it will be added manually in the next cell
pdf_urls.remove('https://www.nixonlibrary.gov/sites/default/files/forresearchers/find/tapes/watergate/trial/exhibit_10.pdf')

## Run the function on each transcript pdf
for p in pdf_urls:
    filelist = download_file(p)
    transcript = extractText(filelist)
    print(filelist, len(transcript[1]))

    k = transcript[0] # title - filename
    v = transcript[1] # transcript
    text_dict.update({k:v})

connally_exhibit_1.pdf 25
exhibit_01.pdf 9
exhibit_02.pdf 2
exhibit_03.pdf 3
exhibit_04.pdf 30
exhibit_05.pdf 5
exhibit_11.pdf 26
exhibit_12.pdf 108
exhibit_12.pdf 108
exhibit_12.pdf 108
exhibit_13.pdf 27
exhibit_14.pdf 5
exhibit_15.pdf 30
exhibit_16.pdf 44
exhibit_17.pdf 20
exhibit_18.pdf 85
exhibit_18.pdf 85
exhibit_18.pdf 85
exhibit_19.pdf 23
exhibit_20.pdf 13
exhibit_21.pdf 9
exhibit_22.pdf 10
exhibit_23.pdf 21
exhibit_24.pdf 15
exhibit_25.pdf 43
exhibit_26.pdf 13
exhibit_27.pdf 4
exhibit_28.pdf 11
exhibit_29.pdf 5
exhibit_31.pdf 5
exhibit_32.pdf 36
exhibit_33.pdf 8
exhibit_35.pdf 17
exhibit_37.pdf 11


In [18]:
# Manage exception: 
# manually converted Exhibit 10 from pdf to txt.
# read txt version of file

with open('exhibit_10.txt') as f:
    e10 = f.readlines()
    
# Update the exhibit 10 content in TextDict
text_dict['exhibit_10'] = e10

In [19]:
# Place the transcripts in a dataframe

df_text = pd.DataFrame([text_dict]).T
df_text.rename(columns = {0:'Full_Extracted_Text'}, 
            inplace = True)

df_text

Unnamed: 0,Full_Extracted_Text
connally_exhibit_1,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON,\nJ..."
exhibit_01,[[TRANSCRIPT OF A RECORDING OF A\nMEETING BETW...
exhibit_02,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...
exhibit_03,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...
exhibit_04,[[TRANSCRIPT OF A RECORDING OF A MEETING\nAMON...
exhibit_05,[[TRANSCRIPT OF RECORDING OF A MEETING\nBETWEE...
exhibit_11,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...
exhibit_12,[[TRANSCRIPT OF A RECORDING OF A\nMEETING AMON...
exhibit_13,[[TRANSCRIPT OF A RECORDING OF A\nMEETING AMON...
exhibit_14,[[TRANSCRIPT OF A TELEPHONE CONVERSATION\nBETW...


### Create the df_all dataframe containing all relevant metadata and transcript data

In [20]:
# Join the text and meta DataFrames

# check shapes
print("before: ",df_meta.shape, '\n\t', df_text.shape)

# merge the DFs
df_all = pd.merge(df_text, df_meta, left_index=True, right_index=True)

# verify shape
print("after:  ",df_all.shape)

df_all.head()

before:  (31, 9) 
	 (31, 1)
after:   (31, 10)


Unnamed: 0,Full_Extracted_Text,Conversation Number:,Location:,Exhibit Number:,Abstract:,Participants:,TxtDate:,Cassette Number:,Minutes:,Date:
connally_exhibit_1,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON,\nJ...",472-21,White House Oval Office,Exhibit 1 – U.S. v. John B. Connally,A discussion of the Associated Milk Producers ...,"[President Nixon, John D. Ehrlichman, John B. ...","Tuesday, March 23, 1971",E - 1 Segment 1,30.0,1971-03-23
exhibit_01,[[TRANSCRIPT OF A RECORDING OF A\nMEETING BETW...,741-2,White House Oval Office,"Exhibit 1 – U.S. v. John B. Connally, et al.","""TheSmoking Gun"" conversation: Haldeman and Ni...","[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 1,8.0,1972-06-23
exhibit_02,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...,741-10,White House Oval Office,"Exhibit 2 – U.S. v. John N. Mitchell, et al.",A discussion of how Hunt's involvement in the ...,"[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 2,4.0,1972-06-23
exhibit_03,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...,343-36,Old Executive Office Building Office,"Exhibit 3 – U.S. v. John N. Mitchell, et al.",Haldeman discusses his meeting with Vernon Wal...,"[President Nixon, H.R.Haldeman]","Friday, June 23, 1972",E - 2 Segment 3,6.0,1972-06-23
exhibit_04,[[TRANSCRIPT OF A RECORDING OF A MEETING\nAMON...,779-2,White House Oval Office,"Exhibit 4 – U.S. v. John N. Mitchell, et al.",A discussion of press treatment of the break-i...,"[President Nixon, H.R.Haldeman, John W. Dean III]","Friday, September 15, 1972",E - 3 Segment 1,34.0,1972-09-15


In [21]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, connally_exhibit_1 to exhibit_10
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Full_Extracted_Text   31 non-null     object        
 1   Conversation Number:  31 non-null     object        
 2   Location:             31 non-null     object        
 3   Exhibit Number:       31 non-null     object        
 4   Abstract:             31 non-null     object        
 5   Participants:         31 non-null     object        
 6   TxtDate:              31 non-null     object        
 7   Cassette Number:      31 non-null     object        
 8   Minutes:              31 non-null     float64       
 9   Date:                 31 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 2.7+ KB


## Clean Data


### Standardize the dataframe

In [22]:
# rename columns for best practices
new_names = {k:k.strip().replace(':','').replace(' ','_').lower() for k in df_all.columns}
df_all.rename(columns=new_names, inplace=True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, connally_exhibit_1 to exhibit_10
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   full_extracted_text  31 non-null     object        
 1   conversation_number  31 non-null     object        
 2   location             31 non-null     object        
 3   exhibit_number       31 non-null     object        
 4   abstract             31 non-null     object        
 5   participants         31 non-null     object        
 6   txtdate              31 non-null     object        
 7   cassette_number      31 non-null     object        
 8   minutes              31 non-null     float64       
 9   date                 31 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 2.7+ KB


### Parse text to isolate speech and metadata within the documents

In [23]:
def flatten(line):
    out = []
    for item in line:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out

# copied from - ref: http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

In [24]:
# Flatten lists
flat_list = df_all.iloc[:,0].apply(lambda x: flatten(x))

# Then flatten text within listas
flat_text = flat_list.apply(lambda x: " ".join(map(str, x)))

df_all["flattened"] = flat_list.apply(lambda x: " ".join(map(str, x)))
#ref: https://stackoverflow.com/questions/12453580/how-to-concatenate-items-in-a-list-to-a-single-string

In [25]:
# functions for initial grooming

# prints all text contained in () for visual examination prior to removing
def examine_drops(text):
  #  print(text)
    regex = re.compile(".*?\((.*?)\)")
    result = re.findall(regex, text)
    # print(result)
    return(result)

# removes new line characters and all () including text
def transcript_initialgroom(text):
    #text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s", " ", text)
    text = re.sub(r'\\\\',"",text)
    text = re.sub(r'\\',"",text)
    text = re.sub("[\(\[].*?[\)\]]", "", text)
    return(text)

# drops speakers and replaces with ""
def drop_speakers(text):
    text = re.sub(r'\b[A-Z]{2,}\b:', "", text)
    return(text)

def drop_repeating(text):
    text = re.sub(r'\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\b.*(?!/n)', "", text)
#    text = re.sub(r'\b(?:APRIL|JUNE|SEPTEMBER)\b.*(?!/n)', "\nFOUND A FREAKING MATCH\n", text)
    return(text)

# extracts speech header, and splits into two panda series objects
def extract_speechheader(exhibit_speech):
    """Attempts the most conservative pattern of uppercase NAME: followed by a word with lowercase.
       Then, requires an uppercase NAME:, and finally allows a camelcase Name:
       If these fail, no header will be defined (span ends at 0)"""
    patt1 = r'(\b[A-Z]{2,}\b:\s+(..[a-z]|.[a-z]))' # the NAME: is uppercase and followed by a word with lowercase letters
    patt2 = r'(\b[A-Z]{2,}\b:)' # the NAME: is uppercase
    patt3 = r'([A-Z][A-Za-z]+\b:)' # the first colon following any set of letters
    if re.search(patt1, exhibit_speech)!=None: patt=patt1
    elif re.search(patt2, exhibit_speech)!=None: patt = patt2
    elif re.search(patt3, exhibit_speech)!=None: patt = patt3
    try:
        headerend = re.search(patt, exhibit_speech)[0]
        headerendspan = re.search(patt, exhibit_speech).span()[0]
    except: headerend = headerendspan = 0
    header = exhibit_speech[:headerendspan]
    speech = exhibit_speech[headerendspan:]
    return pd.Series([header, speech])

In [26]:
# extracts and returns start and end time from the speech header
def timehdr(text):
    """finds and calculates the FROM-TO times in the text, assuming from is first and to is last"""
    patt = re.compile(r'([0-9]+:\w\w[\sAaPp])') #[^/]+$ brings the rest of the line; +\b brings the rest of the word; ref Joey https://stackoverflow.com/questions/11347868/regex-to-get-last-word-from-sentence-of-words-separated-by
    iter = patt.finditer(text)
    wrds = [i for m in iter for i in m.span()]
    try:
        # Find the start and end times in the text
        s,e = min(wrds), max(wrds)+3
        times = text[s:e].replace(".","").replace("*","").replace("-"," ").strip().upper().split()
        s_time = re.sub("[A-Z]","1",times[0]) # assumes start time is first in the list; any letters will be replaced with 1
        e_merid = times[-1][0]+"M" # assumes AM or PM are the final element in the list
        e_time = re.sub("[A-Z]","1",times[-2]) # assumes end time is second to last in the list; any letters will be replaced with 1

        # determine AM or PM
        st = int(s_time.split(":")[0]) # find the hour of the start time
        et = int(e_time.split(":")[0]) # find the hour of the end time
        if st==12: s_merid=e_merid
        elif st>et or et==12:
            if e_merid=="PM": s_merid="AM"
            elif st>et and e_merid=="AM": s_merid="PM"
        else: s_merid=e_merid
        times = [s_time+s_merid,e_time+e_merid]
        
        # calculate date-time
        sdtime = pd.to_timedelta(str(datetime.strptime(times[0], '%I:%M%p').time())) # start time
        edtime = pd.to_timedelta(str(datetime.strptime(times[1], '%I:%M%p').time())) # end time
    except: sdtime = edtime = pd.to_timedelta('00:00:00')
    diff = edtime-sdtime # time difference between start/end
    return sdtime, edtime, diff

In [27]:
# examine words in paragraphs
df_all['dropped'] = df_all['flattened'].apply(lambda row: examine_drops(row))

In [28]:
# apply initial grooming to flattened dataset (removes new line characters and all () including text
df_all[['speech_header', 'speech']] = df_all['flattened'].apply(lambda row: extract_speechheader(row))

In [29]:
# remove repeating row from flattened dataset
df_all['speech_flat_norepeat'] = df_all['speech'].apply(lambda row: drop_repeating(row))

In [30]:
# apply initial grooming to flattened dataset (removes new line characters and all () including text
df_all['speech_groomed'] = df_all['speech_flat_norepeat'].apply(lambda row: transcript_initialgroom(row))

In [31]:
# apply initial grooming to the speech header
df_all['speech_header'] = df_all['speech_header'].apply(
    lambda x: transcript_initialgroom(x))

In [32]:
# apply drop speakers to groomed speech
df_all['speech_final'] = df_all['speech_groomed'].apply(lambda row: drop_speakers(row))

In [33]:
# run the function to get time from speech header data
dfsh = df_all['speech_header'].copy()
timer = dfsh.apply(lambda row: timehdr(row))
dfsh = pd.DataFrame([[a,b,c] for a,b,c in timer.values], 
                    columns = ['start_time','end_time','time_diff'], index = timer.index)

# # add start, end, diff columns to df_all
df_all = pd.concat([df_all,dfsh], axis=1)
df_all['start_dtime'] = df_all['date']+df_all['start_time']
df_all['end_dtime'] = df_all['date']+df_all['end_time']

# ref: jezrael https://stackoverflow.com/questions/57847521/summing-two-datetime-columns
# ref: cs95 https://stackoverflow.com/questions/53402584/how-to-convert-a-series-of-tuples-into-a-pandas-dataframe 

### Finalize the dataframe columns

In [34]:
# Select df_all columns to carry forward
df_all = df_all[['exhibit_number','conversation_number','cassette_number',
                'abstract','location','participants','minutes','txtdate',
                'date','start_dtime','end_dtime','time_diff','full_extracted_text',
                'dropped', 'speech_header','speech_flat_norepeat','speech_final']]

In [35]:
display(df_all.info())
df_all.head()

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, connally_exhibit_1 to exhibit_10
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   exhibit_number        31 non-null     object         
 1   conversation_number   31 non-null     object         
 2   cassette_number       31 non-null     object         
 3   abstract              31 non-null     object         
 4   location              31 non-null     object         
 5   participants          31 non-null     object         
 6   minutes               31 non-null     float64        
 7   txtdate               31 non-null     object         
 8   date                  31 non-null     datetime64[ns] 
 9   start_dtime           31 non-null     datetime64[ns] 
 10  end_dtime             31 non-null     datetime64[ns] 
 11  time_diff             31 non-null     timedelta64[ns]
 12  full_extracted_text   31 non-null     object  

None

Unnamed: 0,exhibit_number,conversation_number,cassette_number,abstract,location,participants,minutes,txtdate,date,start_dtime,end_dtime,time_diff,full_extracted_text,dropped,speech_header,speech_flat_norepeat,speech_final
connally_exhibit_1,Exhibit 1 – U.S. v. John B. Connally,472-21,E - 1 Segment 1,A discussion of the Associated Milk Producers ...,White House Oval Office,"[President Nixon, John D. Ehrlichman, John B. ...",30.0,"Tuesday, March 23, 1971",1971-03-23,1971-03-23 17:05:00,1971-03-23 17:38:00,0 days 00:33:00,"[[MEETING AMONG PRESIDENT RICHARD M. NIXON,\nJ...",[],"MEETING AMONG PRESIDENT RICHARD M. NIXON, JOHN...","PRESIDENT: Hi, Phil, how are you?\nCAMPBELL: M...","Hi, Phil, how are you? Mr. President. Sorry..."
exhibit_01,"Exhibit 1 – U.S. v. John B. Connally, et al.",741-2,E - 2 Segment 1,"""TheSmoking Gun"" conversation: Haldeman and Ni...",White House Oval Office,"[President Nixon, H.R.Haldeman]",8.0,"Friday, June 23, 1972",1972-06-23,1972-06-23 10:04:00,1972-06-23 11:39:00,0 days 01:35:00,[[TRANSCRIPT OF A RECORDING OF A\nMEETING BETW...,"[coughs, Unintelligible, unintelligible, repl....",TRANSCRIPT OF A RECORDING OF A MEETING BETWEEN...,"HALDEMAN: okay -that's fine. Now, on the inves...","okay -that's fine. Now, on the investi- gatio..."
exhibit_02,"Exhibit 2 – U.S. v. John N. Mitchell, et al.",741-10,E - 2 Segment 2,A discussion of how Hunt's involvement in the ...,White House Oval Office,"[President Nixon, H.R.Haldeman]",4.0,"Friday, June 23, 1972",1972-06-23,1972-06-23 13:04:00,1972-06-23 13:13:00,0 days 00:09:00,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...,"[On the phone, Unintelligible, 40 second pause...",TRANSCRIPT OF A RECORDING OF A MEETING BETWEEN...,HALDEMAN: (On the phone) (Unintelligible) Wher...,Where are they? Okay. I'll be up in just a...
exhibit_03,"Exhibit 3 – U.S. v. John N. Mitchell, et al.",343-36,E - 2 Segment 3,Haldeman discusses his meeting with Vernon Wal...,Old Executive Office Building Office,"[President Nixon, H.R.Haldeman]",6.0,"Friday, June 23, 1972",1972-06-23,1972-06-23 14:20:00,1972-06-23 14:45:00,0 days 00:25:00,[[TRANSCRIPT OF A RECORDING OF A MEETING\nBETW...,"[Background noise, unintelligible, telephone r...",TRANSCRIPT OF A RECORDING OF A MEETING BETWEEN...,"HALDEMAN: Well, it's no problem. Had the...two...","Well, it's no problem. Had the...two of them ..."
exhibit_04,"Exhibit 4 – U.S. v. John N. Mitchell, et al.",779-2,E - 3 Segment 1,A discussion of press treatment of the break-i...,White House Oval Office,"[President Nixon, H.R.Haldeman, John W. Dean III]",34.0,"Friday, September 15, 1972",1972-09-15,1972-09-15 17:27:00,1972-09-15 18:17:00,0 days 00:50:00,[[TRANSCRIPT OF A RECORDING OF A MEETING\nAMON...,"[FIRST INSTALLMENT, Unintelligible, Laughs, Un...",TRANSCRIPT OF A RECORDING OF A MEETING AMONG T...,"DEAN: Yes sir.\nPRESIDENT: Well, you had quite...","Yes sir. Well, you had quite a day today, di..."


## Store Data

### Pickle the cleaned dataframes 

In [36]:
# pickle initial dataframe to avoid processing datagrooming
output = open('cleaned_Watergate.pkl', 'wb')
sys.setrecursionlimit(100000)

pickle.dump(df_all, output)

output.close()

### Pickle Import block
Copy this cell to any notebook to retrieve and unpickle the dataframes

In [37]:
import pprint, pickle
import pandas as pd

# unpickle preserved dataframes to continue EDA and Statistical Analysis
pkl_file = open('cleaned_Watergate.pkl', 'rb')

df_all_wgtrial = pickle.load(pkl_file)
print("\n~~~df_all_wgtrial~~~\n")
print(df_all_wgtrial.info())

# Expected Results:

# Index: 31 entries, connally_exhibit_1 to exhibit_10
# Data columns (total 17 columns):
#  #   Column                Non-Null Count  Dtype          
# ---  ------                --------------  -----          
#  0   exhibit_number        31 non-null     object         
#  1   conversation_number   31 non-null     object         
#  2   cassette_number       31 non-null     object         
#  3   abstract              31 non-null     object         
#  4   location              31 non-null     object         
#  5   participants          31 non-null     object         
#  6   minutes               31 non-null     float64        
#  7   txtdate               31 non-null     object 
#  8   date                  31 non-null     datetime64[ns] 
#  9   start_dtime           31 non-null     datetime64[ns] 
#  10  end_dtime             31 non-null     datetime64[ns] 
#  11  time_diff             31 non-null     timedelta64[ns]
#  12  full_extracted_text   31 non-null     object         
#  13  dropped               31 non-null     object         
#  14  speech_header         31 non-null     object         
#  15  speech_flat_norepeat  31 non-null     object         
#  16  speech_final          31 non-null     object         
# dtypes: datetime64[ns](3), float64(1), object(12), timedelta64[ns](1)


~~~df_all_wgtrial~~~

<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, connally_exhibit_1 to exhibit_10
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   exhibit_number        31 non-null     object         
 1   conversation_number   31 non-null     object         
 2   cassette_number       31 non-null     object         
 3   abstract              31 non-null     object         
 4   location              31 non-null     object         
 5   participants          31 non-null     object         
 6   minutes               31 non-null     float64        
 7   txtdate               31 non-null     object         
 8   date                  31 non-null     datetime64[ns] 
 9   start_dtime           31 non-null     datetime64[ns] 
 10  end_dtime             31 non-null     datetime64[ns] 
 11  time_diff             31 non-null     timedelta64[ns]
 12  full_extracted_text   3