In [7]:
import os
import re
import pandas as pd
import hashlib  # for detecting duplicate
from tqdm.notebook import tqdm  # for progress bars

# directory holding the raw deposition text files that the functions below read.
# paths are built by plain string concatenation, so the trailing "/" is required
dir_name = "/content/data/"
# directory that create_csvs_from_directory writes the preprocessed CSV files to
# (also concatenated with the file name, so it needs its trailing "/" as well)
dir_name_csv = "/content/data_preprocessed_csv/"

In [8]:
def replace_space_with_underscore():
    """
    rename every entry of dir_name so that spaces in its name become underscores,
    then print the resulting directory listing, one entry per line
    """
    for original_name in os.listdir(dir_name):
        underscored_name = original_name.replace(" ", "_")
        os.rename(dir_name + original_name, dir_name + underscored_name)

    # list the directory again so the printed names reflect the renames
    for entry in os.listdir(dir_name):
        print(entry)


def strip_lines(lines):
    """
    return a new list in which each line has leading and trailing spaces,
    newlines and form-feed characters removed (other whitespace is kept)
    """
    return [line.strip(" \n\x0c") for line in lines]


def delete_blank_lines(lines):
    """
    given a list of already-stripped lines, return a new list without the
    lines that are exactly the empty string
    """
    return [line for line in lines if line != ""]


def remove_time_tags(lines):
    """
    given list of lines, return a new list where a time tag (hh:mm, optionally
    followed by :ss and AM/PM) at the start or at the end of a line is removed.
    a leading tag takes precedence: a line with both keeps its trailing tag
    """
    # every group in the timestamp is non-capturing, so group 1 of each
    # compiled pattern is always the text that survives
    timestamp = r"(?:\d\d:\d\d(?::\d\d)?(?:[AP]M)?)"
    leading_tag = re.compile("^" + timestamp + " +(.*)")
    # a trailing tag must be preceded by at least 2 spaces, because a time
    # can legitimately be mentioned at the end of a question or answer
    trailing_tag = re.compile("(.*) {2,}" + timestamp + "$")

    def without_tag(line):
        hit = leading_tag.match(line) or trailing_tag.match(line)
        return hit.group(1) if hit else line

    return [without_tag(line) for line in lines]


def is_start_of_question(text):
    """
    return True when text opens a question, i.e. it starts with 'Q' followed by
    a period or space, then one or more spaces, then a non-space character.
    return False otherwise
    """
    return re.match(r"^Q[\. ] +[^ ]", text) is not None


def is_identifying_questioner(text):
    """
    given text, determine if it contains an identification of the questioner
    of the form:
    BY MS. SMITH:
    (also accepted: 'By-' as prefix, and 'BY John Doe, ESQ')

    returns the upper-cased name (e.g. 'MS. SMITH') if found, False otherwise
    """
    pat = r"B[Yy][ -](M[RrSs]\. ?([A-Z][A-Za-z-]+\b ?)+|([A-Z][A-Za-z-]+\b ?)+(, ESQ))"
    match = re.search(pat, text)

    if match is None:
        return False

    # each name word in the pattern may swallow one following space, so the
    # captured name can end in a blank ('MR. SMITH '). strip it so the same
    # person always gets the same speaker label
    return match.group(1).strip().upper()


def find_first_question(lines):
    """
    find first question in deposition.
    returns the index into lines of the first numbered line whose text starts
    a question, or False when there is none.
    note that a hit at index 0 is falsy as well, so callers that need to tell
    the two apart must compare the result with 'is False'
    """
    for index, line in enumerate(lines):
        parsed = split_into_num_text(line)
        # lines without a leading line number cannot carry a question
        if parsed is None:
            continue
        _, text = parsed
        if is_start_of_question(text):
            return index

    return False


def is_start_of_answer(text):
    """
    determine if the text is the start of an answer, i.e. it starts with 'A'
    followed by a period or space, then one or more spaces, then a non-space
    character.

    returns True or False (a real bool in both cases, matching
    is_start_of_question)
    """
    return re.match(r"^A[\. ] +[^ ]", text) is not None


def find_bad_files(dir_name):
    """
    a file is considered bad if find_first_question finds no question in it
    manually checking suggests these files are not depositions

    dir_name: directory whose files are checked (each is read as windows-1252 text)
    returns: list of the names of the bad files
    """
    bad_files = []

    for filename in os.listdir(dir_name):
        # os.path.join works whether or not dir_name ends with a separator
        path = os.path.join(dir_name, filename)
        with open(path, "r", encoding="windows-1252") as f:
            lines = f.readlines()

        lines = strip_lines(lines)
        lines = delete_blank_lines(lines)
        lines = remove_time_tags(lines)

        # find_first_question returns an index on success and False on failure.
        # a plain truthiness test would also reject a question found at index 0,
        # so compare against the sentinel explicitly
        first_question = find_first_question(lines)
        if first_question is False or first_question is None:
            bad_files.append(filename)

    return bad_files


def is_page_number(line):
    """
    given line from file, determine if it consists only of digits, which is
    how a page numbering appears.
    if yes, return the page number as an int (leading zeros are ignored)
    if not, return 0
    """
    found = re.match(r"0*(\d+)$", line)
    return int(found.group(1)) if found else 0


def find_current_page_number(lines):
    """
    given lines from file (after selecting core) determine current page number.
    returns None when no line is recognised as a page number
    """
    # the first page number met while reading forward belongs to the page
    # after the current one, hence the "- 1" below
    page_numbers = (is_page_number(line) for line in lines)
    next_page = next((number for number in page_numbers if number), None)

    if next_page is None:
        return None
    return next_page - 1


def split_into_num_text(line):
    """
    given line, split it into its line number and the text after it.
    an optional 'digits:' prefix in front of the line number is discarded.
    returns (line_number, text), or None if the line does not start with a
    number followed by at least one space
    """
    found = re.match(r"^(\d*:)?(\d+) +(.*)$", line)
    if found is None:
        return None

    _prefix, number, text = found.groups()
    return int(number), text


def is_start_of_side_chat(text):
    """
    given text, determine if it is first line of 'side chat'
    e.g. the second line in following:
    Q.  Doesn't this document imply x did y?
         MR. SMITH: Objection form.
    if yes, return the person speaking (the upper-case words before the colon)
    if not, return False
    """
    # known limitation: any multi-word upper-case label followed by a colon
    # matches, so things like WITNESS NAME: are picked up as well
    found = re.match(r"^([A-Z]+\.?( [A-Z]+)+): +[\w\(-]", text)
    return found.group(1) if found else False


def is_start_of_brackets(text):
    """
    given text, determine if it is the start of text that is contained in
    brackets: it must open with '(' and hold no further brackets, except for
    an optional ')' as its very last character.
    returns the text itself if so, False otherwise
    """
    found = re.match(r"^\([^\(\)]*\)?$", text)
    return found.group(0) if found else False


def is_only_symbols(text):
    """
    given string, determine if it is made up of only symbols, i.e. holds no
    ASCII letter or digit
    intention is to find things like linebreaks '- - -'
    """
    # '--' marks somebody being interrupted just before they were going to
    # speak, so it does not count as a symbols-only line
    if text == "--":
        return False

    return re.search("[a-zA-Z0-9]", text) is None


def create_dataframe_from_file(filename):
    """
    given the name of a deposition file inside dir_name, read it as
    windows-1252 text, clean its lines and build a dataframe from them
    """
    with open(dir_name + filename, "r", encoding="windows-1252") as handle:
        raw_lines = list(handle)

    # same cleaning order as elsewhere: strip, drop blanks, drop time tags
    cleaned_lines = remove_time_tags(delete_blank_lines(strip_lines(raw_lines)))

    return create_dataframe_from_lines(cleaned_lines)


def create_dataframe_from_lines(lines):
    """
    this is the most important function in preprocessing. it loops through all the lines,
    checks what kind of line it is, and combines that line with previous lines as appropriate.
    then outputs a dataframe

    lines: cleaned lines of a deposition (stripped, no blanks, no time tags)
    returns: a DataFrame with one row per segment (question, answer, side chat,
    bracketed remark, symbol line), or None if no question is found in lines.
    the first row holds whatever was collected before the first segment started
    (text_type None); the last row is flushed with time_added 'end'
    """
    first_question = find_first_question(lines)
    # find_first_question signals failure with False. it must be tested before
    # doing arithmetic on it (False - 5 == -5), and by identity, because a
    # question at index 0 is a valid result that is also falsy
    if first_question is False or first_question is None:
        print("could not find start")
        return None

    # choose starting point as 5 lines above first question. this is because the questioner is almost always
    # identified within the few lines before first question.
    # clamp at 0: a negative start would make lines[start:] slice from the END of the file
    start = max(first_question - 5, 0)

    # initialize various parameters
    ongoing_indice = 0
    current_page_number = find_current_page_number(lines[start:])
    if current_page_number is None:
        # no page marker anywhere after start: count pages from 0 instead of
        # failing on "None += 1" at the first line-number reset
        current_page_number = 0
    current_line_number = 0
    ongoing_page_number = 0
    ongoing_line_number = 0
    ongoing_text = ""
    ongoing_line_type = None
    ongoing_questioner = None
    ongoing_speaker = None

    # the indice and time_added columns are included to help with debugging. they have no use for end-users
    columns = [
        "indice",
        "page_number",
        "line_number",
        "text",
        "text_type",
        "speaker",
        "time_added",
    ]
    data = []

    # everything before start is preamble and is ignored
    for i, line in enumerate(lines[start:], start=start):
        # page numbering is derived from line numbers and the first page
        # number (see below), so page-number lines themselves are skipped
        if is_page_number(line):
            continue

        splits = split_into_num_text(line)
        if splits is None:
            continue

        line_number, text = splits

        # line numbers restart on every page, so a drop means a new page began
        if line_number < current_line_number:
            current_page_number += 1
        current_line_number = line_number

        # classify the line. a line that opens a new segment sets
        # (new_type, new_speaker, tag); any other line is handled in its
        # branch and skips the flush below with 'continue'
        if is_start_of_question(text):
            questioner = is_identifying_questioner(text)
            if questioner:
                ongoing_questioner = questioner
                # include something here to remove the identification of questioner from question
            new_type, new_speaker, tag = "q", ongoing_questioner, "start_question"

        elif is_start_of_answer(text):
            new_type, new_speaker, tag = "a", "THE WITNESS", "start_answer"

        elif is_start_of_side_chat(text):
            new_type, new_speaker, tag = (
                "side_chat",
                is_start_of_side_chat(text),
                "start_chat",
            )

        elif is_identifying_questioner(text):
            # only remember who asks the next questions; the line is not
            # added to any segment
            ongoing_questioner = is_identifying_questioner(text)
            continue

        elif is_start_of_brackets(text):
            new_type, new_speaker, tag = "brackets", None, "is brackets"

        elif is_only_symbols(text):
            new_type, new_speaker, tag = "symbols", None, "symbols"

        else:
            # continuation of the segment currently being collected
            ongoing_text += " " + text
            continue

        # flush the segment collected so far; time_added records what triggered the flush
        data.append(
            [
                ongoing_indice,
                ongoing_page_number,
                ongoing_line_number,
                ongoing_text,
                ongoing_line_type,
                ongoing_speaker,
                tag,
            ]
        )

        # start collecting the new segment from this line
        ongoing_indice = i
        ongoing_page_number = current_page_number
        ongoing_line_number = current_line_number
        ongoing_text = text
        ongoing_line_type = new_type
        ongoing_speaker = new_speaker

    # flush the segment that was still open when the lines ran out
    data.append(
        [
            ongoing_indice,
            ongoing_page_number,
            ongoing_line_number,
            ongoing_text,
            ongoing_line_type,
            ongoing_speaker,
            "end",
        ]
    )

    return pd.DataFrame(data, columns=columns)


def remove_a_q_from_text(text):
    """
    given a string, remove 'A. ' and 'Q. ' at the start of the string
    (the period is optional and any number of spaces after it is removed).
    returns the remainder, or text unchanged when it has no such prefix
    """
    # [AQ], not [A|Q]: inside a character class '|' is a literal character,
    # so the old class also stripped a leading '|'
    pat = r"^[AQ]\.? +(.*$)"
    match = re.match(pat, text)

    if match:
        return match.group(1)
    return text


def remove_a_q_from_text_in_frame(df):
    """
    given dataframe outputted from 'create_dataframe_from...',
    return a copy in which the 'A. ' and 'Q. ' prefix is removed from the
    text of rows whose text_type is 'a' or 'q'. the input frame is not modified
    """
    cleaned = df.copy()

    is_question_or_answer = cleaned["text_type"].isin(["a", "q"])
    stripped_text = cleaned.loc[is_question_or_answer, "text"].map(remove_a_q_from_text)
    cleaned.loc[is_question_or_answer, "text"] = stripped_text

    return cleaned


def remove_names_from_sidechat_text(df):
    """
    given dataframe outputted from 'create_dataframe_from...',
    remove the 'MR SMITH: ' or 'THE WITNESS: ' or similar
    from start of side chat

    returns a copy; the input frame is not modified
    """
    df_temp = df.copy()

    indices = df_temp.text_type == "side_chat"

    # split only at the FIRST ': ' (n=1). an unbounded split followed by taking
    # element 1 would cut the utterance off at its next colon, e.g.
    # 'MR. SMITH: Objection: form.' would become just 'Objection'.
    # taking the last piece also keeps a text without any ': ' intact
    df_temp.loc[indices, "text"] = (
        df_temp.loc[indices, "text"]
        .str.split(pat=": +", n=1)
        .map(lambda parts: parts[-1])
    )

    return df_temp


def create_csvs_from_directory(directory):
    """
    given a directory of text files of depositions,
    create csv files that extract data from them

    one csv per input file is written to dir_name_csv, named after the input
    file with its extension replaced by '.csv'. files for which no dataframe
    can be built are skipped
    """
    # NOTE(review): create_dataframe_from_file opens files relative to the
    # module-level dir_name, not relative to `directory`, so this only works
    # as intended when the two are the same directory
    os.makedirs(dir_name_csv, exist_ok=True)

    for filename in tqdm(os.listdir(directory)):
        print(f"starting on {filename}")
        df = create_dataframe_from_file(filename)
        if df is None:
            # nothing to post-process or save for this file
            print(f"skipping {filename}: no dataframe could be created")
            continue

        df = remove_a_q_from_text_in_frame(df)
        df = remove_names_from_sidechat_text(df)

        # splitext drops the real extension whatever its length
        # (filename[:-4] was only right for 3-letter extensions)
        stem = os.path.splitext(filename)[0]
        df.to_csv(os.path.join(dir_name_csv, stem + ".csv"))

In [9]:
# build one csv (written to dir_name_csv) for every file found in dir_name
create_csvs_from_directory(dir_name)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

starting on Bogel v. Jolly Trolley 02-11-20 Dean DiPietro - Inconsistencies.txt



In [17]:
# load one preprocessed deposition and pair every question with the next answer
df = pd.read_csv('/content/data_preprocessed_csv/Bogel v. Jolly Trolley 02-11-20 Dean DiPietro - Inconsistencies.csv')
q_a = []
text = list(df['text'])
text_type = list(df['text_type'])
for i in range(len(text)):
    if text_type[i] != 'q':
        continue
    # scan forward to the first following row of type 'a'. the bounds check
    # has to come first: testing text_type[c] before c < len(text) raised
    # IndexError for a question with no answer after it
    c = i + 1
    while c < len(text) and text_type[c] != 'a':
        c = c + 1
    # a question without any later answer produces no pair
    if c < len(text):
        q_a.append((text[i], text[c]))

In [19]:
# show the collected (question text, answer text) tuples
q_a

[('Good afternoon, Mr. DiPietro.', 'Hi.'),
 ("My name is Sarah Cole.  Thank you for being here for your deposition today. You were present for your wife Cecilia -- I'm sorry. You were present for your wife's deposition, correct?",
  'Yes.'),
 ('And you also heard the instructions that I gave to her prior or as we started the deposition?',
  'Yes.'),
 ("Okay.  Just in case, I'm going to go through them just briefly for you because it's been several hours since we went through them.",
  'Sure.'),
 ("We have a court reporter here with us today and he's going to be writing down everything that's being stated in this room.  So any question I give, any response you provide, any objections that are made, everything is going to be encapsulated for all time in a book and someone might need to go back and read that transcript.  To make that process easier for that person, I want to make sure that the transcript is as clear as possible. For that to happen, I'm going to ask us to take turns speaki