In [1]:
import os
import re
import pandas as pd
import hashlib  # for detecting duplicate
from tqdm.notebook import tqdm  # for progress bars

# directory holding the raw deposition text files. the trailing slash matters:
# several functions below build paths by plain string concatenation (dir_name + filename)
dir_name = "data/"
# directory the preprocessed csv files are written to (same trailing-slash caveat)
dir_name_csv = "data_preprocessed_csv/"

In [None]:
# mount Google Drive at /content/drive so the project folder used below is reachable
# (google.colab is only importable inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# make the project folder the working directory, so the relative paths in
# dir_name / dir_name_csv resolve against it
%cd /content/drive/MyDrive/contradiction

In [None]:
def replace_space_with_underscore():
    """
    rename every entry of dir_name so that spaces in its name become underscores,
    then print the names found in the directory after renaming (one per line)
    """
    for original in os.listdir(dir_name):
        source_path = dir_name + original
        target_path = dir_name + original.replace(" ", "_")
        os.rename(source_path, target_path)

    for renamed in os.listdir(dir_name):
        print(renamed)


def strip_lines(lines):
    """
    return a new list in which spaces, newlines and form feeds have been
    stripped from both ends of every line (other characters are left alone)
    """
    return [line.strip(" \n\x0c") for line in lines]


def delete_blank_lines(lines):
    """
    given a list of already-stripped lines, return a new list without the
    lines that are exactly the empty string
    """
    return [line for line in lines if line != ""]


def remove_time_tags(lines):
    """
    given list of lines, return a new list where a time tag (e.g. 10:15,
    10:15:30 or 10:15AM) at the start or at the end of a line is removed.
    a tag at the start takes precedence; at most one tag is removed per line
    """
    # every group inside the timestamp is non-capturing (?:...), so the only
    # capturing group in either pattern is the text that should be kept
    timestamp = r"(?:\d\d:\d\d(?::\d\d)?(?:[AP]M)?)"
    leading_tag = re.compile("^" + timestamp + " +(.*)")
    # a trailing tag must be preceded by at least 2 spaces, because a time can
    # legitimately be mentioned at the end of a question or answer
    trailing_tag = re.compile("(.*) {2,}" + timestamp + "$")

    def untag(line):
        hit = leading_tag.match(line) or trailing_tag.match(line)
        return hit.group(1) if hit else line

    return [untag(line) for line in lines]


def is_start_of_question(text):
    """
    return True when the text opens a question, i.e. it begins with 'Q'
    followed by a dot or a space, then one or more spaces, then a non-space
    character; return False otherwise
    """
    return re.match(r"^Q[\. ] +[^ ]", text) is not None


def is_identifying_questioner(text):
    """
    look anywhere in the text for an attribution of the form
        BY MS. SMITH:      or      BY JOHN DOE, ESQ
    returns the matched name part (everything after 'BY '), upper-cased,
    or False when no such attribution is present
    """
    pat = r"B[Yy][ -](M[RrSs]\. ?([A-Z][A-Za-z-]+\b ?)+|([A-Z][A-Za-z-]+\b ?)+(, ESQ))"
    found = re.search(pat, text)

    # group 1 is the outer group wrapping both alternatives, so it is always
    # set when the pattern matches
    return found.group(1).upper() if found else False


def find_first_question(lines):
    """
    find first question in deposition.
    returns the index into lines of the first numbered line whose text starts
    a question, or False when there is none.
    note that index 0 is a valid result and is falsy, so callers that need to
    tell 'not found' apart should compare the result with `is False`
    """
    for index, line in enumerate(lines):
        parts = split_into_num_text(line)
        # lines without a leading line number cannot be a question line
        if parts is not None and is_start_of_question(parts[1]):
            return index

    return False


def is_start_of_answer(text):
    """
    determine if the text is the start of an answer, i.e. it begins with 'A'
    followed by a dot or a space, then one or more spaces, then a non-space
    character.
    returns True or False (a real bool, like is_start_of_question, instead
    of the former True / None)
    """
    pat = r"^A[\. ] +[^ ]"

    return re.match(pat, text) is not None


def find_bad_files(dir_name):
    """
    a file is considered bad if the function find_first_question finds no question in it.
    manually checking suggests these files are not depositions

    dir_name: directory to scan (with or without a trailing slash)
    returns the list of bad file names (names only, not full paths)
    """
    bad_files = []

    for filename in os.listdir(dir_name):
        path = os.path.join(dir_name, filename)
        # sub-directories cannot be opened as text files
        if not os.path.isfile(path):
            continue

        with open(path, "r", encoding="windows-1252") as f:
            lines = f.readlines()

        lines = strip_lines(lines)
        lines = delete_blank_lines(lines)
        lines = remove_time_tags(lines)

        # find_first_question returns False when nothing is found, but index 0 is
        # a valid (and falsy) hit, so test for the sentinel by identity
        if find_first_question(lines) is False:
            bad_files.append(filename)

    return bad_files


def is_page_number(line):
    """
    given line from file, decide whether it is a page numbering, i.e. the
    whole line consists of digits only.
    returns the page number as an int (leading zeros dropped) if so,
    and 0 otherwise
    """
    found = re.match(r"0*(\d+)$", line)
    return int(found.group(1)) if found else 0


def find_current_page_number(lines):
    """
    given lines from file (after selecting core) determine current page number.
    the first page-number line encountered is taken to belong to the following
    page, so the current page is that number minus 1.
    returns None when no line is a (non-zero) page number
    """
    for candidate in map(is_page_number, lines):
        if candidate:
            return candidate - 1

    return None


def split_into_num_text(line):
    """
    given line, split it into its line number and the text after it.
    the number may carry a 'digits:' prefix (e.g. '3:14'), which is ignored.
    returns (line_number, text), or None if the line does not start with a
    number followed by at least one space
    """
    found = re.match(r"^(\d*:)?(\d+) +(.*)$", line)
    if found is None:
        return None

    _, number, text = found.groups()
    return int(number), text


def is_start_of_side_chat(text):
    """
    given text, decide whether it is the first line of 'side chat',
    e.g. the second line in the following:
    Q.  Doesn't this document imply x did y?
         MR. SMITH: Objection form.
    returns the speaker (the upper-case words before the colon) if so,
    and False otherwise
    """
    # this pattern is known to match too many things, e.g. WITNESS NAME:
    found = re.match(r"^([A-Z]+\.?( [A-Z]+)+): +[\w\(-]", text)
    if found is None:
        return False

    return found.group(1)


def is_start_of_brackets(text):
    """
    given text, decide whether it is (the start of) a bracketed remark: it must
    open with '(' and contain no further brackets except an optional ')' as the
    very last character.
    returns the matched text if so, and False otherwise
    """
    found = re.match(r"^\([^\(\)]*\)?$", text)
    return found.group() if found else False


def is_only_symbols(text):
    """
    given string, decide whether it contains no ascii letter or digit at all.
    intention is to find things like linebreaks '- - -'.
    the exact string '--' is excluded on purpose
    """
    # -- is used to indicate somebody was interrupted just before they were going to speak
    if text == "--":
        return False

    return re.search("[a-zA-Z0-9]", text) is None


def create_dataframe_from_file(filename):
    """
    read the deposition stored as dir_name + filename (windows-1252 encoded),
    clean its lines (strip, drop blank lines, remove time tags) and build the
    dataframe from them with create_dataframe_from_lines
    """
    path = dir_name + filename
    with open(path, "r", encoding="windows-1252") as handle:
        raw_lines = handle.readlines()

    cleaned = remove_time_tags(delete_blank_lines(strip_lines(raw_lines)))
    return create_dataframe_from_lines(cleaned)


def create_dataframe_from_lines(lines):
    """
    this is the most important function in preprocessing. it loops through all the lines,
    checks what kind of line it is, and combines that line with previous lines as appropriate.
    then outputs a dataframe

    lines: list of cleaned lines (stripped, no blank lines, time tags removed)

    returns a DataFrame with the columns
        indice, page_number, line_number, text, text_type, speaker, time_added
    or None (after printing a message) when no question can be found.

    every row is one entry: a question ('q'), an answer ('a'), 'side_chat',
    'brackets' or 'symbols'. lines that start none of these are appended to the
    text of the entry in progress. a row is written at the moment the NEXT entry
    starts, so time_added names the event that closed the row, and the first row
    is always the empty initial state (text '', text_type None).
    """
    first_question = find_first_question(lines)
    # find_first_question signals 'not found' with False. index 0 is a valid hit,
    # so the sentinel has to be checked by identity, and before any arithmetic
    # (False - 5 would silently become -5)
    if first_question is False:
        print("could not find start")
        return None

    # choose starting point as 5 lines above first question. this is because the questioner is almost always
    # identified within the few lines before first question. clamp at 0: a negative
    # start would make lines[start:] slice from the END of the file
    start = max(first_question - 5, 0)

    # page/line position of the line currently being read
    current_page_number = find_current_page_number(lines[start:])
    if current_page_number is None:
        # no page-number line after start: count pages from 0 instead of
        # crashing on the first page turn below
        current_page_number = 0
    current_line_number = 0

    # state of the entry that is being accumulated
    ongoing_indice = 0
    ongoing_page_number = 0
    ongoing_line_number = 0
    ongoing_text = ""
    ongoing_line_type = None
    ongoing_questioner = None
    ongoing_speaker = None

    # the indice and time_added columns are included to help with debugging. they have no use for end-users
    columns = [
        "indice",
        "page_number",
        "line_number",
        "text",
        "text_type",
        "speaker",
        "time_added",
    ]
    data = []

    # everything before start is preamble and is ignored
    for i in range(start, len(lines)):
        line = lines[i]

        # page_numbering is now determined by using line numbers and first page number
        # see few lines below
        if is_page_number(line):
            continue

        splits = split_into_num_text(line)
        if splits is None:
            continue

        line_number, text = splits

        # line numbers restart on every page, so a drop means a new page began
        if line_number < current_line_number:
            current_page_number += 1
        current_line_number = line_number

        # decide which kind of entry (if any) this line opens. the order of the
        # checks matters. opened = (time_added label, text_type, speaker)
        if is_start_of_question(text):
            questioner = is_identifying_questioner(text)
            if questioner:
                ongoing_questioner = questioner
                # include something here to remove the identification of questioner from question
            opened = ("start_question", "q", ongoing_questioner)

        elif is_start_of_answer(text):
            opened = ("start_answer", "a", "THE WITNESS")

        elif is_start_of_side_chat(text):
            opened = ("start_chat", "side_chat", is_start_of_side_chat(text))

        elif is_identifying_questioner(text):
            # a 'BY MR. X:' line only updates who is asking. it is not kept as text
            ongoing_questioner = is_identifying_questioner(text)
            continue

        elif is_start_of_brackets(text):
            opened = ("is brackets", "brackets", None)

        elif is_only_symbols(text):
            opened = ("symbols", "symbols", None)

        else:
            # continuation of the entry in progress
            ongoing_text += " " + text
            continue

        time_added, new_line_type, new_speaker = opened

        # close the entry in progress ...
        data.append(
            [
                ongoing_indice,
                ongoing_page_number,
                ongoing_line_number,
                ongoing_text,
                ongoing_line_type,
                ongoing_speaker,
                time_added,
            ]
        )

        # ... and start the new one at the current position
        ongoing_indice = i
        ongoing_page_number = current_page_number
        ongoing_line_number = current_line_number
        ongoing_text = text
        ongoing_line_type = new_line_type
        ongoing_speaker = new_speaker

    # the last entry is never followed by a new one, so write it out here
    data.append(
        [
            ongoing_indice,
            ongoing_page_number,
            ongoing_line_number,
            ongoing_text,
            ongoing_line_type,
            ongoing_speaker,
            "end",
        ]
    )

    return pd.DataFrame(data, columns=columns)


def remove_a_q_from_text(text):
    """
    given a string, remove 'A. ' and 'Q. ' at the start of the string
    (the dot is optional, one or more spaces are required).
    returns the remainder, or the text unchanged when it has no such prefix
    """
    # [AQ], not [A|Q]: inside a character class '|' is a literal, so the old
    # pattern also stripped a leading '|'
    pat = r"^[AQ]\.? +(.*$)"
    match = re.match(pat, text)

    if match:
        return match.group(1)
    else:
        return text


def remove_a_q_from_text_in_frame(df):
    """
    given dataframe outputted from 'create_dataframe_from...', return a copy in
    which the leading 'A. ' / 'Q. ' has been removed from the text of every row
    whose text_type is 'a' or 'q'. the input frame is not modified
    """
    result = df.copy()

    is_question_or_answer = result["text_type"].isin(["a", "q"])
    cleaned_text = result.loc[is_question_or_answer, "text"].map(remove_a_q_from_text)
    result.loc[is_question_or_answer, "text"] = cleaned_text

    return result


def remove_names_from_sidechat_text(df):
    """
    given dataframe outputted from 'create_dataframe_from...',
    remove the 'MR SMITH: ' or 'THE WITNESS: ' or similar
    from start of side chat

    returns a copy of df. only rows with text_type 'side_chat' are changed and
    the input frame is not modified
    """
    df_temp = df.copy()

    indices = df_temp.text_type == "side_chat"

    # split on the FIRST ': ' only (n=1). an unlimited split followed by [1]
    # dropped everything after a second colon, e.g. 'MR. SMITH: Objection: form.'
    # became just 'Objection'
    pieces = df_temp.loc[indices, "text"].str.split(pat=": +", n=1)

    # parts[-1] is the remainder after the speaker, or the whole text if there
    # was no separator. missing values are passed through untouched
    df_temp.loc[indices, "text"] = pieces.map(
        lambda parts: parts[-1], na_action="ignore"
    )

    return df_temp


def create_csvs_from_directory(directory):
    """
    given a directory of text files of depositions,
    create csv files that extract data from them

    one csv per input file is written into dir_name_csv, named after the input
    file with its extension replaced by '.csv'. entries whose name starts with
    '.' are skipped, as are files for which no dataframe could be built.

    NOTE: `directory` is only used to list the file names. the files themselves
    are opened by create_dataframe_from_file, which reads from the module-level
    dir_name, so the two must point at the same place.
    """
    # make sure the output directory exists before the first write
    os.makedirs(dir_name_csv, exist_ok=True)

    for filename in tqdm(os.listdir(directory)):
        # hidden entries are not depositions
        if filename.startswith("."):
            continue

        print(f"starting on {filename}")
        df = create_dataframe_from_file(filename)
        if df is None:
            # no dataframe could be built for this file, nothing to write
            continue

        df = remove_a_q_from_text_in_frame(df)
        df = remove_names_from_sidechat_text(df)

        # splitext instead of filename[:-4]: do not assume a 3-letter extension
        stem = os.path.splitext(filename)[0]
        df.to_csv(os.path.join(dir_name_csv, stem + ".csv"))

In [None]:
# preprocess every deposition listed in dir_name and write one csv per file into dir_name_csv
# NOTE(review): the saved output of this cell is a NameError - presumably the setup /
# definition cells had not been run in that session; confirm with a fresh Run All
create_csvs_from_directory(dir_name)

NameError: ignored

In [None]:
# load one preprocessed deposition and collect (question, answer) text pairs
df = pd.read_csv('/content/drive/MyDrive/contradiction/data_preprocessed_csv/Dixon v. Chen 03-12-20 Chun-Ming Chen.csv')
q_a = []
text = list(df['text'])
text_type = list(df['text_type'])
for i in range(len(text)):
    if text_type[i] == 'q':
        # pair the question with the next row of type 'a', skipping any rows
        # (side chat, further questions, ...) in between
        c = i + 1
        # the bounds check has to come first: with the operands the other way
        # round, text_type[c] raised IndexError for a trailing unanswered question
        while c < len(text) and text_type[c] != 'a':
            c = c + 1
        if c < len(text):
            q_a.append((text[i], text[c]))

In [None]:
# display the collected (question, answer) tuples
q_a

[("Mr. Chen, this is Andrew Rahaim. I'm the attorney for the plaintiffs. Can you hear me okay?",
  'Yes.'),
 ('All right.  Can you start by giving the court reporter your full name?',
  'My name is Chun-Ming Chen.'),
 ('And can you spell that for us?',
  "Sure.  It's C, as in Charlie, H, as in Harry, U, as in umbrella, N, as in Nancy, hyphen, M, as in Michael, I, as in India, N, as in Nancy, G, as in George. Last name, C, as in Charlie, H, as in Harry, E, as in Edward, N, as in Nancy."),
 ("All right.  And where do you live? What's your residence address right now?",
  "I'm currently living in Texas."),
 ("Okay.  We couldn't hear you.", '3900 Woodchase Drive --'),
 ('So give us your current address.', '3900 Woodchase Drive --'),
 ("Wait.  Say that again.  I'm sorry.",
  '3900 Woodchase Drive, Houston, Texas.'),
 ("What's the ZIP Code?", '77042.'),
 ('And who do you reside there with?', 'Myself and my mother.'),
 ('How old are you?', 'Twenty-seven.'),
 ("And what's your date of birth, s

In [None]:
# walk over every pair of distinct questions (i < j) taken from q_a
for i in range(len(q_a)-1):
  question1 = q_a[i][0]
  # NOTE(review): this list is re-created on every outer iteration and nothing in
  # this cell reads it, so only the pairs of the last outer iteration survive
  # the loop - confirm whether the batches were meant to be consumed or collected
  batch_of_pairs=[]
  for j in range(i+1,len(q_a)):
    question2 = q_a[j][0]
    batch_of_pairs.append([question1,question2])

In [None]:
# enter the fairseq checkout (relative to the current working directory) so the
# editable install in the next cell runs from inside it
%cd fairseq-master/

/content/drive/My Drive/contradiction/fairseq-master


In [None]:
# install the package found in the current directory in editable mode,
# then go back up to the parent directory
!pip install --editable .
%cd ..

Obtaining file:///content/drive/My%20Drive/contradiction/fairseq-master
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting omegaconf<2.1
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting sacrebleu>=1.4.12
  Downloading sacrebleu-1.5.1-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 3.5 MB/s 
Collecting hydra-core<1.1
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 11.6 MB/s 
[?25hCollecting antlr4-python3-runtime==4.8
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 12.5 MB/s 
Collecting PyYAML>=5.1.*
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 13.8 MB/s 
[?25hCollecting portalock

In [None]:
# additional packages - presumably needed by main.py, which is run below; confirm
# NOTE(review): versions are not pinned; the saved output shows transformers 4.9.1
# and vaderSentiment 3.3.2 being installed at the time this was run
!pip install transformers
!pip3 install vaderSentiment
!pip install pylcs

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 9.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.5 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.9.1
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 7.1 MB/s 
Installing collected packages: vaderSentiment
S

In [None]:
# Q.    All right.  So I understand you were fatigued at the time.  Can you describe the weather conditions at the time
# of the accident?
# A.    At that time it was a clear day.
#       It was sunny.  It was sunny and bright out.

# Q.    Do you remember what the weather was like around the time of the accident?
# A.    Lots of snow.


# Q.    How long did it take the police to arrive?
# A.    It took a while.
# Q.    Did you have any more discussions with the driver of the other car after that?
# A.    I don't remember.
# Q.    How long did you have to wait for the police to arrive?
# A.    They showed up quickly.

In [None]:
# run main.py (not part of this notebook) with one deposition name, given without
# file extension, as its only argument
!python main.py "Dixon v. Chen 03-12-20 Chun-Ming Chen"

2021-08-05 15:38:25.974066: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B

In [None]:
# And my understanding from looking at the Christiana Care emergency room records were that you complained to
# them of neck and right shoulder pain.  Is that correct?
# A.    No, my feet hurt.


# What part of your body hurt the most?
# A.    My elbows and knees.

In [None]:
# run main.py (not part of this notebook) with one deposition name, given without
# file extension, as its only argument
!python main.py "Dixon v. Chen 4-28-20 Jacqueline Dixon"

2021-08-05 15:58:43.538106: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B

In [None]:
# Q.  Do you see your blue truck in that photo?
# A.  As far as I can tell, that's my truck on the right.

# Q.  Okay.  Do you know who the person in the hard hat is next to that vehicle?
# A.  No.  I don't recall who it is.

# Q.  Is that your police vehicle on the right there?
# A.  No, that truck belongs to my supervisor.

In [None]:
# run main.py (not part of this notebook) with one deposition name, given without
# file extension, as its only argument
!python main.py "Kelley v. DeLucca Fence Company - 3-13-20 - Robert Peabody - FINAL.d20200323-u204511"

2021-08-05 16:30:45.426831: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a B