#### CS109 Project - Aidi Adnan Brian John

In [1]:
import string
import re
import numpy as np
import pandas as pd
import operator
import os
import sys

In [2]:
#reads in the transcript text file; replace "wut.txt" with the path of the relevant txt
#(only one text file is processed currently)
#the with-block closes the handle as soon as the contents are read, even if reading fails;
#text_file stays bound afterwards, now referring to a closed file
with open("wut.txt", "r") as text_file:
    text = text_file.read()

In [3]:
def get_petitioners_and_respondents(text):
    '''
    This function takes in input text file as string and outputs 3 lists of speaker names:
    speakers for the petitioner side, speakers for the respondent side, and speakers whose
    blurb says they support neither side.

    Only the portion of the transcript after the 'APPEARANCES:' header and before the next
    'C O N T E N T S' header is examined. If 'APPEARANCES:' is missing, three empty lists are
    returned. If 'C O N T E N T S' is missing, everything after 'APPEARANCES:' is examined.
    Speakers whose blurb matches none of the keywords are left out of all three lists.
    '''
    appearances_header = 'APPEARANCES:'
    header_idx = text.find(appearances_header)
    if header_idx == -1:
        #no appearances section, so there are no speaker blurbs to classify
        return [], [], []
    start = header_idx + len(appearances_header)
    end = text.find('C O N T E N T S', start)
    if end == -1:
        #a -1 slice bound would silently drop the last character, so run to the end instead
        end = len(text)
    speakers_text = text[start:end]
    #a speaker blurb is taken to end at a period that closes a line
    split_speakers_text = re.split(r'\.[ ]*\n', speakers_text)

    #keywords are matched as substrings with the first letter left off
    #(Petition, Appellant, Plaintiff, ... / Respondent, Appellee, Defendant)
    pet_keywords = ['etition', 'ppellant', 'emand', 'evers', 'laintiff']
    res_keywords = ['espond', 'ppellee', 'efendant']

    pet_speakers, res_speakers, other_speakers = [], [], []
    for speaker in split_speakers_text:
        #the name is whatever precedes the first comma of the blurb
        name = speaker.strip().split(',')[0]
        #skip anything before the first uppercase character (which will be start of speaker name);
        #if there is no uppercase character the name is kept whole
        name_start = 0
        for idx, char in enumerate(name):
            if char.isupper():
                name_start = idx
                break
        name = name[name_start:]

        #petitioner keywords take priority over respondent keywords, which take priority over 'neither'
        if any(x in speaker for x in pet_keywords):
            pet_speakers.append(name)
        elif any(x in speaker for x in res_keywords):
            res_speakers.append(name)
        elif 'neither' in speaker:
            other_speakers.append(name)
    return pet_speakers, res_speakers, other_speakers

In [9]:
# For example, for wut.txt, there's 1 petitioner and 2 respondents
pet_speakers, res_speakers, other_speakers = get_petitioners_and_respondents(text)
# bare tuple expression so the notebook echoes the three lists
pet_speakers, res_speakers, other_speakers 

(['MR. H. BARTOW FARR'],
 ['MR. ROY L. REARDON', 'MS. BARBARA D. UNDERWOOD'],
 [])

In [5]:
def get_argument_portion(text):
    '''
    This function gets just the argument portion of the text: everything from the first
    'P R O C E E D' marker up to (but not including) the last 'Whereupon' that follows it.

    If the 'P R O C E E D' marker is missing, the portion starts at the beginning of the text.
    If no 'Whereupon' follows the start, the portion runs to the end of the text.
    '''
    start = text.find('P R O C E E D')
    if start == -1:
        #a -1 start would slice from the last character only
        start = 0
    end = text.rfind('Whereupon', start)
    if end == -1:
        #a -1 end would silently drop the final character
        end = len(text)
    return text[start:end]

In [6]:
# extract the argument portion of the transcript and preview its first 500 characters
argument_portion = get_argument_portion(text)
argument_portion[:500]

"P R O C E E D I N G S\n\n2\n\n[10:13 a.m.]\n\n3\n4\n\nCHIEF JUSTICE REHNQUIST:\n\nWe'll hear argument on\n\nNumber 00-24, PGA Tour, Inc. vs. Casey Martin.\n\n5\n\nORAL ARGUMENT OF H. BARTOW FARR, III\n\n6\n\nON BEHALF OF THE PETITIONER\n\n7\n\nMR. FARR:\n\nMr. Farr?\n\nMr. Chief Justice and may it please\n\n8\n\nthe Court:\n\nThe Ninth Circuit in our view made two\n\n9\n\ncritical mistakes in applying the Disabilities Act to this\n\n10\n\ntype of claim by a professional athlete. First it failed\n\n11\n\nto recognize that Title 3 of the act, "

In [7]:
def count_words(s):
    '''
    This function returns how many whitespace-separated tokens of string s are words,
    i.e. every token except the standalone dashes '-' and '--'.
    '''
    dash_tokens = ('-', '--')
    total = 0
    for token in s.split():
        if token not in dash_tokens:
            total += 1
    return total

In [47]:
def modify_speaker_names(speakers):
    '''
    This function modifies speaker names like 'QUESTION' to 'QUESTION: ', for word count parsing later on.

    It returns a new list, in the same order as the input, with ': ' appended to every name.
    A real list is returned (rather than a map object) so the result can be indexed, reused
    and tested for membership more than once.
    '''
    return [name + ': ' for name in speakers]

In [33]:
def clean_text(text):
    '''
    This function takes in the portions of text, and gets rid of the \\n and the line numbers.

    The text is split into lines; empty lines and lines that parse as an integer are dropped,
    and the remaining lines are joined with single spaces. Text without any empty line
    (including the empty string) is handled too.
    '''
    text_clean = []
    for line in text.splitlines():
        if line == '':
            continue
        try:
            int(line)
        except ValueError: #assumption: if the item only has integers, it is a line number.
            text_clean.append(line)
    return ' '.join(text_clean)

In [34]:
# strip blank lines and line numbers from the argument portion, then preview the first 500 characters
clean_argument=clean_text(argument_portion)
clean_argument[:500]

"P R O C E E D I N G S [10:13 a.m.] CHIEF JUSTICE REHNQUIST: We'll hear argument on Number 00-24, PGA Tour, Inc. vs. Casey Martin. ORAL ARGUMENT OF H. BARTOW FARR, III ON BEHALF OF THE PETITIONER MR. FARR: Mr. Farr? Mr. Chief Justice and may it please the Court: The Ninth Circuit in our view made two critical mistakes in applying the Disabilities Act to this type of claim by a professional athlete. First it failed to recognize that Title 3 of the act, the public accommodations provision, apply on"

In [72]:
def total_wordcount(text, speakers=None):
    '''
    POSSIBLE FEATURE 1:
    This function returns a dictionary with key: name of speaker/justice and value: total number of words they
    spoke in total throughout argument.

    text is the full transcript as a string. speakers is an optional iterable of speaker labels exactly
    as they appear in the argument, including the trailing ': ' (for example 'MR. FARR: ' or 'QUESTION: ').
    Labels that are not listed (such as 'QUESTION: ' when only named counsel are passed) are not treated
    as speaker changes. When speakers is omitted, the labels of the wut.txt example transcript are used.

    The returned dictionary has one key per label plus 'N/A', which collects the words that come before
    the first recognised label.
    '''
    if speakers is None:
        #TODO: derive these from the transcript instead of falling back to the example's labels
        speakers = ['MR. FARR: ', 'QUESTION: ', 'MR. REARDON: ', 'CHIEF JUSTICE REHNQUIST: ']
    #materialise the iterable once and drop empty labels, which would make the regex split everywhere
    all_speakers = [label for label in speakers if label]

    clean_argument = clean_text(get_argument_portion(text))

    #keeps track of current speaker
    current_speaker = 'N/A'
    #num_words is a dictionary that maps all speaker names to number of words they spoke
    num_words = dict.fromkeys(all_speakers + [current_speaker], 0)

    if all_speakers:
        #labels are escaped so that '.' only matches a literal period; the capturing group
        #makes re.split keep the labels in the result
        pattern = '(' + '|'.join(re.escape(label) for label in all_speakers) + ')'
        split_argument = re.split(pattern, clean_argument)
    else:
        split_argument = [clean_argument]

    #iterate through split argument, accumulating word counts for all speakers
    for chunk in split_argument:
        if chunk in all_speakers:
            #split chunk signifies change in speaker
            current_speaker = chunk
        else:
            #split chunk is part of speech of current speaker
            num_words[current_speaker] += count_words(chunk)

    return num_words

In [73]:
#for example, this gives us total number of words uttered by each speaker
#we just need to find list of all speakers in the form they're referred to in the argument, "JUSTICE SCALIA: " for ex.
total_wordcount(text)

{'CHIEF JUSTICE REHNQUIST: ': 24,
 'MR. FARR: ': 3433,
 'MR. REARDON: ': 1480,
 'N/A': 13,
 'QUESTION: ': 5170}

In [20]:
# close the transcript file handle (closing an already-closed file is a no-op)
text_file.close()