# Step 1: Create Dataframe
Anndi Russell: Capstone


The purpose of this program is to extract all of the speech of a specified Supreme Court justice from a directory of PDFs of oral argument transcripts. A number of smaller helper functions are defined first; the build_df function brings them all together. The output of the script is a pandas DataFrame saved to a .pkl file for use in the next notebook.

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import pdfplumber
import time

### Define smaller helper functions

In [12]:
def get_case_list(justice):
    '''Look up the docket numbers of every case the given justice heard.

    justice: last name of the justice in any capitalisation (e.g. 'Kagan').

    Returns a list of the 'docket' values from the justice-centered csv for
    the rows whose 'justiceName' equals that justice's code.
    Raises KeyError if the name is not one of the eight keys listed below.'''

    # Load the case information and keep only the two columns that are used.
    scdb = pd.read_csv('SCDB_2020_01_justiceCentered_Docket_updated.csv')
    scdb = scdb[['docket', 'justiceName']]

    # Map the lower-cased last name to the code stored in 'justiceName'.
    codes = {
        'sotomayor': 'SSotomayor',
        'gorsuch': 'NMGorsuch',
        'kavanaugh': 'BMKavanaugh',
        'kagan': 'EKagan',
        'thomas': 'CThomas',
        'breyer': 'SGBreyer',
        'alito': 'SAAlito',
        'roberts': 'JGRoberts',
    }
    wanted_code = codes[justice.lower()]

    # Keep the dockets of the rows that belong to the requested justice.
    dockets = scdb.loc[scdb.justiceName == wanted_code, 'docket']
    return list(dockets.values)

In [13]:
def text_extractor(path):
    '''Read a transcript pdf with pdfplumber and return its cleaned-up text.

    path: location of the pdf file.

    The text of every page is concatenated (pages with no extractable text
    contribute nothing), the reporting-company and "Subject to Final Review"
    boilerplate is deleted, runs of white space are collapsed to single
    spaces, and every digit character is dropped.
    Returns the result as one string.'''

    boilerplate = (
        "Heritage Reporting Corporation",      # two possible reporting corps as footer
        "Alderson Reporting Company",
        "Official - Subject to Final Review",  # header of every page
        "Official -Subject to Final Review",
    )

    with pdfplumber.open(path) as pdf:
        pieces = []
        for page in pdf.pages:
            page_text = page.extract_text()
            # extract_text() gives None for an empty page
            pieces.append(' ' + (page_text if page_text is not None else ''))
        alltext = ''.join(pieces)

    for phrase in boilerplate:
        alltext = alltext.replace(phrase, "")

    # collapse extra white space, then strip out the digits
    alltext = ' '.join(alltext.split())
    return ''.join(ch for ch in alltext if not ch.isdigit())

Function that will extract all file paths from directory:

In [14]:
def get_file_paths(directory="cases/"):
    '''Return the paths of all pdf files found under a directory tree.

    directory: root folder to search with os.walk; defaults to "cases/",
        the folder this notebook has always used.

    Returns a list of paths (root joined with file name) for every file
    whose name ends in 'pdf', in the order os.walk yields them.

    Only regular entries from os.walk's file list are returned: a
    sub-directory whose name happens to end in 'pdf' is not a transcript
    and could not be opened as one, so directories are never included.'''

    filelist = []
    for root, _subdirectories, files in os.walk(directory):
        for file in files:
            if file.endswith('pdf'):
                filelist.append(os.path.join(root, file))

    return filelist

In [15]:
def get_other_justices(justice):
    '''Return the speaker labels of every justice except the one specified.

    justice: name of the justice of interest, any capitalisation.

    The list covers the justices that have served since 2004 (the earliest
    date examined). It is needed so that speech from other justices can be
    removed from the text while speech by the justice of interest is kept.
    Every label that contains the upper-cased name is left out; the order of
    the remaining labels is unchanged.'''

    justice = justice.upper()
    alljustice = ['JUSTICE GINSBURG',
                  'CHIEF JUSTICE ROBERTS',
                  'JUSTICE KAVANAUGH',
                  'JUSTICE THOMAS',
                  'JUSTICE KAGAN',
                  'JUSTICE BREYER',
                  'JUSTICE ALITO',
                  'JUSTICE GORSUCH',
                  'JUSTICE SOTOMAYOR',
                  'JUSTICE STEVENS',
                  "JUSTICE O'CONNOR",
                  'JUSTICE SCALIA',
                  'JUSTICE KENNEDY',
                  'JUSTICE SOUTER',
                  'JUSTICE BARRETT',
                  'CHIEF JUSTICE REHNQUIST',
                  ]

    # Build a filtered copy rather than removing while iterating: removing
    # from the list being looped over skips the element after each removal.
    return [name for name in alljustice if justice not in name]

In [16]:
def get_text(speakindex_list,text,justice,otherspeakers):
    '''Extract only the words spoken by the justice of interest.

    speakindex_list: indices in text at which the justice starts speaking,
        in increasing order.
    text: the portion of the transcript being examined.
    justice: name of the justice (any capitalisation).
    otherspeakers: labels of every other possible speaker.

    Each turn runs from one start index to the next start index (or to the
    end of text for the last turn) and is cut short at the first occurrence
    of any other speaker's label. The turns are joined with spaces, the
    justice's own "NAME: " label is removed, and the result is lower-cased
    and stripped of punctuation and of the honorifics "mr" / "ms".
    Returns the cleaned text as one string ('' if the justice never speaks).'''

    segments = []
    # Pair every start index with the index where that turn must end.
    stops = list(speakindex_list[1:]) + [len(text)]
    for begin, stop in zip(speakindex_list, stops):
        interval = text[begin:stop]

        # Positions at which any other speaker takes over inside this turn.
        cuts = [pos for pos in (interval.find(s) for s in otherspeakers) if pos != -1]
        if cuts:
            interval = interval[:min(cuts)]  # stop where the first new speaker starts

        segments.append(interval)

    justice_text = ''.join(' ' + segment for segment in segments)

    # Literal replace: the name is plain text, not a regular expression.
    justice_text = justice_text.replace(justice.upper() + ': ', '')
    justice_text = justice_text.lower()
    justice_text = re.sub(r'[^\w\s]', '', justice_text)
    # Drop the honorifics only as whole words; a bare substring match would
    # also eat the letters out of words such as "items" or "terms".
    justice_text = re.sub(r'\b(?:mr|ms)\b', '', justice_text)

    return justice_text

In [17]:
def get_name(position,start,text):
    '''Return the name of the lawyer who starts speaking at a given point.

    position: 'p' for the petitioner's lawyer, 'r' for the respondent's.
    start: index in text where that lawyer's portion begins.
    text: the full transcript text.

    Looks at the 200 characters from start, removes the matching
    "ON BEHALF OF THE ..." phrase, and returns the last space-separated word
    before the first colon (the colon follows the speaker's name).
    If there is no colon the whole window is searched, and if there is no
    space the whole piece is returned, so no character is ever dropped.'''

    window = text[start:start+200]
    if position == 'p':
        window = window.replace('ON BEHALF OF THE PETITIONER', '')
    elif position == 'r':
        window = window.replace('ON BEHALF OF THE RESPONDENT', '')

    # partition/rpartition return the full string when the separator is
    # missing, unlike slicing with the -1 that str.find returns.
    before_colon = window.partition(':')[0]
    return before_colon.rpartition(' ')[2]

In [18]:
def get_case_num(filename):
    '''Return the case number encoded in a transcript file name.

    filename: path or bare name of the file. Files are named with their case
        number, optionally followed by '_' and some extra info, then '.pdf'.

    The directory part (everything up to the last '/') is ignored. The case
    number is the text before the first '_' if there is one, otherwise the
    text before '.pdf'. A name containing neither is returned unchanged.'''

    base = filename[filename.rfind('/')+1:]
    if '_' in base:
        return base.partition('_')[0]

    ext = base.find('.pdf')
    # find() gives -1 when '.pdf' is absent; slicing with it would cut off
    # the last character of the case number.
    return base if ext == -1 else base[:ext]

### Define main function that brings smaller helper functions together

In [19]:
def build_df(filelist,justice):
    '''Build the dataframe of everything one justice said in each case.

    filelist: paths of the transcript pdfs to process.
    justice: last name of the justice of interest (any capitalisation).

    Uses the helper functions defined above. For every file whose case number
    is in the justice's case list, the transcript is split into petitioner,
    respondent and rebuttal portions and the justice's speech is extracted
    from each. Rebuttal speech is appended to the petitioner speech.

    Returns a DataFrame with the columns case_num, pet_name, res_name,
    pet_text and res_text (text columns are plain strings).

    Cases are skipped (and listed in the printed summary) if the text cannot
    be unambiguously parsed: there must be exactly two "ON BEHALF OF THE
    PETITIONER" markers, at least one "ON BEHALF OF THE RESPONDENT" marker,
    and a closing "Whereupon" marker. Some cases follow a different format
    and are not relevant for this research.'''

    columns = ['case_num', 'pet_name', 'res_name', 'pet_text', 'res_text']
    rows = []  # one dict per parsed case; turned into a DataFrame at the end
    cases_counted = 0
    cases_skipped = 0
    skipped_list = []  # record the cases that were skipped

    case_list = set(get_case_list(justice))  # set: fast membership test per file
    justice = justice.upper()
    otherjustice = get_other_justices(justice)  # same for every case

    for filename in filelist:
        case_num = get_case_num(filename)
        if case_num not in case_list:  # the justice did not hear this case
            continue

        print(case_num)
        text = text_extractor(filename)  # get all text from that file

        # this one case has a typo in its closing marker
        end = 'Wherepon' if case_num == '07-512' else 'Whereupon'

        # petitioner markers include the rebuttal, so exactly two are expected
        petstart_all = [m.start() for m in re.finditer('ON BEHALF OF THE PETITIONER', text)]
        resstart_all = [m.start() for m in re.finditer('ON BEHALF OF THE RESPONDENT', text)]
        # last occurrence of the closing marker; cuts off extraneous info at end of pdf
        endindex = text.rfind(end)

        # If any piece is missing the text cannot be parsed as this code is
        # written, so the case is skipped instead of raising.
        if len(petstart_all) != 2 or not resstart_all or endindex == -1:
            cases_skipped += 1
            skipped_list.append(case_num)
            continue

        petstart, rebstart = petstart_all
        resstart = resstart_all[0]
        cases_counted += 1

        pet_name = get_name('p', petstart, text)  # name of petitioner lawyer
        res_name = get_name('r', resstart, text)  # name of respondent lawyer

        petitioner_text = text[petstart:resstart]  # all text in the petitioner portion
        respondent_text = text[resstart:rebstart]  # in the respondent portion
        rebuttal_text = text[rebstart:endindex]    # in the rebuttal portion

        # find all locations at which the justice of interest speaks
        justicespeaks_pet_index = [m.start() for m in re.finditer(justice, petitioner_text)]
        justicespeaks_res_index = [m.start() for m in re.finditer(justice, respondent_text)]
        justicespeaks_reb_index = [m.start() for m in re.finditer(justice, rebuttal_text)]

        # All possible speakers besides the justice of interest. An empty
        # lawyer name is left out: str.find('') is always 0, which would cut
        # every turn of the justice down to nothing.
        otherspeakers = otherjustice + [name for name in (pet_name, res_name) if name]

        justice_text_pet = get_text(justicespeaks_pet_index, petitioner_text, justice, otherspeakers)
        justice_text_res = get_text(justicespeaks_res_index, respondent_text, justice, otherspeakers)
        justice_text_reb = get_text(justicespeaks_reb_index, rebuttal_text, justice, otherspeakers)

        # petitioner and rebuttal portions are both spoken to the petitioner,
        # so they are combined for the analysis
        justice_text_pet = justice_text_pet + justice_text_reb
        rows.append({'case_num': case_num,
                     'pet_name': pet_name,
                     'res_name': res_name,
                     'pet_text': justice_text_pet,
                     'res_text': justice_text_res})

    # DataFrame.append no longer exists in pandas 2.x; building the frame once
    # from the collected rows also avoids copying it for every case.
    justice_text_df = pd.DataFrame(rows, columns=columns)

    print('num counted:',cases_counted)
    print('num skipped:',cases_skipped)
    print('list of skipped:', skipped_list)
    return justice_text_df


### Get file list and build the dataframe for each justice

For each justice, print the time it takes to build the DataFrame and save the result to a .pkl file.

##### Thomas

In [20]:
# Thomas: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_thomas = build_df(filelist,'thomas')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_thomas.head())
df_thomas.to_pickle("df_thomas.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1

17-1268
18-1584
18-8369
num counted: 806
num skipped: 301
list of skipped: ['15-927', '15-1262', '15-680', '15-5991', '15-777', '15-537', '15-628', '15-1039', '15-1503', '16-498', '16-6795', '17-130', '17-586', '16-961', '16-8255', '16-285', '16-299', '16-1161', '16-1140', '17-5716', '17-333', '16-1466', '17-43', '16-402', '16-1144', '16-476', '16-111', '16-712', '10-1293', '11-713', '10-1399', '11-246', '10-507', '10-553', '11-262', '09-1233', '09-1279', '10-6', '09-1498', '09-10245', '09-1227', '09-987', '09-737', '05-595', '05-380', '05-593', '05-7058', '05-848', '05-8820', '05-1382', '05-9222', '05-1256', '05-1240', '05-1448', '06-6407', '06-134', '06-593', '06-618', '06-340', '06-427', '06-376', '06-8120', '06-969', '06-413', '05-85', '05-11284', '05-1575', '05-1589', '05-1345', '05-1508', '06-102', '06-116', '05-608', '05-547', '05-746', '05-669', '05-493', '05-6551', '06-5247', '06-278', '06-484', '06-219', '05-1157', '06-313', '06-5306', '06-5754', '06-5618', '06-157', '05-1541

##### Sotomayor

In [21]:
# Sotomayor: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_sotomayor = build_df(filelist,'sotomayor')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_sotomayor.head())
df_sotomayor.to_pickle("df_sotomayor.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1

##### Breyer

In [22]:
# Breyer: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_breyer = build_df(filelist,'breyer')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_breyer.head())
df_breyer.to_pickle("df_breyer.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1

17-1268
18-1584
18-8369
num counted: 806
num skipped: 301
list of skipped: ['15-927', '15-1262', '15-680', '15-5991', '15-777', '15-537', '15-628', '15-1039', '15-1503', '16-498', '16-6795', '17-130', '17-586', '16-961', '16-8255', '16-285', '16-299', '16-1161', '16-1140', '17-5716', '17-333', '16-1466', '17-43', '16-402', '16-1144', '16-476', '16-111', '16-712', '10-1293', '11-713', '10-1399', '11-246', '10-507', '10-553', '11-262', '09-1233', '09-1279', '10-6', '09-1498', '09-10245', '09-1227', '09-987', '09-737', '05-595', '05-380', '05-593', '05-7058', '05-848', '05-8820', '05-1382', '05-9222', '05-1256', '05-1240', '05-1448', '06-6407', '06-134', '06-593', '06-618', '06-340', '06-427', '06-376', '06-8120', '06-969', '06-413', '05-85', '05-11284', '05-1575', '05-1589', '05-1345', '05-1508', '06-102', '06-116', '05-608', '05-547', '05-746', '05-669', '05-493', '05-6551', '06-5247', '06-278', '06-484', '06-219', '05-1157', '06-313', '06-5306', '06-5754', '06-5618', '06-157', '05-1541

##### Kavanaugh

In [23]:
# Kavanaugh: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_kavanaugh = build_df(filelist,'kavanaugh')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_kavanaugh.head())
df_kavanaugh.to_pickle("df_kavanaugh.pkl")

17-587
16-1363
17-1104
17-6086
17-647
17-340
17-7505
17-71
17-765
17-5554
18-481
18-6210
18-389
18-525
17-9560
18-459
18-485
18-431
18-966
18-302
18-457
18-489
17-778
18-422
18-281
18-15
18-315
18-726
17-1606
18-266
17-1705
17-9572
16-1094
17-1026
17-961
16-1498
17-8151
17-988
17-1042
17-1011
17-1272
17-773
16-1275
17-949
17-1184
17-1091
17-419
17-1077
17-1094
17-646
17-1174
17-1229
17-204
17-8995
17-1672
17-1657
17-1594
17-1702
17-1717
17-1471
17-1625
18-96
17-1484
17-290
17-532
17-571
17-1201
17-1299
17-1307
18-877
18-260
18-725
17-1678
18-556
18-938
18-587
18-565
18-1165
18-1171
19-431
18-9526
19-715
19-518
19-46
19-631
19-267
19-465
19-177
19-635
18-1334
17-834
18-6135
18-328
17-1618
18-801
18-5924
18-107
18-7739
18-776
18-1109
18-280
18-6943
18-1116
18-1023
18-1269
18-1150
17-1498
18-935
18-916
18-882
18-1233
18-1195
18-1086
18-1048
18-6662
17-1712
18-1059
19-161
18-1501
19-7
18-1323
18-1432
19-67
17-1268
18-1584
18-8369
num counted: 103
num skipped: 23
list of skipped: ['17-765',

##### Alito

In [24]:
# Alito: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_alito = build_df(filelist,'alito')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_alito.head())
df_alito.to_pickle("df_alito.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1

##### Kagan

In [25]:
# Kagan: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_kagan = build_df(filelist,'kagan')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_kagan.head())
df_kagan.to_pickle("df_kagan.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1

##### Gorsuch

In [26]:
# Gorsuch: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_gorsuch = build_df(filelist,'gorsuch')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_gorsuch.head())
df_gorsuch.to_pickle("df_gorsuch.pkl")

15-1191
15-423
15-1111
15-1358
15-1406
15-1256
15-1498
15-1500
15-1293
16-348
15-1262
15-1204
15-7250
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
16-32
15-1194
15-118
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
17-587
16-1363
17-1104
17-6086
17-647
17-340
17-7505
17-71
17-765
17-5554
18-481
18-6210
18-389
18-525
17-9560
18-459
18-485
18-431
18-966
18-302
18-457
18-489
17-778
18-422
18-281
18-15
18-315
18-726
17-1606
18-266
17-1705
17-9572
16-109

##### Roberts

In [27]:
# Roberts: collect the pdf paths, build the dataframe while timing the call
# with wall-clock time.time(), preview the first rows, and pickle the result.
filelist = get_file_paths()
t0 = time.time()
df_roberts = build_df(filelist,'roberts')
t1 = time.time()
total = t1-t0  # elapsed seconds for build_df only
print(f'Total time: {total}')
print(df_roberts.head())
df_roberts.to_pickle("df_roberts.pkl")

15-927
15-513
15-1191
15-423
15-866
15-497
15-1111
15-1251
14-1055
15-1358
15-1406
15-1256
15-827
15-1498
15-1500
15-1293
16-348
15-1391
15-8544
15-1262
15-797
15-649
15-680
14-1538
15-1204
15-7250
15-5991
14-9496
15-777
15-537
15-628
15-8049
15-606
16-5294
16-466
16-605
16-240
16-373
15-1039
16-6219
16-399
16-405
16-309
16-349
16-529
15-577
16-341
16-369
15-1189
15-1031
15-214
15-1503
15-457
16-254
16-327
16-74
16-142
16-149
16-54
15-1248
16-32
15-1194
15-118
15-9260
16-460
16-784
16-6855
16-498
16-6795
15-1509
16-1220
16-1215
16-1519
17-494
17-269
17-130
17-586
17-965
16-1011
17-459
17-530
17-5639
16-1371
16-1362
16-1027
16-1150
16-961
16-980
16-8255
16-424
15-1498
16-285
16-499
15-1485
16-299
16-658
15-1204
16-1161
17-312
17-155
17-432
16-1432
16-1140
17-387
17-5716
17-333
16-1348
16-1466
16-1495
17-21
16-1454
16-9493
16-1435
17-2
17-43
16-402
16-1144
16-476
16-534
15-1439
16-111
16-1276
16-969
16-1067
16-712
10-577
10-788
10-895
10-699
10-879
10-1104
10-444
10-1259
10-209
10-8974
1