In [None]:
import pandas as pd
import os
import re
import nltk
# Download the Punkt tokenizer data that nltk.word_tokenize needs.
nltk.download('punkt')
from google.colab import drive
# Mount Google Drive so the files under /content/gdrive can be read and written.
drive.mount('/content/gdrive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/gdrive


## Load Data

In [None]:
load_path = '/content/gdrive/MyDrive/RAADD/7. Data/1. Judges/judge_bios/'

# Read every .txt biography in load_path into a list, one string per file.
# Each file is opened in a `with` block so its handle is closed right after
# the read instead of being left open for the garbage collector.
# NOTE(review): rows follow os.listdir order, which is not guaranteed to be
# stable across file systems; sort the names if a fixed row order matters.
judge_text = []
for filename in os.listdir(load_path):
  if filename.split('.')[-1] == 'txt':
    with open(load_path + filename) as bio_file:
      judge_text.append(bio_file.read())

## Explore Data

In [None]:
# Preview the first biography: judge name, a blank line, then the narrative text.
judge_text[0]

'Michael A. Kilroy\n\nJudge Kilroy was appointed Immigration Judge in May 2001.  He received his undergraduate degree in 1970 from the University of Montana and his Juris Doctorate in 1973 from the University of Montana School of Law.  Judge Kilroy served as an Air Force Judge Advocate from August 1973 to January 2001 and retired as a Colonel.  His Air Force career included three assignments as a military criminal trial judge at Travis AFB, CA, from 1990 to 1991, and Yakota AB, Japan, from 1991 to 1993 and again from 1998 to 2000.  Judge Kilroy is a member of the Montana Bar.'

In [None]:
# Spot-check another biography to compare its layout with the first one.
judge_text[10]

'Gaylyn N. Boone\n\nJudge Boone was appointed Immigration Judge in August 1995.  She received her B.A. from Vassar College in 1965, and her J.D. from the University of San Francisco in 1979.  From 1994 to 1995, she served as acting district counsel for the U.S. Immigration and Naturalization Service (INS) in Arlington, Virginia.  From 1991 to 1994, Judge Boone worked as deputy district counsel for INS in Arlington, and from 1987 to 1991, she was assistant district counsel for INS, also in Arlington. From 1982 to 1987, she worked as an equal employment opportunity specialist for the Department of Justice in Washington, DC.  From 1966 to 1981, she was regional coordination specialist and program manager for the Office of Economic Opportunity/Community Service Administration  in San Francisco, California, and Washington, DC.  Judge Boone is a member of the Virginia Bar.'

In [None]:
# Spot-check a biography further into the list.
judge_text[100]

In [None]:
# Spot-check a biography further into the list.
judge_text[250]

In [None]:
# Spot-check a biography further into the list.
judge_text[400]

## Extract Features

In [None]:
# One row per biography, with the raw text in a single TEXT column.
judge_data = pd.DataFrame(data=judge_text, columns=['TEXT'])
judge_data

Unnamed: 0,TEXT
0,Michael A. Kilroy\n\nJudge Kilroy was appointe...
1,Steven R. Abrams\n\nJudge Abrams was appointed...
2,Charles Adkins-Blanch\n\nJudge Adkins-Blanch w...
3,Matthew T. Adrian\n\nJudge Adrian was appointe...
4,Anthony M. Atenaide\n\nJudge Atenaide was appo...
...,...
628,Hayden E. Windrow\n\nAttorney General William ...
629,Virna A. Wright\n\nAttorney General Eric Holde...
630,Elizabeth L. Young\n\nAttorney General Loretta...
631,Randa Zagzoug\n\nJudge Zagzoug received a bach...


**Feature Name:** NAME

**Feature Description:** Name of immigration judge

**Example:** `'Abraham Burgess'` (str)

In [None]:
# The name is everything that precedes the first blank line of the biography.
judge_data['NAME'] = judge_data['TEXT'].map(lambda bio: bio.partition('\n\n')[0])

**Feature Name:** APPT_YR

**Feature Description:** Appointment year

**Example:** `2016` (int)

In [None]:
def get_appt_date(text):
  """Extract the appointment date phrase (e.g. 'May 2001') from a biography.

  Args:
    text: Full biography; the judge's name is the part before the first
      blank line.

  Returns:
    A hard-coded date for the three judges listed in `overrides`; otherwise
    the text captured after the first ' in' up to and including the next
    four-digit number on the same line. None when the biography does not
    contain 'appoint' or when no such phrase is found.
  """
  if 'appoint' not in text:
    return None
  # Dates entered by hand for biographies that are looked up by name
  # instead of going through the pattern below.
  overrides = {
    'Amanda Jeannopoulos': 'June 2020',
    'Robert B.C. McSeveney': 'November 2018',
    'Brian H. Simpson': 'March 1980',
  }
  name = text.split('\n\n')[0]
  if name in overrides:
    return overrides[name]
  # Raw string so that \d is a regex digit class and not an invalid string escape.
  match = re.search(r' in *(.*? *\d\d\d\d)', text)
  # No " in ... <year>" phrase: report a missing date instead of failing on
  # None.group(1).
  return match.group(1) if match else None

In [None]:
# Turn the extracted date phrases into timestamps.
judge_data['DATE'] = judge_data['TEXT'].map(get_appt_date).pipe(pd.to_datetime)

In [None]:
# Appointment year stored as a string; rows without a parsed date get None.
judge_data['APPT_YR'] = judge_data['DATE'].map(lambda appt: str(appt.year) if pd.notna(appt) else None)

**Feature Name:** ATTN_GEN

**Feature Description:** Appointing attorney general

**Example:** `'Loretta Lynch'` (str)

*List of attorneys general extracted from [Wikipedia](https://en.wikipedia.org/wiki/United_States_Attorney_General).*

In [None]:
# Load the table of attorneys general from attn_gen.csv, which sits in the
# same Drive folder (load_path) as the biographies.
attorneys_general = pd.read_csv(load_path + 'attn_gen.csv')
attorneys_general

Unnamed: 0,No.,Portrait,Name,Prior experience,State of residence,Took office,Left office,President(s)
0,1,,Edmund Randolph,"Lawyer, 7th Governor of Virginia",Virginia,"September 26, 1789","January 26, 1794",George Washington
1,2,,William Bradford,"Lawyer, judge, Attorney General of Pennsylvania",Pennsylvania,"January 27, 1794","August 23, 1795",George Washington
2,3,,Charles Lee,"Lawyer, Acting United States Secretary of State",Virginia,"December 10, 1795","February 19, 1801",George Washington
3,3,,Charles Lee,"Lawyer, Acting United States Secretary of State",Virginia,"December 10, 1795","February 19, 1801",John Adams
4,4,,Levi Lincoln Sr.,"Lawyer, Acting United States Secretary of Stat...",Massachusetts,"March 5, 1801","March 2, 1805",Thomas Jefferson
...,...,...,...,...,...,...,...,...
109,–,,Matthew Whitaker,,Iowa,"November 7, 2018","February 14, 2019",Donald Trump
110,85,,William Barr,77th United States Attorney General (1991–1993...,Virginia,"February 14, 2019","December 23, 2020",Donald Trump
111,–,,Jeffrey A. Rosen,,Massachusetts,"December 24, 2020","January 20, 2021",Donald Trump
112,–,,Monty Wilkinson,,"Washington, D.C.","January 20, 2021","March 11, 2021",Joe Biden


In [None]:
def attn_gen(date):
  """Return the name of the attorney general in office on `date`.

  Reads the module-level `attorneys_general` table.

  Args:
    date: Appointment date comparable with pandas timestamps, or a missing
      value (None / NaT).

  Returns:
    The `Name` of the first row whose term covers `date` (took office on or
    before it and left office after it), or None when `date` is missing or
    no term covers it.
  """
  if pd.isna(date):
    return None
  took_office = pd.to_datetime(attorneys_general['Took office'])
  # The last row's 'Left office' is deliberately not parsed: that term is
  # treated as still open (see the True appended below).
  left_office = pd.to_datetime(attorneys_general['Left office'].iloc[:-1])
  # Inclusive start: with two strict bounds a date equal to a took-office day
  # matched neither the outgoing nor the incoming attorney general.
  prior = took_office <= date
  # pd.concat replaces Series.append, which was removed in pandas 2.0.
  post = pd.concat([left_office > date, pd.Series([True])], ignore_index=True)
  in_office = attorneys_general[prior & post]
  # No covering term: return None instead of raising IndexError on .iloc[0].
  return in_office.Name.iloc[0] if len(in_office) else None

In [None]:
# Look up the attorney general in office on each judge's appointment date.
judge_data['ATTN_GEN'] = judge_data['DATE'].map(attn_gen)

**Feature Name:** REPUBLICAN

**Feature Description:** Whether the administration that appointed them is Republican

**Example:** `0` (int)

In [None]:
# Presidents counted as Republican. democrat() counts every other president
# as Democrat, exactly as the original membership test did.
_REPUBLICAN_PRESIDENTS = ('Ronald Reagan', 'George H. W. Bush', 'George W. Bush', 'Donald Trump')

def _appointing_president(attn_gen, date=None):
  """Return the president an attorney general served under, or None if unresolved.

  A name can appear on several `attorneys_general` rows (separate terms, or
  one row per president served). When `date` is given and falls inside one of
  those terms, that row decides. Otherwise the longest term is used, so a
  short stint under another administration does not decide the result.
  """
  if pd.isna(attn_gen) or not attn_gen:
    return None
  terms = attorneys_general[attorneys_general.Name == attn_gen]
  if terms.empty:
    return None
  took_office = pd.to_datetime(terms['Took office'], errors='coerce')
  # An unparseable 'Left office' is taken to mean the term is still running.
  left_office = pd.to_datetime(terms['Left office'], errors='coerce').fillna(pd.Timestamp.now())
  if date is not None and pd.notna(date):
    covering = terms[(took_office <= date) & (date < left_office)]
    if not covering.empty:
      return covering['President(s)'].iloc[0]
  # mergesort is stable, so equally long terms keep their table order.
  ranked = terms.assign(_term_length=left_office - took_office).sort_values(
      '_term_length', ascending=False, kind='mergesort')
  return ranked['President(s)'].iloc[0]

def republican(attn_gen, date=None):
  """Return 1 if the appointing attorney general served a Republican president, else 0.

  Args:
    attn_gen: Attorney general name as stored in `attorneys_general.Name`,
      or a missing value.
    date: Optional appointment date. Pass it to pick the right term for an
      attorney general who served under more than one president, e.g.
      `judge_data.apply(lambda r: republican(r.ATTN_GEN, r.DATE), axis=1)`.

  Returns:
    1 or 0 (int); 0 when the name is missing or not in the table.
  """
  president = _appointing_president(attn_gen, date)
  if president is None:
    return 0
  return int(president in _REPUBLICAN_PRESIDENTS)

def democrat(attn_gen, date=None):
  """Return 1 if the appointing attorney general served a non-Republican president, else 0.

  Takes the same arguments as `republican`. Returns 0 when the name is missing
  or not in the table, so unknown appointers are flagged as neither party.
  """
  president = _appointing_president(attn_gen, date)
  if president is None:
    return 0
  return int(president not in _REPUBLICAN_PRESIDENTS)

In [None]:
# Derive both party flags from the appointing attorney general's name.
for party_col, party_flag in (('REPUBLICAN', republican), ('DEMOCRAT', democrat)):
  judge_data[party_col] = judge_data['ATTN_GEN'].map(party_flag)

**Feature Name:** T14_SCHOOL

**Feature Description:** Whether they attended a T14 law school

**Example:** `1` (int)

(Aayush)

In [None]:
# Lower-case substrings that identify a T14 law school in a biography.
# NOTE(review): the conventional T14 also includes Chicago, Penn and Duke,
# which are not listed here - confirm whether that is intentional.
t14_list = ['yale', 'harvard', 'boalt', 'berkeley', 'stanford', 'columbia university school of law', 'columbia law', 'university of virginia', 
            'new york university', 'university of michigan', 'northwestern', 'cornell', 'georgetown']

def _jd_clause(text):
  """Return the lower-cased clause naming the J.D. school, or None.

  Uses the last '.'-delimited sentence that mentions 'juris': the text after
  'juris', cut at the first standalone word 'and' so that a second degree
  named later in the sentence is ignored.
  """
  clause = None
  for sent in text.split('.'):
    lowered = sent.lower()
    if 'juris' in lowered:
      # Split on the whole word only; a plain substring split also cut inside
      # words such as 'standing' and dropped the school that followed.
      clause = re.split(r'\band\b', lowered.split('juris')[1])[0]
  return clause

def t14(text):
  """Return 1 if the J.D. clause of the biography names a school in `t14_list`, else 0.

  Returns 0 as well when no sentence mentions 'juris'.
  """
  clause = _jd_clause(text)
  if clause:
    return int(any(school in clause for school in t14_list))
  return 0

def not_t14(text):
  """Return 1 if a J.D. clause exists and names no school in `t14_list`, else 0.

  Returns 0 when no sentence mentions 'juris', so biographies without a
  detectable J.D. are flagged by neither `t14` nor `not_t14`.
  """
  clause = _jd_clause(text)
  if clause:
    return int(all(school not in clause for school in t14_list))
  return 0

In [None]:
# Two complementary flags; both stay 0 when no J.D. sentence is found.
judge_data['T14_SCHOOL'] = judge_data['TEXT'].map(t14)
judge_data['NOT_T14_SCHOOL'] = judge_data['TEXT'].map(not_t14)

**Feature Name:** JD_YR

**Feature Description:** JD graduation year

**Example:** `1994` (int)

(Aayush)

In [None]:
def JD_yr(text):
  """Return the first four-digit year that follows 'Juris' or 'juris' in a biography.

  Args:
    text: Full biography text.

  Returns:
    The year as a string (e.g. '1994'), or None when neither spelling is
    followed on the same line by a space and a four-digit number.
  """
  for juris in ("Juris", "juris"):
    # Raw string so that \d is a regex digit class and not an invalid string escape.
    match = re.search(juris + r'.*? (\d\d\d\d)', text)
    # A spelling that is present but has no year after it no longer raises
    # AttributeError; the other spelling is tried before giving up.
    if match:
      return match.group(1)
  return None

In [None]:
# J.D. year as a string; None when no year follows a 'Juris'/'juris' mention.
judge_data['JD_YR'] = judge_data['TEXT'].map(JD_yr)

**Feature Name:** XX_BAR

**Feature Description:** One binary column per state or territory (plus `AM_BAR` and `FD_BAR` for "American" and "Federal" bar mentions), set to 1 when the judge's biography names that bar

**Example:** `1` (int)

(Aayush)

In [None]:
# Organisation names searched for in the biographies. `bars` and `bar_cols`
# are parallel lists: bar_cols[i] is the output column for bars[i].
bars = ["Alabama", "Alaska", "American Samoa", "Arizona", "Arkansas", "California", "Colorado", 
        "Commonwealth of the Northern Mariana Islands (CNMI)", "Connecticut", 
        "Delaware", "District of Columbia", "Florida", "Georgia", "Guam", 
        "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", 
        "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
        "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
        "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", 
        "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
        "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virgin Islands", 
        "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico", "American", "Federal"]
bar_cols = ["AL_BAR", "AK_BAR", "AS_BAR", "AZ_BAR", "AR_BAR", "CA_BAR", "CO_BAR", "CNMI_BAR", "CT_BAR", 
        "DE_BAR", "DC_BAR", "FL_BAR", "GA_BAR", "GU_BAR", "HI_BAR", "ID_BAR", "IL_BAR", "IN_BAR", "IA_BAR", 
        "KS_BAR", "KY_BAR", "LA_BAR", "ME_BAR", "MD_BAR", "MA_BAR", "MI_BAR", "MN_BAR", "MS_BAR", "MO_BAR", 
        "MT_BAR", "NE_BAR", "NV_BAR", "NH_BAR", "NJ_BAR", "NM_BAR", "NY_BAR", "NC_BAR", "ND_BAR", "OH_BAR", 
        "OK_BAR", "OR_BAR", "PA_BAR", "RI_BAR", "SC_BAR", "SD_BAR", "TN_BAR", "TX_BAR", "UT_BAR", "VT_BAR", 
        "VI_BAR", "VA_BAR", "WA_BAR", "WV_BAR", "WI_BAR", "WY_BAR", "PR_BAR", "AM_BAR", "FD_BAR"]
# Add only the bar columns that are not there yet (all zeros). Re-running
# this cell then neither raises on overlapping column names nor resets flags
# that were already set.
missing_bar_cols = [col for col in bar_cols if col not in judge_data.columns]
judge_data = judge_data.join(judge_data.reindex(columns=missing_bar_cols, fill_value=0))

In [None]:
def _bars_named_in(sentence):
  """Return the `bar_cols` entries for every organisation in `bars` named in `sentence`.

  Longer names are matched first and removed from a working copy, so
  'West Virginia' does not also count as 'Virginia', nor 'American Samoa' as
  'American'.
  """
  remaining = sentence
  named = []
  for org in sorted(bars, key=len, reverse=True):
    if org in remaining:
      named.append(bar_cols[bars.index(org)])
      remaining = remaining.replace(org, ' ')
  return named

def statebar(text):
  """Set the bar-membership flags in `judge_data` for the biography `text`.

  Relies on the module-level `judge_data`, `bars` and `bar_cols`. Every
  '.'-delimited sentence that contains the word 'bar'/'bars' (any case) and
  does not mention 'juris' is scanned for organisation names; the matching
  column is set to 1 on every row whose TEXT equals `text`.

  Args:
    text: Full biography text, as stored in `judge_data['TEXT']`.

  Returns:
    None; the function is called for its side effect on `judge_data`.
  """
  rows = judge_data.index[judge_data['TEXT'] == text]
  for sent in text.split("."):
    # Whole-word match: a plain 'bar' substring test also fired on names such
    # as 'Barr' or 'Barbara'.
    mentions_bar = re.search(r'\bbars?\b', sent, flags=re.IGNORECASE)
    if not mentions_bar or 'juris' in sent.lower():
      continue
    for col in _bars_named_in(sent):
      # .loc writes into judge_data itself; the chained form
      # judge_data[col][rows] = 1 can write to a temporary copy instead.
      judge_data.loc[rows, col] = 1

# statebar is run once per biography purely for its side effect on judge_data.
for bio_text in judge_data['TEXT']:
  statebar(bio_text)
judge_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,TEXT,NAME,DATE,APPT_YR,ATTN_GEN,REPUBLICAN,DEMOCRAT,T14_SCHOOL,NOT_T14_SCHOOL,JD_YR,AL_BAR,AK_BAR,AS_BAR,AZ_BAR,AR_BAR,CA_BAR,CO_BAR,CNMI_BAR,CT_BAR,DE_BAR,DC_BAR,FL_BAR,GA_BAR,GU_BAR,HI_BAR,ID_BAR,IL_BAR,IN_BAR,IA_BAR,KS_BAR,KY_BAR,LA_BAR,ME_BAR,MD_BAR,MA_BAR,MI_BAR,MN_BAR,MS_BAR,MO_BAR,MT_BAR,NE_BAR,NV_BAR,NH_BAR,NJ_BAR,NM_BAR,NY_BAR,NC_BAR,ND_BAR,OH_BAR,OK_BAR,OR_BAR,PA_BAR,RI_BAR,SC_BAR,SD_BAR,TN_BAR,TX_BAR,UT_BAR,VT_BAR,VI_BAR,VA_BAR,WA_BAR,WV_BAR,WI_BAR,WY_BAR,PR_BAR,AM_BAR,FD_BAR
0,Michael A. Kilroy\n\nJudge Kilroy was appointe...,Michael A. Kilroy,2001-05-01,2001,John Ashcroft,1,0,0,1,1973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Steven R. Abrams\n\nJudge Abrams was appointed...,Steven R. Abrams,1997-09-01,1997,Janet Reno,0,1,0,1,1974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Charles Adkins-Blanch\n\nJudge Adkins-Blanch w...,Charles Adkins-Blanch,2004-06-01,2004,John Ashcroft,1,0,0,1,1990,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,Matthew T. Adrian\n\nJudge Adrian was appointe...,Matthew T. Adrian,1996-03-01,1996,Janet Reno,0,1,0,0,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Anthony M. Atenaide\n\nJudge Atenaide was appo...,Anthony M. Atenaide,1995-03-01,1995,Janet Reno,0,1,0,1,1975,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,Hayden E. Windrow\n\nAttorney General William ...,Hayden E. Windrow,2020-12-01,2020,William Barr,1,0,1,0,2005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
629,Virna A. Wright\n\nAttorney General Eric Holde...,Virna A. Wright,2010-10-01,2010,Eric Holder,1,0,1,0,1993,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
630,Elizabeth L. Young\n\nAttorney General Loretta...,Elizabeth L. Young,2016-09-01,2016,Loretta Lynch,0,1,0,1,2004,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
631,Randa Zagzoug\n\nJudge Zagzoug received a bach...,Randa Zagzoug,NaT,,,0,0,0,1,1990,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Feature Name:** FEMALE

**Feature Description:** Whether their gender (inferred from pronouns) is female

**Example:** `1` (int)

In [None]:
def female(text):
  """Return 1 if the tokenized biography contains the pronoun 'she' or 'She', else 0."""
  words = nltk.word_tokenize(text)
  return int('she' in words or 'She' in words)

def male(text):
  """Return 1 if the tokenized biography contains the pronoun 'he' or 'He', else 0."""
  words = nltk.word_tokenize(text)
  return int('he' in words or 'He' in words)

In [None]:
# Pronoun-based gender flags; each is computed independently from the text.
for gender_col, gender_flag in (('FEMALE', female), ('MALE', male)):
  judge_data[gender_col] = judge_data['TEXT'].map(gender_flag)

**Feature Name:** MILITARY_EXP

**Feature Description:** Whether they were in the military

**Example:** `0` (int)

In [None]:
def military(text):
  """Return 1 if the biography mentions a military service, else 0.

  Single-word services are matched against the tokenized text; the two-word
  services are matched as substrings of the raw text.
  """
  words = nltk.word_tokenize(text)
  has_service_token = any(service in words for service in ('Army', 'Navy', 'Marine', 'military'))
  has_service_phrase = any(service in text for service in ('Air Force', 'Coast Guard'))
  return int(has_service_token or has_service_phrase)

In [None]:
# Flag biographies that mention a military service.
judge_data['MILITARY_EXP'] = judge_data['TEXT'].map(military)

**Feature Name:** TEACHING_EXP

**Feature Description:** Whether they taught at a university

**Example:** `1` (int)

In [None]:
# 1 when the biography contains the lower-case word 'professor', else 0.
judge_data['TEACHING_EXP'] = judge_data['TEXT'].str.contains('professor', regex=False).astype(int)

**Feature Name:** TEACHING_SCHOOL

**Feature Description:** Which university they taught at

**Example:** `Notre Dame` (str)

In [None]:
def teaching_school(row):
  """Return the universities named in the first sentence that mentions 'professor'.

  Args:
    row: Object with `TEACHING_EXP` (truthy when the judge taught) and
      `TEXT` (the full biography).

  Returns:
    The capitalised name runs containing 'University' or 'College', joined
    with ' / ' (an empty string when there are none). None when
    `TEACHING_EXP` is falsy or no sentence mentions 'professor'.
  """
  if not row.TEACHING_EXP:
    return None
  sentence = next((sent for sent in row.TEXT.split('.') if 'professor' in sent), None)
  if sentence is None:
    # The flag is set but no sentence matches: return None instead of raising IndexError.
    return None
  # The trailing space lets the pattern match a name that ends the sentence.
  # Raw string so that \w is a regex word class and not an invalid string escape.
  matches = re.findall(r'((?:[A-Z]\w+,* )(?:[A-Z]\w+,* |of )+)', sentence + ' ')
  return ' / '.join(m.rstrip(' ,') for m in matches if 'University' in m or 'College' in m)

In [None]:
# teaching_school reads only TEXT and TEACHING_EXP, so only those columns are passed row by row.
judge_data['TEACHING_SCHOOL'] = judge_data.loc[:, ['TEXT', 'TEACHING_EXP']].apply(teaching_school, axis='columns')

**Feature Name:** DHS_EXP

**Feature Description:** Whether they worked for DHS

**Example:** `0` (int)

(Aayush)

In [None]:
def dhs(text):
  """Return 1 if the biography mentions one of the listed agency names or acronyms, else 0.

  Acronyms and single words are matched against the tokenized text; full
  agency names are matched as substrings of the raw text.
  """
  words = nltk.word_tokenize(text)
  agency_tokens = ('Homeland', 'homeland', 'DHS', 'HomelandSecurity', 'ICE', 'INS', 'CBP', 'USCIS')
  agency_phrases = ('Homeland Security', 'Immigration and Customs Enforcement', 'Customs and Border Protection', 'Citizenship and Immigration Services', 'Immigration and Naturalization Service')
  if any(org in words for org in agency_tokens):
    return 1
  return int(any(org in text for org in agency_phrases))

In [None]:
# Flag biographies that mention one of the agencies checked by dhs().
judge_data['DHS_EXP'] = judge_data['TEXT'].map(dhs)

## Save CSV

In [None]:
# Prefix every column with JUDGE_. Columns that already carry the prefix are
# left alone, so re-running this cell does not produce JUDGE_JUDGE_ names.
judge_data.columns = [col_name if col_name.startswith('JUDGE_') else 'JUDGE_' + col_name for col_name in judge_data.columns]

In [None]:
# Final look at the feature table before it is written to disk.
judge_data

In [None]:
# Write the feature table to Drive without the row index.
# NOTE(review): this path is under '4. Data' while the biographies were read
# from '7. Data' - confirm the output folder is the intended one.
save_path = '/content/gdrive/MyDrive/RAADD/4. Data/judge_data/judge_data.csv'
judge_data.to_csv(save_path, index=False)