In [7]:
import pandas as pd
from collections import defaultdict

### Check directory structure

In [1]:
ls

README.md               congresspeople_id.txt  hein_daily_qa.txt
[0m[01;32mcnt_speaker_id.sh[0m*      ethicity_aapia.txt     nltk.ipynb
cnt_speaker_id.txt      ethicity_black.txt     speakermap_qa.ipynb
collecting_demo.ipynb   ethicity_hispanic.txt  [01;32mspeakermap_qa.py[0m*
congresspeople_age.txt  [01;32mhein_daily_qa.sh[0m*      speakermap_qa.txt


In [3]:
ls ../sample/hein-daily

097_SpeakerMap.txt     byspeaker_2gram_097.txt  speeches_097.txt
byparty_2gram_097.txt  descr_097.txt


In [30]:
!head -3 ../sample/hein-daily/097_SpeakerMap.txt

speakerid|speech_id|lastname|firstname|chamber|state|gender|party|district|nonvoting
97105301|970000005|PERCY|CHARLES|S|IL|M|R||voting
97105071|970000006|HUDDLESTON|WALTER|S|KY|M|D||voting


In [33]:
filepath = "../sample/hein-daily/097_SpeakerMap.txt"

### Check the structure of a sample SpeakerMap file

In [11]:
with open(filepath) as f:
    cnt = 0
    speakermap = []
    for line in f:
        line = line.strip().split("|")
        if cnt == 0:
            colnames = line
        else:
            speakermap.append(line)
        cnt += 1
#         if cnt > 10:
#             break

In [12]:
speakermap = pd.DataFrame(speakermap, columns=colnames)
speakermap.shape

(202054, 10)

In [13]:
speakermap['fullname'] = speakermap.lastname.str.cat(speakermap.firstname, sep = ', ')

In [14]:
speakermap.head()

Unnamed: 0,speakerid,speech_id,lastname,firstname,chamber,state,gender,party,district,nonvoting,fullname
0,97105301,970000005,PERCY,CHARLES,S,IL,M,R,,voting,"PERCY, CHARLES"
1,97105071,970000006,HUDDLESTON,WALTER,S,KY,M,D,,voting,"HUDDLESTON, WALTER"
2,97106761,970000007,JACKSON,HENRY,S,WA,M,D,,voting,"JACKSON, HENRY"
3,97106981,970000008,STAFFORD,ROBERT,S,VT,M,R,,voting,"STAFFORD, ROBERT"
4,97104811,970000010,BAKER,HOWARD,S,TN,M,R,,voting,"BAKER, HOWARD"


### Checking if there are multiple congresspeople with the same name

In [15]:
speakermap['speakerid'].nunique()

545

In [16]:
speakermap['fullname'].nunique()

541

In [28]:
uniques = speakermap[['fullname','speakerid','chamber','state']]\
                .groupby(['speakerid'])\
                .first()\
                .reset_index()
uniques.shape

(545, 4)

In [29]:
uniques[uniques.fullname.duplicated(keep=False)].sort_values(by='fullname')

Unnamed: 0,speakerid,fullname,chamber,state
284,97108800,"BROWN, GEORGE",H,CO
442,97112310,"BROWN, GEORGE",H,CA
118,97106650,"EDWARDS, WILLIAM",H,AL
368,97110170,"EDWARDS, WILLIAM",H,CA
54,97105980,"EVANS, THOMAS",H,DE
166,97107200,"EVANS, THOMAS",H,IA
61,97106051,"GOLDWATER, BARRY",S,AZ
62,97106060,"GOLDWATER, BARRY",H,CA


In [49]:
with open(filepath) as f:
    speakermap = defaultdict(list)
    for line in f:
        line = line.strip().split("|")
        if line[0] != 'speakerid':
            fullname = line[2]+', '+line[3]
            payload = line[5]
            full_key = fullname+"\t"+payload
            if len(line[0])==8:
                congress = line[0][:2]
            else:
                congress = line[0][:3]
            if len(speakermap[full_key]):
                if congress not in speakermap[full_key][0]:
                    speakermap[full_key][0].append(congress)
                    speakermap[full_key][1].append(line[4:8])
            else:
                speakermap[full_key] = [[congress]]
                speakermap[full_key].append([line[4:8]])

In [50]:
len(speakermap.keys())

545

In [51]:
list(speakermap.keys())[:10]

['PEYSER, PETER\tNY',
 'ATKINSON, EUGENE\tPA',
 'DECONCINI, DENNIS\tAZ',
 'LOEFFLER, THOMAS\tTX',
 'SPENCE, FLOYD\tSC',
 'ROGERS, HAROLD\tKY',
 'ECKART, DENNIS\tOH',
 'MAVROULES, NICHOLAS\tMA',
 'STOKES, LOUIS\tOH',
 'PANETTA, LEON\tCA']

### Getting full map of congresspeople

In [15]:
#!pip install -U gspread oauth2client df2gspread

In [1]:
!pwd

/tf/notebooks/data/QA


In [5]:
ls

README.md               congresspeople_id.txt  hein_daily_qa.txt
[0m[01;34m__pycache__[0m/            ethicity_aapia.txt     nltk.ipynb
[01;32mcnt_speaker_id.sh[0m*      ethicity_black.txt     speakermap_qa.ipynb
cnt_speaker_id.txt      ethicity_hispanic.txt  [01;32mspeakermap_qa.py[0m*
collecting_demo.ipynb   [01;36mgs_connect.py[0m@         speakermap_qa.txt
congresspeople_age.txt  [01;32mhein_daily_qa.sh[0m*


### Reading in the demo information

In [1]:
# make sure to create a symbolic link to this file with this command
# from current directory
# ln -s ../../environment/gs_connect.py gs_connect.py
from gs_connect import get_from_gs

In [2]:
demo_df = get_from_gs("Congresspeople")

In [3]:
demo_df.shape

(1800, 14)

In [4]:
demo_df.head()

Unnamed: 0,Full Name,Last Name,First Name,First Last,List of Congresses,Chamber,State,Gender,Party,Ethnicity,BirthYear,Congress Name,Alternative name (Wikipedia),Congress ID
0,"ABDNOR, JAMES",ABDNOR,JAMES,JAMES ABDNOR,97 98 99,S,SD,M,R,ME,1923,"ABDNOR, JAMES",JAMES ABDNOR,A000009
1,"ABERCROMBIE, NEIL",ABERCROMBIE,NEIL,NEIL ABERCROMBIE,99 102 103 104 105 106 107 108 109 110 111,H,HI,M,D,W,1938,"ABERCROMBIE, NEIL",NEIL ABERCROMBIE,A000014
2,"ABRAHAM, RALPH",ABRAHAM,RALPH,RALPH ABRAHAM,114,H,LA,M,R,W,1954,"ABRAHAM, RALPH LEE",RALPH ABRAHAM,A000374
3,"ABRAHAM, SPENCER",ABRAHAM,SPENCER,SPENCER ABRAHAM,104 105 106,S,MI,M,R,ME,1952,"ABRAHAM, SPENCER",SPENCER ABRAHAM,A000355
4,"ACEVEDO-VILA, ANIBAL",ACEVEDO-VILA,ANIBAL,ANIBAL ACEVEDO-VILA,107 108,H,PR,M,A,H,1962,"ACEVEDO-VILA, ANIBAL",ANÍBAL ACEVEDO VILÁ,A000359


In [58]:
demo_df['Full Name'].nunique()

1781

### Fixing names for father/son

In [5]:
errors = ['FORD, HAROLD','KENNEDY, JOSEPH','JONES, WALTER','HUNTER, DUNCAN',
         'PAYNE, DONALD','DUNCAN, JOHN','MACK, CONNIE','RHODES, JOHN','PERKINS, CARL']
fixes = ['FORD, HAROLD E., JR.','KENNEDY, JOSEPH P., III','JONES, WALTER B., JR.','HUNTER, DUNCAN D.',
         'PAYNE, DONALD M., JR.','DUNCAN, JOHN J., JR.','MACK, CONNIE, III','RHODES, JOHN J., III','PERKINS, CARL C.']

In [8]:
demo_df['BirthYear'] = pd.to_numeric(demo_df['BirthYear'])

In [9]:
demo_df.iloc[[(demo_df['Full Name']=='FORD, HAROLD') & (demo_df['BirthYear']==1970)],[demo_df.columns.get_loc('Full Name')]]='FORD, HAROLD E., JR.'
demo_df.iloc[[(demo_df['Full Name']=='KENNEDY, JOSEPH') & (demo_df['BirthYear']==1980)],[demo_df.columns.get_loc('Full Name')]]='KENNEDY, JOSEPH P., III'
demo_df.iloc[[(demo_df['Full Name']=='JONES, WALTER') & (demo_df['BirthYear']==1943)],[demo_df.columns.get_loc('Full Name')]]='JONES, WALTER B., JR.'
demo_df.iloc[[(demo_df['Full Name']=='HUNTER, DUNCAN') & (demo_df['BirthYear']==1976)],[demo_df.columns.get_loc('Full Name')]]='HUNTER, DUNCAN D.'
demo_df.iloc[[(demo_df['Full Name']=='PAYNE, DONALD') & (demo_df['BirthYear']==1958)],[demo_df.columns.get_loc('Full Name')]]='PAYNE, DONALD M., JR.'
demo_df.iloc[[(demo_df['Full Name']=='DUNCAN, JOHN') & (demo_df['BirthYear']==1947)],[demo_df.columns.get_loc('Full Name')]]='DUNCAN, JOHN J., JR.'
demo_df.iloc[[(demo_df['Full Name']=='MACK, CONNIE') & (demo_df['BirthYear']==1940)],[demo_df.columns.get_loc('Full Name')]]='MACK, CONNIE, III'
demo_df.iloc[[(demo_df['Full Name']=='RHODES, JOHN') & (demo_df['BirthYear']==1943)],[demo_df.columns.get_loc('Full Name')]]='RHODES, JOHN J., III'
demo_df.iloc[[(demo_df['Full Name']=='PERKINS, CARL') & (demo_df['BirthYear']==1954)],[demo_df.columns.get_loc('Full Name')]]='PERKINS, CARL C.'

  values[indexer] = value


In [10]:
demo_df['Full Name'].nunique()

1790

In [11]:
for e in errors:
    print(e, demo_df[demo_df['Full Name']==e]['Full Name'].values)

FORD, HAROLD ['FORD, HAROLD']
KENNEDY, JOSEPH ['KENNEDY, JOSEPH']
JONES, WALTER ['JONES, WALTER']
HUNTER, DUNCAN ['HUNTER, DUNCAN']
PAYNE, DONALD ['PAYNE, DONALD']
DUNCAN, JOHN ['DUNCAN, JOHN']
MACK, CONNIE ['MACK, CONNIE']
RHODES, JOHN ['RHODES, JOHN']
PERKINS, CARL ['PERKINS, CARL']


In [12]:
for f in fixes:
    print(f, demo_df[demo_df['Full Name']==f]['Full Name'].values)

FORD, HAROLD E., JR. ['FORD, HAROLD E., JR.']
KENNEDY, JOSEPH P., III ['KENNEDY, JOSEPH P., III']
JONES, WALTER B., JR. ['JONES, WALTER B., JR.']
HUNTER, DUNCAN D. ['HUNTER, DUNCAN D.']
PAYNE, DONALD M., JR. ['PAYNE, DONALD M., JR.']
DUNCAN, JOHN J., JR. ['DUNCAN, JOHN J., JR.']
MACK, CONNIE, III ['MACK, CONNIE, III']
RHODES, JOHN J., III ['RHODES, JOHN J., III']
PERKINS, CARL C. ['PERKINS, CARL C.']


In [13]:
demo_df.to_csv('./congresspeople_demo.txt', sep='|', index=False)