In [20]:
# importing libraries
%matplotlib inline
import pandas as pd
import json
import numpy as np
import os

In [38]:
# Initial exploration: count the vote sessions held by each chamber in every
# year of one congress.
# Documentation on the votes https://github.com/unitedstates/congress/wiki/votes

content_list = []
years = []
yearVotes = {}
congress_no = "data/108/"
congress_votes = congress_no + "votes/"

# Record what the congress directory contains, ignoring the '.DS_Store' file.
for content in os.listdir(congress_no):
    if content != '.DS_Store':
        content_list.append(content)

# For each year, count the vote folders of the House ("h...") and the Senate ("s...").
for year in os.listdir(congress_votes):
    year_dir = congress_votes + year
    # A stray file (such as the '.DS_Store' filtered above) is not a year;
    # calling os.listdir() on it would raise, so skip anything that is not a directory.
    if not os.path.isdir(year_dir):
        continue
    years.append(year)
    house_count = 0
    senate_count = 0
    for chamber in os.listdir(year_dir):
        if chamber.startswith("h"):
            house_count += 1
        elif chamber.startswith("s"):
            senate_count += 1
    yearVotes[year] = {'senate_count': senate_count, 'house_count': house_count}

yearVotes

{'2003': {'house_count': 677, 'senate_count': 459},
 '2004': {'house_count': 544, 'senate_count': 216}}

In [44]:
# A look into "Foreign" bills and amendments from the House of Representatives.
# This test run returns the vote numbers of all 2003 House vote documents whose
# "question" field contains the search term "Foreign".

sample_year = '2003/'
house_path = congress_votes + sample_year + "h"
res_docs = []
wordSearch = "Foreign"
for i in range(1, yearVotes['2003']['house_count'] + 1):
    # Vote folders are addressed as h1, h2, ... up to the counted number of House votes.
    path = house_path + str(i) + '/data.json'
    # The with-block closes each file as soon as it has been parsed, so the loop
    # does not accumulate hundreds of open handles.
    with open(path, 'r') as f:
        x = json.load(f)
    if wordSearch in x["question"]:
        res_docs.append(i)

res_docs

[314, 369, 424, 429, 539]

In [145]:
# Relevant Vote Document parsed
# https://www.govtrack.us/data/congress/108/votes/2003/h424/data.json
# https://www.govtrack.us/congress/bills/108/hres372
class Processor():
    """
        Collects the votes of one congress whose "question" text contains a search
        term and flattens them into a dictionary of equally long column lists
        (self.res_dict), ready to be handed to pandas.DataFrame.

        congress_num -- path prefix of the congress data directory, including the
                        trailing slash (e.g. "data/108/"). "votes/" and "bills/"
                        are appended to it by plain string concatenation.
        word_search  -- substring looked for in every vote's "question" field.
    """

    def __init__(self, congress_num, word_search):
        self.congress_num = congress_num
        self.vote_types = ["Nay", "Not Voting", "Present", "Yea"]
        self.bill_types = ['hconres', 'hjres', 'hres', 'sconres', 'sjres', 'sres', 'hr', 's']

        # One list per output column; every recorded vote appends exactly one
        # value to each list.
        self.res_dict = {}
        for column in ('bill_title', 'isAmendment', 'result', 'date', 'question',
                       'chamber', 'year', 'bill_long_text', 'top_subject',
                       'amendment_type_num', 'bill_sponsor_name', 'list_of_voters_obj'):
            self.res_dict[column] = []
        for vote_type in self.vote_types:
            self.res_dict[vote_type + " (Democrats)"] = []
            self.res_dict[vote_type + " (Republicans)"] = []
            self.res_dict[vote_type + " (Others)"] = []

        self.word_search = word_search
        # Maps the path fragment that starts a vote folder name to the key under
        # which find_chamber_count() stores that chamber's number of votes.
        self.chambers = {"/h": "house_count", "/s": "senate_count"}

    @staticmethod
    def _load_json(path):
        """ Parses the JSON file at `path`; the file is closed before returning. """
        with open(path, 'r') as handle:
            return json.load(handle)

    @staticmethod
    def _native_str(text):
        """
            Returns `text` as the interpreter's native string type.

            On Python 2 (where str is bytes) the text is encoded to UTF-8. On
            Python 3 it is returned unchanged, so that comparisons against plain
            literals such as 'D' keep working.
        """
        if str is bytes:
            return text.encode('utf-8')
        return text

    def _voter_field(self, voter, key):
        """
            Returns voter[key] as a native string, or "unknown" when `key` is not
            found in `voter`.

            The membership test is deliberately `in`, so an entry that is a plain
            string rather than a dict simply yields "unknown" (presumably such
            entries exist in the vote data; see the votes wiki - TODO confirm).
        """
        if key in voter:
            return self._native_str(voter[key])
        return "unknown"

    def find_chamber_count(self):
        """
            Lists all vote sessions completed in the House and the Senate (both chambers)
            for every year found under "<congress_num>votes/".

            Sets self.years, self.congress_votes and self.count_arrangements.
            self.house_count / self.senate_count are left holding the counts of
            the last year processed.

            Returns a dictionary {year: {'senate_count': n, 'house_count': m}}.
        """
        self.years = []
        self.count_arrangements = {}
        self.congress_votes = self.congress_num + "votes/"
        for year in os.listdir(self.congress_votes):
            year_dir = self.congress_votes + year
            # Stray files (e.g. '.DS_Store') are not years; os.listdir() on them
            # would raise.
            if not os.path.isdir(year_dir):
                continue
            self.years.append(year)
            entries = os.listdir(year_dir)
            self.house_count = sum(1 for entry in entries if entry.startswith("h"))
            self.senate_count = sum(1 for entry in entries if entry.startswith("s"))
            self.count_arrangements[year] = {'senate_count': self.senate_count,
                                             'house_count': self.house_count}
        return self.count_arrangements

    def read_chamber_data(self):
        """
            Scans the votes numbered 1..count of every year and chamber, records each
            one whose "question" contains self.word_search, and stores the matching
            vote numbers in self.res_docs.

            Returns self.res_dict when at least one vote matched, otherwise an
            empty dictionary.
        """
        chambers_in_years = self.find_chamber_count()
        res_all = {}
        self.res_docs = []
        for chamber in self.chambers:
            count_key = self.chambers[chamber]
            for year in self.years:
                chamber_path = self.congress_votes + year + chamber
                for i in range(1, chambers_in_years[year][count_key] + 1):
                    path = chamber_path + str(i) + '/data.json'
                    vote = self._load_json(path)
                    if self.word_search in vote["question"]:
                        self.res_docs.append(i)
                        # The vote is already parsed, so record it directly
                        # instead of re-reading the file via arrange_dict().
                        res_all = self._record_vote(vote, count_key)
        return res_all

    def arrange_dict(self, vote_index, vote_path, chamber):
        """
            Reads the vote document at `vote_path` and appends one value per column
            to self.res_dict.

            vote_index -- accepted for interface compatibility; not used.
            vote_path  -- path of the vote's data.json file.
            chamber    -- "house_count" or "senate_count"; the part before the
                          underscore is stored in the 'chamber' column.

            Returns self.res_dict.
        """
        return self._record_vote(self._load_json(vote_path), chamber)

    def _record_vote(self, vote, chamber):
        """ Appends one row describing the parsed `vote` to self.res_dict and returns it. """
        votes_info = vote['votes']
        # Votes recorded as "Aye"/"No" are filed under the "Yea"/"Nay" columns.
        if "Aye" in votes_info:
            votes_info["Yea"] = votes_info["Aye"]
        if "No" in votes_info:
            votes_info["Nay"] = votes_info["No"]

        is_amendment = vote["category"] == "amendment"
        if not is_amendment:
            amendment_type_num = "no"
        elif "amendment" in vote:
            # `vote` is a dict, so the amendment has to be looked up as a key;
            # hasattr() would never find it.
            amendment = vote['amendment']
            amendment_type_num = str(amendment['type'][0]) + "amdt" + str(amendment['number'])
        else:
            amendment_type_num = "unknown"

        if "bill" in vote:
            bill_title = str(vote['bill']['type'] + str(vote['bill']['number']))
        else:
            bill_title = "unknown"

        # The bill document is read once and shared by all three bill columns.
        bill_long_text, top_subject, bill_sponsor_name = self._bill_info(vote)

        # Every value is computed before anything is appended, so a failure while
        # building the row cannot leave the columns with different lengths.
        row = {
            'bill_title': bill_title,
            'isAmendment': is_amendment,
            'amendment_type_num': amendment_type_num,
            'bill_long_text': bill_long_text,
            'top_subject': top_subject,
            'bill_sponsor_name': bill_sponsor_name,
            'result': self._native_str(vote['result_text']),
            'date': self._native_str(vote['date']),
            'year': vote['date'].split('-')[0],
            'question': self._native_str(vote['question']),
            'chamber': self._native_str(chamber.split('_')[0]),
        }

        party_suffixes = {'D': " (Democrats)", 'R': " (Republicans)", 'I': " (Others)"}
        list_of_people_for_vote = []
        for vote_type in self.vote_types:
            tally = dict.fromkeys(party_suffixes, 0)
            for voterObj in votes_info.get(vote_type, []):
                name = self._voter_field(voterObj, "display_name")
                party = self._voter_field(voterObj, "party")
                state = self._voter_field(voterObj, "state")
                # Only the parties D, R and I are tallied; any other value is
                # listed among the voters but not counted.
                if party in tally:
                    tally[party] += 1
                list_of_people_for_vote.append(
                    {"display_name": name, "party": party, "state": state, "vote": vote_type})
            for party_code, suffix in party_suffixes.items():
                row[vote_type + suffix] = tally[party_code]
        row["list_of_voters_obj"] = list_of_people_for_vote

        for column, value in row.items():
            self.res_dict[column].append(value)
        return self.res_dict

    def get_bill_info(self, vote_index, vote_path):
        """
            Extracts information from the bill relevant to the vote stored at `vote_path`.

            vote_index -- accepted for interface compatibility; not used.

            Returns the list [lower-cased summary text, top subject, sponsor name].
            All three entries are "unknown" when the vote has no bill or its bill
            type is not in self.bill_types.
        """
        return self._bill_info(self._load_json(vote_path))

    def _bill_info(self, vote):
        """ Looks up the bill document referenced by the parsed `vote`; see get_bill_info(). """
        if "bill" not in vote:
            return ["unknown", "unknown", "unknown"]
        bill_type = str(vote['bill']['type'])
        # Exact match on the type: unlike a substring search this does not depend
        # on the order of self.bill_types and cannot mistake an unexpected type
        # that merely contains "s" or "hr" for a Senate or House bill.
        if bill_type not in self.bill_types:
            return ["unknown", "unknown", "unknown"]
        bill_type_num = bill_type + str(vote['bill']['number'])
        bill_data_path = self.congress_num + "bills/" + bill_type + "/" + bill_type_num + "/data.json"
        bill = self._load_json(bill_data_path)

        # A bill without a summary (missing or null) yields "unknown" rather than
        # a TypeError on the lookup of its text.
        summary = bill.get("summary") or {}
        summary_text = summary.get("text")
        long_text = summary_text.lower() if summary_text is not None else "unknown"

        sponsor = bill.get("sponsor")
        sponsor_name = sponsor["name"] if sponsor else "unknown"
        return [long_text, bill.get("subjects_top_term", "unknown"), sponsor_name]

In [146]:
# Build one DataFrame per congress and merge them into a single CSV file.
# For a quick test run, shorten the list (e.g. ["104", "105"]).
congress_years = ["104", "105", "106", "107"]
frames = []

for congress in congress_years:
    # Progress marker; print() with a single argument behaves the same on
    # Python 2 and Python 3.
    print(congress)
    # The search term is a single space, so any vote whose question contains a
    # space is kept.
    pr = Processor("/Volumes/usb1/" + congress + "/", " ")
    df = pd.DataFrame(data=pr.read_chamber_data())
    frames.append(df)

# ignore_index gives the merged frame one continuous index instead of repeating
# each congress's 0..n-1 labels; the CSV is unaffected because it is written
# without the index.
allframes = pd.concat(frames, ignore_index=True)
filename_dest = "word_Space_congress_104_107.csv"
allframes.to_csv(filename_dest, index=False)

104
105
106
107


In [147]:
# Load the merged vote table back from disk and preview its first three rows.
data_info = pd.read_csv(filepath_or_buffer="./word_Space_congress_104_107.csv")
data_info.head(n=3)

Unnamed: 0,Nay (Democrats),Nay (Others),Nay (Republicans),Not Voting (Democrats),Not Voting (Others),Not Voting (Republicans),Present (Democrats),Present (Others),Present (Republicans),Yea (Democrats),...,bill_sponsor_name,bill_title,chamber,date,isAmendment,list_of_voters_obj,question,result,top_subject,year
0,19,0,0,5,0,0,0,0,0,23,...,"Dole, Robert J.",sres14,senate,1995-01-05T11:31:00-05:00,False,"[{'vote': 'Nay', 'party': 'D', 'state': 'NM', ...",On the Motion to Table S.Amdt. 1 to S.Res. 14 ...,Motion to Table Agreed to (76-19),Congress,1995
1,37,0,2,7,0,2,0,0,0,3,...,"Grassley, Chuck",s2,senate,1995-01-05T19:14:00-05:00,False,"[{'vote': 'Nay', 'party': 'R', 'state': 'MI', ...",On the Motion to Table S.Amdt. 3 to S. 2 (Cong...,Motion to Table Agreed to (52-39),Congress,1995
2,17,0,0,7,0,2,0,0,0,23,...,"Grassley, Chuck",s2,senate,1995-01-06T11:46:00-05:00,False,"[{'vote': 'Nay', 'party': 'D', 'state': 'MT', ...",On the Motion to Table S.Amdt. 5 to S. 2 (Cong...,Motion to Table Agreed to (74-17),Congress,1995


In [148]:
# Number of recorded votes per calendar year.
year_information = data_info.groupby(by='year').size()
year_information

year
1995    1498
1996     761
1997     938
1998     861
1999     985
2000     901
2001     892
2002     737
dtype: int64

In [152]:
# Titles of all bills whose summary text mentions "immigration".
counter = 0  # not used in this cell; kept in case other cells read it

# Vectorised substring test; na=False treats a missing summary as "no match"
# instead of raising, and regex=False searches for the literal word.
mentions_immigration = data_info['bill_long_text'].str.contains(
    "immigration", regex=False, na=False)
bills = data_info.loc[mentions_immigration, 'bill_title'].tolist()

bills_unique = np.unique(bills)

immigration_data = pd.DataFrame({"bill_title": bills_unique})
# Same result as appending to an empty frame, without DataFrame.append
# (removed in pandas 2.0).
immigration_df = immigration_data.copy()

print(bills_unique)

['hconres188' 'hconres284' 'hconres297' 'hjres122' 'hjres58' 'hjres94'
 'hr1119' 'hr1141' 'hr1158' 'hr1209' 'hr1271' 'hr1385' 'hr1401' 'hr1428'
 'hr1469' 'hr1493' 'hr1501' 'hr1555' 'hr1561' 'hr1617' 'hr1646' 'hr1658'
 'hr1757' 'hr1871' 'hr1885' 'hr1892' 'hr1906' 'hr1944' 'hr2015' 'hr2027'
 'hr2076' 'hr2155' 'hr2202' 'hr2215' 'hr2264' 'hr2267' 'hr2431' 'hr2464'
 'hr2500' 'hr2578' 'hr2586' 'hr2607' 'hr2670' 'hr2703' 'hr2886' 'hr2920'
 'hr2975' 'hr3004' 'hr3019' 'hr3073' 'hr3130' 'hr3162' 'hr3231' 'hr3244'
 'hr3259' 'hr3394' 'hr3448' 'hr3525' 'hr3581' 'hr3610' 'hr3734' 'hr3736'
 'hr3814' 'hr3989' 'hr4' 'hr4134' 'hr4194' 'hr4276' 'hr4278' 'hr4300'
 'hr4328' 'hr4425' 'hr4577' 'hr4678' 'hr4690' 'hr4775' 'hr4858' 'hr5005'
 'hr5063' 'hr5710' 'hr6' 'hr667' 'hr668' 'hr889' 'hres384' 'hres396'
 'hres528' 's1022' 's1059' 's1061' 's1161' 's1214' 's1217' 's1233' 's1357'
 's143' 's1510' 's1664' 's1723' 's1882' 's1956' 's2045' 's2260' 's2312'
 's2334' 's254' 's544' 's735' 's886' 's903' 's908' 's936' '

In [154]:
# All vote rows whose bill summary mentions "immigration", and the same rows
# reduced to the last vote per bill title (the recorded run printed lengths of
# 1135 and 114, respectively).
# NOTE(review): de-duplication uses bill_title only, so equally numbered bills
# from different congresses would collapse into one row - confirm this is intended.
# na=False keeps rows with a missing summary out of the mask instead of making
# the boolean indexing fail on NaN.
immigration_df = data_info[data_info['bill_long_text'].str.contains("immigration", na=False)]
unique_immigration_df = immigration_df.drop_duplicates(subset='bill_title', keep='last')

In [155]:
# Row counts before and after de-duplication; print() with a single argument
# behaves the same on Python 2 and Python 3.
print(len(immigration_df))
print(len(unique_immigration_df))

1135
114


In [160]:
# Write the de-duplicated immigration rows to a CSV file (without the index
# column) as input for the clustering step.
filename_dest = "unique_immigration_104_107.csv"
unique_immigration_df.to_csv(path_or_buf=filename_dest, index=False)