In [13]:
import numpy as np
import pandas as pd
import re

In [14]:
## Absolute, machine-specific path to the project data directory.
## It must end with "/" because every file path below is built by plain
## string concatenation (data_folder + "file name").
data_folder = "/Users/colleendriscoll/Dropbox/Classes/CS 109A/CS 109A Final project/data/"

In [15]:
## State name <-> abbreviation lookup used to build the district ID columns.
## The file is tab-separated even though it carries a .csv extension.
state_abbs = pd.read_csv(
    data_folder + "state_abbreviations_correspondence_table.csv", sep="\t"
)

Load the raw HTML-table data (stored as a CSV) that lists candidates by congressional district, then clean it into a dataframe with one row per individual candidate.

In [16]:

def cand_info_clean(file_path, state_abb_df, election_year=2018):
    """Turn the per-district candidate file into a one-row-per-candidate table.

    Parameters
    ----------
    file_path : str
        Path to a CSV with the columns ``Location``, ``Incumbent``, ``Party``,
        ``First_elected`` and ``candidates``. Each ``candidates`` cell holds
        one ``"Name (Party)"`` entry per line.
    state_abb_df : pandas.DataFrame
        Lookup table with (at least) ``state_name`` and ``state_abb`` columns.
        It is left-merged onto the candidates by ``state_name``.
    election_year : int, optional
        Year from which ``Inc_first_elected`` is subtracted to obtain
        ``years_in_congress``. Defaults to 2018.

    Returns
    -------
    pandas.DataFrame
        Only candidates whose party is "Republican" or "Democratic", with the
        added columns ``state_name``, ``district_num``, ``dist_id``
        (``<state_abb>_<district_num>``), ``Cand_party_abb`` (first letter of
        the party), ``is_incumbent`` (1/0) and ``years_in_congress``
        (0 for non-incumbents). Row labels are those of the merged table
        before the party filter.

    Raises
    ------
    AttributeError
        If a non-blank candidate entry does not look like ``"Name (Party)"``
        or a ``Location`` has no ``"<state> <number>"`` shape (the regex
        search returns ``None``).
    """
    can_raw = pd.read_csv(file_path)

    ## Raw strings: "\(" in a normal string literal is an invalid escape
    name_pattern = re.compile(r"([^\(]+) \(")
    party_pattern = re.compile(r"\(([^\)]+)\)")
    state_pattern = re.compile(r"([A-Za-z ]+) [0-9]+")
    district_pattern = re.compile(r"([0-9]+)")

    out_columns = ['Location', 'Incumbent', 'Inc_party',
                   'Inc_first_elected', 'Candidate', 'Cand_party']

    ## One record per candidate; the district-level fields are repeated
    records = []
    district_rows = zip(can_raw['Location'], can_raw['Incumbent'],
                        can_raw['Party'], can_raw['First_elected'],
                        can_raw['candidates'])
    for location, incumbent, inc_party, first_elected, cand_str in district_rows:
        for entry in cand_str.split("\n"):
            if not entry.strip():
                ## Stray blank line (e.g. a trailing newline) -- no candidate
                continue
            records.append((location, incumbent, inc_party, first_elected,
                            name_pattern.search(entry).group(1),
                            party_pattern.search(entry).group(1)))

    can_clean_df = pd.DataFrame(records, columns=out_columns)

    ## Extracting state name and district number from "Location" column
    can_clean_df['state_name'] = [state_pattern.search(x).group(1)
                                  for x in can_clean_df['Location']]
    can_clean_df['district_num'] = [district_pattern.search(x).group(1)
                                    for x in can_clean_df['Location']]

    ## Merging candidate data with state abbreviation data
    can_clean_merged = pd.merge(can_clean_df, state_abb_df,
                                how='left', on='state_name')
    can_clean_merged['dist_id'] = (can_clean_merged['state_abb'] + "_"
                                   + can_clean_merged['district_num'])

    ## Subsetting data to only include Democrats and Republicans.
    ## .copy() makes the subset an independent frame, so the column
    ## assignments below never write to a temporary slice.
    major_party = can_clean_merged['Cand_party'].isin(["Republican", "Democratic"])
    can_clean_merged = can_clean_merged[major_party].copy()

    ## First letter of party is the party abbreviation
    can_clean_merged['Cand_party_abb'] = can_clean_merged['Cand_party'].astype(str).str[0]

    ## Dummy if the candidate is the incumbent
    can_clean_merged['is_incumbent'] = (can_clean_merged['Incumbent'] ==
                                        can_clean_merged['Candidate']).astype(int)

    ## How long has the candidate been in Congress? (0 for challengers)
    incumbent_rows = can_clean_merged['is_incumbent'] == 1
    tenure = (election_year - can_clean_merged['Inc_first_elected']).where(incumbent_rows, 0)
    if tenure.notna().all():
        ## Keep whole-year counts as integers when nothing is missing
        tenure = tenure.astype(int)
    can_clean_merged['years_in_congress'] = tenure

    return can_clean_merged

In [17]:
## One row per Democratic/Republican candidate, keyed by dist_id
can_clean_merged = cand_info_clean(data_folder + "candidates.csv", state_abbs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
can_clean_merged.head(5)

Unnamed: 0,Location,Incumbent,Inc_party,Inc_first_elected,Candidate,Cand_party,state_name,district_num,state_abb,dist_id,Cand_party_abb,is_incumbent,years_in_congress
0,Alabama 1,Bradley Byrne,Republican,2013.0,Bradley Byrne,Republican,Alabama,1,AL,AL_1,R,1,5
1,Alabama 1,Bradley Byrne,Republican,2013.0,Robert Kennedy Jr.,Democratic,Alabama,1,AL,AL_1,D,0,0
2,Alabama 2,Martha Roby,Republican,2010.0,Tabitha Isner,Democratic,Alabama,2,AL,AL_2,D,0,0
3,Alabama 2,Martha Roby,Republican,2010.0,Martha Roby,Republican,Alabama,2,AL,AL_2,R,1,8
4,Alabama 3,Mike Rogers,Republican,2002.0,Mallory Hagan,Democratic,Alabama,3,AL,AL_3,D,0,0


## NOMINATE scores 
https://voteview.com/data

In [19]:
def drop_secondary_members(nominate_df):
    """Keep a single member of Congress per district.

    Where more than one member served in a district, the member with the
    highest ``nominate_number_of_votes`` is kept and all others are dropped.
    Ties go to the row that appears first; a missing vote count ranks last.
    Rows whose ``dist_id`` is missing are passed through unchanged.

    Parameters
    ----------
    nominate_df : pandas.DataFrame
        Must contain ``dist_id`` and ``nominate_number_of_votes`` columns.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (a copy) with the original columns, row order and row
        labels, holding at most one row per non-missing ``dist_id``.
    """
    ## Rank members inside each district by number of votes, highest first.
    ## method='first' breaks ties by row order so exactly one row gets rank 1.
    votes_rank = (nominate_df
                  .groupby('dist_id')['nominate_number_of_votes']
                  .rank(method='first', ascending=False, na_option='bottom'))

    ## groupby skips rows with a missing dist_id (their rank is NaN);
    ## keep those rows rather than silently discarding them
    is_main_member = (votes_rank == 1) | nominate_df['dist_id'].isna()

    ## Only keeping the main member in each district
    return nominate_df[is_main_member].copy()

def nom_scores_clean(nom_file_name, cols_keep):
    """Load one NOMINATE members file and reduce it to one row per district.

    Parameters
    ----------
    nom_file_name : str
        Path to a members CSV.
    cols_keep : list of str
        Columns to retain. Must include ``state_abbrev``, ``district_code``,
        ``congress`` and ``nominate_number_of_votes``, which are all used
        below.

    Returns
    -------
    pandas.DataFrame
        The kept columns minus ``nominate_number_of_votes``, plus ``dist_id``
        (``<state_abbrev>_<district_code>``) and ``year``.
    """
    nominate_scores = pd.read_csv(nom_file_name)[cols_keep]

    ## Dropping president
    is_not_president = nominate_scores['state_abbrev'] != "USA"
    ## Dropping members who didn't vote (they can't provide ideology measures then)
    has_vote_count = nominate_scores['nominate_number_of_votes'].notna()
    ## .copy() so the column added below is written to an independent frame
    nominate_scores = nominate_scores[is_not_president & has_vote_count].copy()

    ## District ID column
    ## NOTE(review): if district_code is ever read as float, this produces
    ## ids like "AL_3.0" -- confirm the column is integer in every file.
    nominate_scores['dist_id'] = (nominate_scores['state_abbrev'] + '_'
                                  + nominate_scores['district_code'].astype(str))

    nominate_scores = drop_secondary_members(nominate_scores)

    ## The vote count was only needed to pick the main member
    nominate_scores = nominate_scores.drop('nominate_number_of_votes', axis=1)

    ## Election year during which this Congress was in session (not the one that
    ## produced this Congress!)
    session_length = 2
    congress_start_year = 1788
    nominate_scores['year'] = congress_start_year + session_length * nominate_scores['congress']

    return nominate_scores

In [35]:
## Columns retained from each NOMINATE members file
nom_cols_keep = [
    'congress',
    'chamber',
    'icpsr',
    'district_code',
    'state_abbrev',
    'party_code',
    'bioname',
    'born',
    'nominate_dim1',
    'nominate_dim2',
    'nominate_number_of_votes',
    'nokken_poole_dim1',
    'nokken_poole_dim2',
]

In [21]:
## Trying the cleaning function on a single members file (H113)
nom_test = nom_scores_clean(data_folder + "nominate scores/H113_members.csv",
                           nom_cols_keep)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [22]:
nom_test.head(5)

Unnamed: 0,congress,chamber,icpsr,district_code,state_abbrev,party_code,bioname,born,nominate_dim1,nominate_dim2,dist_id,year
2,113,House,20301,3,AL,200,"ROGERS, Mike Dennis",1958,0.338,0.446,AL_3,2014
3,113,House,21102,7,AL,100,"SEWELL, Terri",1965,-0.39,0.404,AL_7,2014
4,113,House,21192,2,AL,200,"ROBY, Martha",1976,0.367,0.659,AL_2,2014
5,113,House,21193,5,AL,200,"BROOKS, Mo",1954,0.601,-0.428,AL_5,2014
6,113,House,21376,1,AL,200,"BYRNE, Bradley",1955,0.544,0.304,AL_1,2014


## CLEA data
Elections data

In [23]:
def clea_clean(clea_file_name, state_abb_df):
    """Compute the Democratic share of the two-party vote per district-year.

    Parameters
    ----------
    clea_file_name : str
        Path to the CLEA Excel file. The columns ``pty``, ``mn``, ``cst_n``,
        ``sub``, ``yr`` and ``cv1`` are used.
    state_abb_df : pandas.DataFrame
        Lookup table with ``state_name`` and ``state_abb`` columns. It is not
        modified; the lower-cased name used for matching is added to a copy.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``dist_id`` (``<state_abb>_<district number>``) and
        ``dem_vote_share``, one row per (dist_id, year), sorted by those keys.
        The share is 0 when no Democrat ran in that district-year.
    """
    dem_code = 180          ## party code tested for "a Democrat ran"
    rep_code = 583          ## the other party code kept
    general_election_month = 11

    ## Read in data
    clea_results = pd.read_excel(clea_file_name)

    ## Subsetting to only Democrats and Republicans, and only general
    ## elections (November)
    is_major_party = clea_results['pty'].isin([dem_code, rep_code])
    is_general = clea_results['mn'] == general_election_month
    clea_results = clea_results[is_major_party & is_general].copy()

    ## Extracting district number from constituency name
    ## There are some states with only one district that then don't
    ## have a district number listed -- therefore filling those NAs with "1"
    ## (a string, so the column does not mix ints and strings)
    clea_results['dist_num'] = (clea_results['cst_n'].str.findall('[0-9]+')
                                .str[0].fillna("1"))

    ## Lowercase state name, added to a copy so the caller's table is untouched
    state_lookup = state_abb_df.assign(
        state_name_lower=state_abb_df['state_name'].str.lower())

    ## Merging CLEA with state abbreviation correspondence table.
    ## States without any CLEA row come through with a missing year and are
    ## discarded by the groupby below (missing keys are dropped by default).
    clea_merged = pd.merge(clea_results, state_lookup,
                           how='right',
                           left_on='sub',
                           right_on='state_name_lower')

    ## Creating district ID variable to merge on later
    clea_merged['dist_id'] = (clea_merged['state_abb'] + "_"
                              + clea_merged['dist_num'].astype(str))

    ## Grouping CLEA by district-year to get the democratic share of the
    ## two-party vote
    records = []
    for (dist_id, year), group in clea_merged.groupby(['dist_id', 'yr']):
        dem_rows = group['pty'] == dem_code
        dem_share = 0
        if dem_rows.any():  ## If a Democrat ran
            total_votes = np.sum(group['cv1'].values)
            dem_votes = np.sum(group['cv1'][dem_rows].values)
            dem_share = dem_votes / total_votes
        records.append((year, dist_id, dem_share))

    return pd.DataFrame(records, columns=['year', 'dist_id', 'dem_vote_share'])

In [24]:
## Democratic two-party vote share per district-year from the CLEA file
test_out = clea_clean(data_folder + "election_results/clea_20180507.xlsx",
                     state_abbs)

In [25]:
import os

In [36]:
## Clean every NOMINATE members file and stack the results.
## Only .csv entries are read (os.listdir also returns hidden/system files),
## and they are sorted so the row order does not depend on the file system.
nominate_dir = data_folder + "nominate scores/"
nominate_csvs = sorted(file_name for file_name in os.listdir(nominate_dir)
                       if file_name.lower().endswith(".csv"))

## Concatenate once instead of appending inside a loop: repeated appends
## copy the growing frame each time, and DataFrame.append no longer exists
## in pandas 2.0+.
nom_combined = pd.concat(
    [nom_scores_clean(nominate_dir + file_end, nom_cols_keep)
     for file_end in nominate_csvs],
    ignore_index=True)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [37]:
nom_combined.loc[nom_combined["dist_id"] == "AL_1"]

Unnamed: 0,congress,chamber,icpsr,district_code,state_abbrev,party_code,bioname,born,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,dist_id,year
2,96,House,10721,1,AL,200,"EDWARDS, William Jackson (Jack)",1928.0,0.177,0.161,0.126,0.102,AL_1,1980
435,97,House,10721,1,AL,200,"EDWARDS, William Jackson (Jack)",1928.0,0.177,0.161,0.156,0.006,AL_1,1982
869,98,House,10721,1,AL,200,"EDWARDS, William Jackson (Jack)",1928.0,0.177,0.161,0.141,0.098,AL_1,1984
1308,99,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.352,0.281,AL_1,1986
1741,100,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.365,0.122,AL_1,1988
2174,101,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.374,0.217,AL_1,1990
2608,102,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.397,0.264,AL_1,1992
3041,103,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.457,0.342,AL_1,1994
3476,104,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.36,0.365,AL_1,1996
3910,105,House,15090,1,AL,200,"CALLAHAN, Herbert Leon (Sonny)",1932.0,0.373,0.202,0.358,0.215,AL_1,1998


In [38]:
## Attach the NOMINATE scores to each district-year vote share; district-years
## without a matching NOMINATE row are kept, with missing scores.
merge_keys = ["year", "dist_id"]
merge_test = test_out.merge(nom_combined, how="left", on=merge_keys)

In [39]:
merge_test.head(15)

Unnamed: 0,year,dist_id,dem_vote_share,congress,chamber,icpsr,district_code,state_abbrev,party_code,bioname,born,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2
0,1980,AK_1,0.259215,96.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.22,0.171
1,1982,AK_1,0.288493,97.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.249,-0.22
2,1984,AK_1,0.431049,98.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.216,-0.301
3,1986,AK_1,0.42111,99.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.196,-0.298
4,1988,AK_1,0.373454,100.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.212,-0.41
5,1990,AK_1,0.48079,101.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.292,-0.223
6,1992,AK_1,0.477895,102.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.252,-0.214
7,1994,AK_1,0.365124,103.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.271,0.006
8,1996,AK_1,0.380061,104.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.273,0.227
9,1998,AK_1,0.356059,105.0,House,14066.0,1.0,AK,200.0,"YOUNG, Donald Edwin",1933.0,0.279,0.014,0.319,0.187


In [40]:
## (rows, columns) of the merged table
merge_test.shape

(8211, 15)