In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import os



In [2]:
data_folder = "https://raw.githubusercontent.com/cdriscoll92/CS-109A-Final-Project/master/data/"
# local_data_folder = '/Users/poojatyagi/Dropbox (MIT)/CS 109A Final project/Data'
local_data_folder = "/Users/colleendriscoll/Dropbox/Classes/CS 109A/CS 109A Final project/data/"

In [3]:
## Reading in state abbreviations file to get the correct district ID columns
state_abbs = pd.read_csv(data_folder + "state_abbreviations_correspondence_table.csv")

In [4]:
## Grouping CLEA by district-year to get the 
## democratic share of the two-party vote
def group_to_D_vote(groupby_obj, democrat_code):
    ## Groupby object with "yr", "dist_id", "pty", ""
    years = []
    dist_ids = []
    dem_shares = []
    
    for name, group in groupby_obj:
        dem_share = 0
        years.append(group.yr.values[0])
        dist_ids.append(group.dist_id.values[0])

        if democrat_code in group.pty.values: ## If a Democrat ran
            total_votes = np.sum(group.cv1.values)
            dem_votes = np.sum(group.cv1[group.pty == democrat_code].values)
            dem_share = dem_votes/total_votes
        dem_shares.append(dem_share)
    
    dem_vote_share_dict = {'year': years,
                           'dist_id': dist_ids,
                           'dem_vote_share': dem_shares
                          }
    return(dem_vote_share_dict)


In [5]:
def clea_clean(clea_file_name, state_abb_df):
    ## Read in data
    clea_results = pd.read_csv(clea_file_name)
    democrat_code = 180
    republican_code = 583
    election_month_int = 11
    
    ## Subsetting to only Democrats and Republicans
    clea_results = clea_results[(clea_results.pty == democrat_code) | 
                                (clea_results.pty == republican_code)]
    ## Only general elections (November)
    clea_results = clea_results[clea_results.mn == election_month_int]

    ## Extracting district number from constituency name
    ## There are some states with only one district that then don't 
    ## have a district number listed -- therefore filling those NAs with 1s
    clea_results['dist_num'] = clea_results.cst_n.str.findall('[0-9]+').\
    str[0].fillna(1)
    
    ## Lowercase state name to match CLEA listing
    state_abb_df['state_name_lower'] = state_abb_df.state_name.str.lower()

    ## Merging CLEA with state abbrevation correspondence table
    clea_merged = pd.merge(clea_results, state_abb_df,
                              how = 'right',
                              left_on = 'sub',
                              right_on = 'state_name_lower')
    
    ## Creating distict ID variable to merge on later
    clea_merged['dist_id'] = clea_merged['state_abb']+ "_" + \
    clea_merged['dist_num'].astype(str)

    ## Grouping CLEA by district-year to get the democratic share of the 
    ## two-party vote
    grouped = clea_merged.groupby(['dist_id', 'yr'])
    
    dem_vote_share = pd.DataFrame(group_to_D_vote(grouped, 
                                                 democrat_code))
    
    return dem_vote_share

In [6]:
clea_cleaned = clea_clean(data_folder + "election_results/clea_20180507.csv",
                          state_abbs)

In [7]:
results_2018_df = pd.read_csv(local_data_folder + 
                           "election_results/2018_scraped_cleaned.csv")
grouped_2018 = results_2018_df.groupby(['dist_id', 'yr'])
results_2018 = pd.DataFrame(group_to_D_vote(grouped_2018, "D"))
election_results = pd.concat([clea_cleaned, results_2018],
                             ignore_index=True)

## Make sure that no observations were lost/added in the concatenation
assert (len(clea_cleaned)+ len(results_2018) ==\
        len(election_results)), \
"Combined DataFrame not same length as two DFs combined"

In [8]:

def drop_secondary_members(nominate_df):
    ## Support function for NOMINATE cleaning
    ## Districts where there was more than one member of Congress serving, 
    ## assign the one who voted the most number of times to the district
    multiple_member_districts = nominate_df.dist_id\
    [nominate_df.dist_id.duplicated()]
    
    nominate_df['main_member'] = 1
    for district in multiple_member_districts:
        member_votes = nominate_df.nominate_number_of_votes\
        [nominate_df.dist_id == district]

        orders = np.argsort(member_votes)

        lowest_score_index = nominate_df['main_member']\
        [nominate_df.dist_id == district][orders == 0].index

        nominate_df.loc[lowest_score_index, 'main_member'] = 0

    ## Only keeping the main member in each district
    nominate_df = nominate_df[nominate_df.main_member == 1]
    nominate_df = nominate_df.drop(columns = ['main_member'], axis = 1,
                                   inplace = False)

    return nominate_df

In [9]:
def nominate_scores_clean(nom_file_name, cols_keep):
    nominate_scores = pd.read_csv(nom_file_name)
    nominate_scores = nominate_scores[cols_keep]
    
    ## Dropping president
    nominate_scores = nominate_scores[nominate_scores['state_abbrev']\
                                      != "USA"]

    ## Dropping members who didn't vote (they can't provide ideology measures then)
    missing_vote_num_indices = nominate_scores.nominate_number_of_votes.isna()\
    == True
    nominate_scores = nominate_scores[~missing_vote_num_indices]

    ## District ID column
    nominate_scores['dist_id'] = nominate_scores.state_abbrev + '_' + \
    nominate_scores.district_code.astype(str)

    nominate_scores = drop_secondary_members(nominate_scores)

    nominate_scores.drop('nominate_number_of_votes', axis = 1,
                        inplace = True)

    ## Election year during which this Congress was in session (not the one that
    ## produced this Congress!)
    session_length = 2
    congress_start_year = 1788
    nominate_scores['year'] = congress_start_year + session_length*\
    nominate_scores['congress']

    return nominate_scores

In [10]:
nominate_csvs = os.listdir(local_data_folder + "nominate_scores")
nominate_csvs_full = [local_data_folder + "nominate_scores/" + x \
                     for x in nominate_csvs]

In [11]:
nom_cols_keep = ['congress', 'icpsr', 'district_code',
                'state_abbrev', 'party_code', 'bioname', 'born',
                'nominate_dim1', 'nominate_dim2','nominate_number_of_votes',
                'nokken_poole_dim1', 'nokken_poole_dim2']

In [12]:
nom_combined = nominate_scores_clean(nominate_csvs_full[0],
                                nom_cols_keep)

for file_path in nominate_csvs_full[1:]:
    df = nominate_scores_clean(file_path, nom_cols_keep)
    nom_combined = nom_combined.append(df, ignore_index = True)
    

In [13]:
merged_elections_ideology = pd.merge(election_results, nom_combined, how = "left", 
                                     on = ["year", "dist_id"])
merged_elections_ideology['age'] = merged_elections_ideology['year'] - \
merged_elections_ideology['born']

merged_elections_ideology['dem_incumbent'] = 0
merged_elections_ideology['dem_incumbent'][merged_elections_ideology.party_code == 100] = 1
merged_elections_ideology = merged_elections_ideology.drop(\
    ['district_code','state_abbrev', 'bioname', 'born', 'party_code'],
                                                           axis = 1, inplace = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
merged_elections_ideology.columns

Index(['year', 'dist_id', 'dem_vote_share', 'congress', 'icpsr',
       'nominate_dim1', 'nominate_dim2', 'nokken_poole_dim1',
       'nokken_poole_dim2', 'age', 'dem_incumbent'],
      dtype='object')

In [50]:
## Lagged vote share
merged_elections_ideology = lag_columns(merged_elections_ideology,
                                        ['dist_id', 'year'],
                                        ['dem_vote_share'], 1)

['colA0', 'colA1', 'colB0', 'colB1']

In [51]:
merged_elections_ideology[100:105]

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,dem_incumbent
100,1990,AL_5,0.671408,101.0,14419.0,-0.175,0.834,-0.138,0.698,53.0,1
101,1992,AL_5,0.67249,102.0,29100.0,-0.132,0.612,-0.135,0.547,45.0,1
102,1994,AL_5,0.505039,103.0,29100.0,-0.132,0.612,-0.195,0.798,47.0,1
103,1996,AL_5,0.568885,104.0,29100.0,-0.132,0.612,-0.145,0.66,49.0,1
104,1998,AL_5,0.697262,105.0,29100.0,-0.132,0.612,-0.123,0.56,51.0,1


In [17]:
ntl_df = pd.read_csv(local_data_folder + "national_government_makeup.csv")
merged_elections_ideology = pd.merge(merged_elections_ideology, ntl_df,
                                     how = "left", on = "year")

## President = binary(0,1) = [Republican, Democrat]
## House, Senate = float(0,1) = proportion seats held by Democrats

In [19]:
merged_elections_ideology[:5]

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,dem_incumbent,president,house,senate
0,1980,AK_1,0.259215,96.0,14066.0,0.279,0.014,0.22,0.171,47.0,0,1,0.501149,0.59596
1,1982,AK_1,0.288493,97.0,14066.0,0.279,0.014,0.249,-0.22,49.0,0,0,0.558621,0.464646
2,1984,AK_1,0.431049,98.0,14066.0,0.279,0.014,0.216,-0.301,51.0,0,0,0.618391,0.464646
3,1986,AK_1,0.42111,99.0,14066.0,0.279,0.014,0.196,-0.298,53.0,0,0,0.581609,0.53
4,1988,AK_1,0.373454,100.0,14066.0,0.279,0.014,0.212,-0.41,55.0,0,0,0.593103,0.55


In [40]:
ACS_data = pd.read_csv(local_data_folder + "ACS_2005_2017.csv")

## Scaling columns to be proportions, not absolute numbers
columns_to_scale = ['bach_deg_num','black_pop','high_school_num','white_pop']
new_column_names = ['bachelor_deg_perc', 'black_perc', 'HS_diploma_perc', 'white_perc']
for i, colname in enumerate(columns_to_scale):
    ACS_data[new_column_names[i]] = ACS_data[colname]/ACS_data['total_pop']

ACS_data = ACS_data.drop(columns_to_scale, axis = 1, inplace = False)


In [41]:
ACS_data[:5]

Unnamed: 0,median_HH_income,median_age,mortgage_cost,total_pop,unemp_rate,year,district_num,state_name,bachelor_deg_perc,black_perc,HS_diploma_perc,white_perc
0,20188.0,37.3,615,643536.0,6.3,2005,1,Alabama,0.088626,0.280104,0.223001,0.687087
1,20268.0,37.2,545,629035.0,6.7,2005,2,Alabama,0.081261,0.298486,0.211079,0.67495
2,18797.0,36.6,532,616491.0,8.8,2005,3,Alabama,0.071274,0.314389,0.218238,0.653224
3,18606.0,38.9,468,634109.0,6.9,2005,4,Alabama,0.054366,0.047419,0.231413,0.917858
4,22070.0,38.6,593,649666.0,7.0,2005,5,Alabama,0.118981,0.165597,0.197577,0.784645


In [42]:
ACS_data = pd.merge(ACS_data, state_abbs[['state_name', 'state_abb']],
                   how = "left", on = "state_name")

In [43]:
ACS_data['dist_id'] = ACS_data['state_abb'] + "_"+ \
ACS_data['district_num'].astype(str)

In [44]:
cols_to_drop = ['district_num', 'state_name', 'state_abb', 'total_pop']
ACS_data = ACS_data.drop(cols_to_drop, axis = 1, inplace = False)

In [45]:
## Using earliest data (2017) to predict the 2018 election
## So for the merge to work, we have to recode the year of 
## the 2017 data for 2018.

ACS_data['year'][ACS_data.year == 2017] = 2018

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [46]:
combined_data = pd.merge(merged_elections_ideology, ACS_data,
                        how = "left", on = ['dist_id', 'year'])

In [47]:
## Dropping NAs
combined_data = combined_data.dropna()

In [48]:
combined_data

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,...,house,senate,median_HH_income,median_age,mortgage_cost,unemp_rate,bachelor_deg_perc,black_perc,HS_diploma_perc,white_perc
13,2006,AK_1,0.414254,109.0,14066.0,0.279,0.014,0.367,-0.047,73.0,...,0.464368,0.484848,25793.0,33.5,1015.0,9.4,0.107839,0.032051,0.177972,0.686767
14,2008,AK_1,0.472837,110.0,14066.0,0.279,0.014,0.291,0.269,75.0,...,0.535632,0.500000,30271.0,33.0,1166.0,7.7,0.108886,0.036257,0.164006,0.691129
15,2010,AK_1,0.306755,111.0,14066.0,0.279,0.014,0.242,0.642,77.0,...,0.590805,0.581633,28202.0,33.8,1167.0,9.6,0.116734,0.034610,0.159744,0.674308
16,2012,AK_1,0.309172,112.0,14066.0,0.279,0.014,0.338,-0.052,79.0,...,0.443678,0.520408,29932.0,33.8,1239.0,7.8,0.108769,0.035829,0.178108,0.665188
17,2014,AK_1,0.445620,113.0,14066.0,0.279,0.014,0.265,0.116,81.0,...,0.462069,0.540816,31557.0,33.3,1258.0,7.6,0.116235,0.033649,0.179152,0.656391
18,2016,AK_1,0.417197,114.0,14066.0,0.279,0.014,0.291,0.205,83.0,...,0.432184,0.448980,31981.0,33.5,1278.0,8.0,0.122472,0.032017,0.180522,0.644155
32,2006,AL_1,0.318440,109.0,20300.0,0.367,0.513,0.382,0.239,47.0,...,0.464368,0.484848,20810.0,37.0,686.0,6.3,0.083976,0.279771,0.236152,0.682812
33,2008,AL_1,0.000000,110.0,20300.0,0.367,0.513,0.401,0.393,49.0,...,0.535632,0.500000,22473.0,37.3,736.0,7.2,0.097215,0.270287,0.221705,0.685659
34,2010,AL_1,0.000000,111.0,20300.0,0.367,0.513,0.340,0.680,51.0,...,0.590805,0.581633,21250.0,38.4,763.0,12.8,0.098489,0.282178,0.218399,0.673739
35,2012,AL_1,0.000000,112.0,20300.0,0.367,0.513,0.310,0.506,53.0,...,0.443678,0.520408,22195.0,38.7,765.0,11.6,0.096977,0.277145,0.223431,0.678229


In [None]:
## Gotta code in the 2016 (training) and 2018 (test) vote share

In [49]:
combined_data.to_csv(local_data_folder + "combined_data.csv",
                     index=False)