In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import os



In [2]:
data_folder = "https://raw.githubusercontent.com/cdriscoll92/CS-109A-Final-Project/master/data/"
# local_data_folder = '/Users/poojatyagi/Dropbox (MIT)/CS 109A Final project/Data'
local_data_folder = "/Users/colleendriscoll/Dropbox/Classes/CS 109A/CS 109A Final project/data/"

In [3]:
## Reading in state abbreviations file to get the correct district ID columns
state_abbs = pd.read_csv(data_folder + "state_abbreviations_correspondence_table.csv")

In [4]:
## Grouping CLEA by district-year to get the 
## democratic share of the two-party vote
def group_to_D_vote(groupby_obj, democrat_code):
    ## Groupby object with "yr", "dist_id", "pty", ""
    years = []
    dist_ids = []
    dem_shares = []
    
    for name, group in groupby_obj:
        dem_share = 0
        years.append(group.yr.values[0])
        dist_ids.append(group.dist_id.values[0])

        if democrat_code in group.pty.values: ## If a Democrat ran
            total_votes = np.sum(group.cv1.values)
            dem_votes = np.sum(group.cv1[group.pty == democrat_code].values)
            dem_share = dem_votes/total_votes
        dem_shares.append(dem_share)
    
    dem_vote_share_dict = {'year': years,
                           'dist_id': dist_ids,
                           'dem_vote_share': dem_shares
                          }
    return(dem_vote_share_dict)


In [5]:
def clea_clean(clea_file_name, state_abb_df):
    ## Read in data
    clea_results = pd.read_csv(clea_file_name)
    democrat_code = 180
    republican_code = 583
    election_month_int = 11
    
    ## Subsetting to only Democrats and Republicans
    clea_results = clea_results[(clea_results.pty == democrat_code) | 
                                (clea_results.pty == republican_code)]
    ## Only general elections (November)
    clea_results = clea_results[clea_results.mn == election_month_int]

    ## Extracting district number from constituency name
    ## There are some states with only one district that then don't 
    ## have a district number listed -- therefore filling those NAs with 1s
    clea_results['dist_num'] = clea_results.cst_n.str.findall('[0-9]+').\
    str[0].fillna(1)
    
    ## Lowercase state name to match CLEA listing
    state_abb_df['state_name_lower'] = state_abb_df.state_name.str.lower()

    ## Merging CLEA with state abbrevation correspondence table
    clea_merged = pd.merge(clea_results, state_abb_df,
                              how = 'right',
                              left_on = 'sub',
                              right_on = 'state_name_lower')
    
    ## Creating distict ID variable to merge on later
    clea_merged['dist_id'] = clea_merged['state_abb']+ "_" + \
    clea_merged['dist_num'].astype(str)

    ## Grouping CLEA by district-year to get the democratic share of the 
    ## two-party vote
    grouped = clea_merged.groupby(['dist_id', 'yr'])
    
    dem_vote_share = pd.DataFrame(group_to_D_vote(grouped, 
                                                 democrat_code))
    
    return dem_vote_share

In [6]:
clea_cleaned = clea_clean(data_folder + "election_results/clea_20180507.csv",
                          state_abbs)

In [7]:
clea_cleaned

Unnamed: 0,year,dist_id,dem_vote_share
0,1980,AK_1,0.259215
1,1982,AK_1,0.288493
2,1984,AK_1,0.431049
3,1986,AK_1,0.421110
4,1988,AK_1,0.373454
5,1990,AK_1,0.480790
6,1992,AK_1,0.477895
7,1994,AK_1,0.365124
8,1996,AK_1,0.380061
9,1998,AK_1,0.356059


In [8]:
results_2018_df = pd.read_csv(local_data_folder + 
                           "election_results/2018_scraped_cleaned.csv")
grouped_2018 = results_2018_df.groupby(['dist_id', 'yr'])
results_2018 = pd.DataFrame(group_to_D_vote(grouped_2018, "D"))
election_results = pd.concat([clea_cleaned, results_2018],
                             ignore_index=True)

## Make sure that no observations were lost/added in the concatenation
assert (len(clea_cleaned)+ len(results_2018) ==\
        len(election_results)), \
"Combined DataFrame not same length as two DFs combined"

In [9]:

def drop_secondary_members(nominate_df):
    ## Support function for NOMINATE cleaning
    ## Districts where there was more than one member of Congress serving, 
    ## assign the one who voted the most number of times to the district
    multiple_member_districts = nominate_df.dist_id\
    [nominate_df.dist_id.duplicated()]
    
    nominate_df['main_member'] = 1
    for district in multiple_member_districts:
        member_votes = nominate_df.nominate_number_of_votes\
        [nominate_df.dist_id == district]

        orders = np.argsort(member_votes)

        lowest_score_index = nominate_df['main_member']\
        [nominate_df.dist_id == district][orders == 0].index

        nominate_df.loc[lowest_score_index, 'main_member'] = 0

    ## Only keeping the main member in each district
    nominate_df = nominate_df[nominate_df.main_member == 1]
    nominate_df = nominate_df.drop(columns = ['main_member'], axis = 1,
                                   inplace = False)

    return nominate_df

In [10]:
def nominate_scores_clean(nom_file_name, cols_keep):
    nominate_scores = pd.read_csv(nom_file_name)
    nominate_scores = nominate_scores[cols_keep]
    
    ## Dropping president
    nominate_scores = nominate_scores[nominate_scores['state_abbrev']\
                                      != "USA"]

    ## Dropping members who didn't vote (they can't provide ideology measures then)
    missing_vote_num_indices = nominate_scores.nominate_number_of_votes.isna()\
    == True
    nominate_scores = nominate_scores[~missing_vote_num_indices]

    ## District ID column
    nominate_scores['dist_id'] = nominate_scores.state_abbrev + '_' + \
    nominate_scores.district_code.astype(str)

    nominate_scores = drop_secondary_members(nominate_scores)

    nominate_scores.drop('nominate_number_of_votes', axis = 1,
                        inplace = True)

    ## Election year during which this Congress was in session (not the one that
    ## produced this Congress!)
    session_length = 2
    congress_start_year = 1788
    nominate_scores['year'] = congress_start_year + session_length*\
    nominate_scores['congress']

    return nominate_scores

In [11]:
nominate_csvs = os.listdir(local_data_folder + "nominate_scores")
nominate_csvs_full = [local_data_folder + "nominate_scores/" + x \
                     for x in nominate_csvs]

In [12]:
nom_cols_keep = ['congress', 'icpsr', 'district_code',
                'state_abbrev', 'party_code', 'bioname', 'born',
                'nominate_dim1', 'nominate_dim2','nominate_number_of_votes',
                'nokken_poole_dim1', 'nokken_poole_dim2']

In [13]:
nom_combined = nominate_scores_clean(nominate_csvs_full[0],
                                nom_cols_keep)

for file_path in nominate_csvs_full[1:]:
    df = nominate_scores_clean(file_path, nom_cols_keep)
    nom_combined = nom_combined.append(df, ignore_index = True)
    

In [14]:
merged_elections_ideology = pd.merge(election_results, nom_combined, how = "left", 
                                     on = ["year", "dist_id"])
merged_elections_ideology['age'] = merged_elections_ideology['year'] - \
merged_elections_ideology['born']

merged_elections_ideology['dem_incumbent'] = 0
merged_elections_ideology['dem_incumbent'][merged_elections_ideology.party_code == 100] = 1
merged_elections_ideology = merged_elections_ideology.drop(\
    ['district_code','state_abbrev', 'bioname', 'born', 'party_code'],
                                                           axis = 1, inplace = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
merged_elections_ideology.columns

Index(['year', 'dist_id', 'dem_vote_share', 'congress', 'icpsr',
       'nominate_dim1', 'nominate_dim2', 'nokken_poole_dim1',
       'nokken_poole_dim2', 'age', 'dem_incumbent'],
      dtype='object')

In [16]:
def lag_columns(df, by_cols, lag_cols, n_lag_terms):
    df.sort_values(by = by_cols,inplace=True)
    # 1) Create new columns
    new_col_names = [x+"_lag"+ str(i) for x in list(lag_cols) \
                     for i in range(1,n_lag_terms+1)]
    for new_col_name in new_col_names:
        df[new_col_name] = 'NaN'
    new_columns_dict = {x: [] for x in new_col_names}
    
    # Unique district IDs, for example
    groupby_values = df[by_cols[0]].unique()
    for val in groupby_values: ## in each district
        
        for lag_term in range(1, n_lag_terms+1): ## For each year lagged
            ## Get the right column name -- matches the one above
            new_col_name = lag_cols[0] + "_lag"+str(lag_term)
            ## Shift values using pd.DataFrame.shift()
            lagged_vals = df[df[by_cols[0]] == val][lag_cols[0]].shift(lag_term).values
            ## Insert lagged values back into the main data frame
            ## This is where the problem is
            new_columns_dict[new_col_name].extend(lagged_vals)
    
    for key in new_columns_dict:
        df[key] = new_columns_dict[key]
        
    return(df)

In [17]:
## Lagged vote share
merged_elections_ideology = lag_columns(merged_elections_ideology,
                                        ['dist_id', 'year'],
                                        ['dem_vote_share'], 1)
merged_elections_ideology = merged_elections_ideology.rename(
    columns={'dem_vote_share_lag1':'dem_prior_vote_share'})

In [18]:
merged_elections_ideology[100:105]

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,dem_incumbent,dem_prior_vote_share
95,1980,AL_5,1.0,96.0,14419.0,-0.175,0.834,-0.173,0.759,43.0,1,
96,1982,AL_5,0.815645,97.0,14419.0,-0.175,0.834,-0.227,0.974,45.0,1,1.0
97,1984,AL_5,1.0,98.0,14419.0,-0.175,0.834,-0.191,0.962,47.0,1,0.815645
98,1986,AL_5,0.789045,99.0,14419.0,-0.175,0.834,-0.173,0.838,49.0,1,1.0
99,1988,AL_5,0.650707,100.0,14419.0,-0.175,0.834,-0.093,0.603,51.0,1,0.789045


In [19]:
ntl_df = pd.read_csv(local_data_folder + "national_government_makeup.csv")
merged_elections_ideology = pd.merge(merged_elections_ideology, ntl_df,
                                     how = "left", on = "year")

## President = binary(0,1) = [Republican, Democrat]
## House, Senate = float(0,1) = proportion seats held by Democrats

In [20]:
merged_elections_ideology[:5]

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,dem_incumbent,dem_prior_vote_share,president,house,senate
0,1980,AK_1,0.259215,96.0,14066.0,0.279,0.014,0.22,0.171,47.0,0,,1,0.501149,0.59596
1,1982,AK_1,0.288493,97.0,14066.0,0.279,0.014,0.249,-0.22,49.0,0,0.259215,0,0.558621,0.464646
2,1984,AK_1,0.431049,98.0,14066.0,0.279,0.014,0.216,-0.301,51.0,0,0.288493,0,0.618391,0.464646
3,1986,AK_1,0.42111,99.0,14066.0,0.279,0.014,0.196,-0.298,53.0,0,0.431049,0,0.581609,0.53
4,1988,AK_1,0.373454,100.0,14066.0,0.279,0.014,0.212,-0.41,55.0,0,0.42111,0,0.593103,0.55


In [21]:
ACS_data = pd.read_csv(local_data_folder + "ACS_2005_2017.csv")

## Scaling columns to be proportions, not absolute numbers
columns_to_scale = ['bach_deg_num','black_pop','high_school_num','white_pop']
new_column_names = ['bachelor_deg_perc', 'black_perc', 'HS_diploma_perc', 'white_perc']
for i, colname in enumerate(columns_to_scale):
    ACS_data[new_column_names[i]] = ACS_data[colname]/ACS_data['total_pop']

ACS_data = ACS_data.drop(columns_to_scale, axis = 1, inplace = False)


In [22]:
ACS_data[:5]

Unnamed: 0,median_HH_income,median_age,mortgage_cost,total_pop,unemp_rate,year,district_num,state_name,bachelor_deg_perc,black_perc,HS_diploma_perc,white_perc
0,20188.0,37.3,615,643536.0,6.3,2005,1,Alabama,0.088626,0.280104,0.223001,0.687087
1,20268.0,37.2,545,629035.0,6.7,2005,2,Alabama,0.081261,0.298486,0.211079,0.67495
2,18797.0,36.6,532,616491.0,8.8,2005,3,Alabama,0.071274,0.314389,0.218238,0.653224
3,18606.0,38.9,468,634109.0,6.9,2005,4,Alabama,0.054366,0.047419,0.231413,0.917858
4,22070.0,38.6,593,649666.0,7.0,2005,5,Alabama,0.118981,0.165597,0.197577,0.784645


In [23]:
ACS_data = pd.merge(ACS_data, state_abbs[['state_name', 'state_abb']],
                   how = "left", on = "state_name")

In [24]:
ACS_data['dist_id'] = ACS_data['state_abb'] + "_"+ \
ACS_data['district_num'].astype(str)

In [25]:
cols_to_drop = ['district_num', 'state_name', 'state_abb', 'total_pop']
ACS_data = ACS_data.drop(cols_to_drop, axis = 1, inplace = False)

In [26]:
## Using earliest data (2017) to predict the 2018 election
## So for the merge to work, we have to recode the year of 
## the 2017 data for 2018.

ACS_data['year'][ACS_data.year == 2017] = 2018

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [27]:
combined_data = pd.merge(merged_elections_ideology, ACS_data,
                        how = "left", on = ['dist_id', 'year'])

In [28]:
## Dropping NAs
combined_data = combined_data.dropna()

In [30]:
## Redistricting variable
def merge_redist(df, redist_df, year_col='year', dist_id_col='dist_id'):
    merged_df = df.merge(redist_df, on=(year_col, dist_id_col), how='left')
    merged_df.redistricted.fillna(0, inplace=True)
    return merged_df

redist_df = pd.read_csv(local_data_folder + "redist_2000-2018.csv")

combined_data = merge_redist(combined_data, redist_df)

In [31]:
combined_data.to_csv(local_data_folder + "combined_data.csv",
                     index=False)