In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import os



In [2]:
data_folder = "https://raw.githubusercontent.com/cdriscoll92/CS-109A-Final-Project/master/data/"
# local_data_folder = '/Users/poojatyagi/Dropbox (MIT)/CS 109A Final project/Data'
local_data_folder = "/Users/colleendriscoll/Dropbox/Classes/CS 109A/CS 109A Final project/data/"

In [3]:
## Reading in state abbreviations file to get the correct district ID columns
state_abbs = pd.read_csv(data_folder + "state_abbreviations_correspondence_table.csv")

In [4]:
## Grouping CLEA by district-year to get the 
## democratic share of the two-party vote
def group_to_D_vote(groupby_obj, democrat_code):
    ## Groupby object with "yr", "dist_id", "pty", ""
    years = []
    dist_ids = []
    dem_shares = []
    
    for name, group in groupby_obj:
        dem_share = 0
        years.append(group.yr.values[0])
        dist_ids.append(group.dist_id.values[0])

        if democrat_code in group.pty.values: ## If a Democrat ran
            total_votes = np.sum(group.cv1.values)
            dem_votes = np.sum(group.cv1[group.pty == democrat_code].values)
            dem_share = dem_votes/total_votes
        dem_shares.append(dem_share)
    
    dem_vote_share_dict = {'year': years,
                           'dist_id': dist_ids,
                           'dem_vote_share': dem_shares
                          }
    return(dem_vote_share_dict)


In [5]:
def clea_clean(clea_file_name, state_abb_df):
    ## Read in data
    clea_results = pd.read_csv(clea_file_name)
    democrat_code = 180
    republican_code = 583
    election_month_int = 11
    
    ## Subsetting to only Democrats and Republicans
    clea_results = clea_results[(clea_results.pty == democrat_code) | 
                                (clea_results.pty == republican_code)]
    ## Only general elections (November)
    clea_results = clea_results[clea_results.mn == election_month_int]

    ## Extracting district number from constituency name
    ## There are some states with only one district that then don't 
    ## have a district number listed -- therefore filling those NAs with 1s
    clea_results['dist_num'] = clea_results.cst_n.str.findall('[0-9]+').\
    str[0].fillna(1)
    
    ## Lowercase state name to match CLEA listing
    state_abb_df['state_name_lower'] = state_abb_df.state_name.str.lower()

    ## Merging CLEA with state abbrevation correspondence table
    clea_merged = pd.merge(clea_results, state_abb_df,
                              how = 'right',
                              left_on = 'sub',
                              right_on = 'state_name_lower')
    
    ## Creating distict ID variable to merge on later
    clea_merged['dist_id'] = clea_merged['state_abb']+ "_" + \
    clea_merged['dist_num'].astype(str)

    ## Grouping CLEA by district-year to get the democratic share of the 
    ## two-party vote
    grouped = clea_merged.groupby(['dist_id', 'yr'])
    
    dem_vote_share = pd.DataFrame(group_to_D_vote(grouped, 
                                                 democrat_code))
    
    return dem_vote_share

In [6]:
clea_cleaned = clea_clean(data_folder + "election_results/clea_20180507.csv",
                          state_abbs)

In [7]:
results_2018_df = pd.read_csv(local_data_folder + 
                           "election_results/2018_scraped_cleaned.csv")
grouped_2018 = results_2018_df.groupby(['dist_id', 'yr'])
results_2018 = pd.DataFrame(group_to_D_vote(grouped_2018, "D"))
election_results = pd.concat([clea_cleaned, results_2018],
                             ignore_index=True)

## Make sure that no observations were lost/added in the concatenation
assert (len(clea_cleaned)+ len(results_2018) ==\
        len(election_results)), \
"Combined DataFrame not same length as two DFs combined"

In [8]:

def drop_secondary_members(nominate_df):
    ## Support function for NOMINATE cleaning
    ## Districts where there was more than one member of Congress serving, 
    ## assign the one who voted the most number of times to the district
    multiple_member_districts = nominate_df.dist_id\
    [nominate_df.dist_id.duplicated()]
    
    nominate_df['main_member'] = 1
    for district in multiple_member_districts:
        member_votes = nominate_df.nominate_number_of_votes\
        [nominate_df.dist_id == district]

        orders = np.argsort(member_votes)

        lowest_score_index = nominate_df['main_member']\
        [nominate_df.dist_id == district][orders == 0].index

        nominate_df.loc[lowest_score_index, 'main_member'] = 0

    ## Only keeping the main member in each district
    nominate_df = nominate_df[nominate_df.main_member == 1]
    nominate_df = nominate_df.drop(columns = ['main_member'], axis = 1,
                                   inplace = False)

    return nominate_df

In [9]:
def nominate_scores_clean(nom_file_name, cols_keep):
    nominate_scores = pd.read_csv(nom_file_name)
    nominate_scores = nominate_scores[cols_keep]
    
    ## Dropping president
    nominate_scores = nominate_scores[nominate_scores['state_abbrev']\
                                      != "USA"]

    ## Dropping members who didn't vote (they can't provide ideology measures then)
    missing_vote_num_indices = nominate_scores.nominate_number_of_votes.isna()\
    == True
    nominate_scores = nominate_scores[~missing_vote_num_indices]

    ## District ID column
    nominate_scores['dist_id'] = nominate_scores.state_abbrev + '_' + \
    nominate_scores.district_code.astype(str)

    nominate_scores = drop_secondary_members(nominate_scores)

    nominate_scores.drop('nominate_number_of_votes', axis = 1,
                        inplace = True)

    ## Election year during which this Congress was in session (not the one that
    ## produced this Congress!)
    session_length = 2
    congress_start_year = 1788
    nominate_scores['year'] = congress_start_year + session_length*\
    nominate_scores['congress']

    return nominate_scores

In [10]:
nominate_csvs = os.listdir(local_data_folder + "nominate_scores")
nominate_csvs_full = [local_data_folder + "nominate_scores/" + x \
                     for x in nominate_csvs]

In [11]:
nom_cols_keep = ['congress', 'icpsr', 'district_code',
                'state_abbrev', 'party_code', 'bioname', 'born',
                'nominate_dim1', 'nominate_dim2','nominate_number_of_votes',
                'nokken_poole_dim1', 'nokken_poole_dim2']

In [12]:
nom_combined = nominate_scores_clean(nominate_csvs_full[0],
                                nom_cols_keep)

for file_path in nominate_csvs_full[1:]:
    df = nominate_scores_clean(file_path, nom_cols_keep)
    nom_combined.append(df, ignore_index = True)
    

In [92]:
merged_elections_ideology = pd.merge(election_results, nom_combined, how = "left", 
                                     on = ["year", "dist_id"])
merged_elections_ideology['age'] = merged_elections_ideology['year'] - \
merged_elections_ideology['born']

merged_elections_ideology['dem_incumbent'] = 0
merged_elections_ideology['dem_incumbent'][merged_elections_ideology.party_code == 100] = 1
merged_elections_ideology.drop(['district_code','state_abbrev',
                               'bioname', 'born', 'party_code'],
                               axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [93]:
merged_elections_ideology.columns

Index(['year', 'dist_id', 'dem_vote_share', 'congress', 'icpsr',
       'nominate_dim1', 'nominate_dim2', 'nokken_poole_dim1',
       'nokken_poole_dim2', 'age', 'dem_incumbent'],
      dtype='object')

In [94]:
lag_cols = ['colA', 'colB']
n_lag_terms = 2

new_col_names = [x+str(i) for x in lag_cols for i in range(n_lag_terms)]
new_col_names

['colA0', 'colA1', 'colB0', 'colB1']

In [106]:
df = merged_elections_ideology
by_cols = ['dist_id', 'year']
lag_column_name = ['dem_vote_share']
n_lag_terms = 2

## Beginnings of a function to lag variables
# df.gdp.shift(-1)
## ^How to do it in pandas
## I should make a function to do it by district and year
## https://stackoverflow.com/questions/26280345/pandas-shift-down-values-by-one-row-within-a-group

## Steps:
# 1) Create new columns
# 2) By district, sort values
# 3) By district, shift and insert values into new columns

def lag_columns(df, by_cols, lag_cols, n_lag_terms):
    df.sort_values(by = by_cols)
    # 1) Create new columns
    new_col_names = [x+"_lag"+ str(i) for x in list(lag_cols) \
                     for i in range(1,n_lag_terms+1)]
    for new_col_name in new_col_names:
        df[new_col_name] = 'NaN'
    new_columns_dict = {x: [] for x in new_col_names}
    
    # Unique district IDs, for example
    groupby_values = df[by_cols[0]].unique()
    for val in groupby_values: ## in each district
        
        for lag_term in range(1, n_lag_terms+1): ## For each year lagged
            ## Get the right column name -- matches the one above
            new_col_name = lag_cols[0] + "_lag"+str(lag_term)
            ## Shift values using pd.DataFrame.shift()
            lagged_vals = df[df[by_cols[0]] == val][lag_cols[0]].shift(lag_term).values
            ## Insert lagged values back into the main data frame
            ## This is where the problem is
            new_columns_dict[new_col_name].extend(lagged_vals)
    
    for key in new_columns_dict:
        df[key] = new_columns_dict[key]
        
    return(df)
    
df_test = lag_columns(df, by_cols, lag_column_name, n_lag_terms)


In [107]:
df_test[df_test.dist_id == "WI_6"]

Unnamed: 0,year,dist_id,dem_vote_share,congress,icpsr,nominate_dim1,nominate_dim2,nokken_poole_dim1,nokken_poole_dim2,age,dem_incumbent,dem_vote_share_lag1,dem_vote_share_lag2
8061,1980,WI_6,0.406532,96.0,14675.0,0.379,-0.367,0.303,-0.622,40.0,0,1.0,1.0
8062,1982,WI_6,0.349869,97.0,14675.0,0.379,-0.367,0.306,-0.555,42.0,0,0.683766,1.0
8063,1984,WI_6,0.24168,98.0,14675.0,0.379,-0.367,0.288,-0.531,44.0,0,0.616147,0.683766
8064,1986,WI_6,0.0,99.0,14675.0,0.379,-0.367,0.241,-0.233,46.0,0,0.786284,0.616147
8065,1988,WI_6,0.257532,100.0,14675.0,0.379,-0.367,0.272,-0.133,48.0,0,1.0,0.786284
8066,1990,WI_6,0.0,101.0,14675.0,0.379,-0.367,0.364,-0.085,50.0,0,1.0,1.0
8067,1992,WI_6,0.471256,102.0,14675.0,0.379,-0.367,0.294,-0.129,52.0,0,0.0,1.0
8068,1994,WI_6,0.0,103.0,14675.0,0.379,-0.367,0.423,-0.537,54.0,0,0.354996,0.0
8069,1996,WI_6,0.246569,104.0,14675.0,0.379,-0.367,0.545,-0.722,56.0,0,0.0,0.354996
8070,1998,WI_6,0.0,105.0,14675.0,0.379,-0.367,0.488,-0.585,58.0,0,0.404174,0.0


In [None]:
df_test[:25]

In [None]:
def add_lagged_terms(df, lag_column_names, by_columns, n_lag_terms):
    df.gdp.shift(-1)