# Load and preprocess 2012 data

We will, over time, look over other years. Our current goal is to explore the features of a single year.

---

In [1]:
%pylab --no-import-all inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, be sure that you've saved your own data in the prescribed location, then retry.

In [2]:
file = "../data/interim/anes_timeseries_2012.dta"
df_rawest = pd.read_stata(file)

good_columns = ['campfin_limcorp', # "Should gov be able to limit corporate contributions"
                'pid_self', # Your own party identification
                'spsrvpr_ssself', # Government services
                'defsppr_self', # Defense spending
                'inspre_self', # 7pt scale govt-private medical insur scale
                'gun_control', # Gun control
                'guarpr_self',  # Is the gov't responsible for standard of living?
                'immig_policy',  # Which comes closest to your view about what government policy should be toward unauthorized immigrants now living in the United States?
                'aidblack_self',  # 7pt scale govt assistance to blacks scale: self-placemt
                'envjob_self',  # environment-jobs tradeoff self-placement
                'aa_uni',  # Does R favor or oppose affirmative action in universities
                'fedspend_ss',  # Federal Budget Spending: Social Security
                'fedspend_schools',  # Federal Budget Spending: public schools
                'fedspend_scitech',  # Federal Budget Spending: science and technology}
                'fedspend_crime',  # Federal Budget Spending: dealing with crime
                'fedspend_welfare',  # Federal Budget Spending: welfare programs
                'envir_gwarm',  # Is global warming happening or not
                'gayrt_marry',  # position on gay marriage
                'penalty_favdpen', # Favor death penalty?
                'relig_churchoft', # Do you go to church often?
                'dem_edu',  # Not really about parties.
                'dem_veteran', # Are you a veteran?
                'budget_rdefctax',  # Reduce deficit by increasing corporate taxes
                'budget_rdefmil',  # Reduce deficit by cutting military spending
                'patriot_amident',  # How important is being an American to you personally?
                'milln_milltax',  # "Favor or oppose tax on millionaires"
                'budget_rdef250k',  # Reduce deficit by raising personal inc tax for over 250K inc
                'fairjob_opin',  # "Opinion about govt ensuring fair jobs for blacks"
                'immigpo_jobs',  # Do immigrants take jobs?
                'wiretap_warrant',  # "Favor or oppose court auth for terr suspect wiretaps"
                'postvote_presvtwho',  # Did you vote for President Obama?
                'abortpre_4point',  # Abortion. higher: more permissive
                'pid_x',  # SUMMARY- Party ID (1: strong dem, 7: strong republican)
               ]

df_raw = df_rawest[good_columns]

## Clean the data
---

In [3]:
def convert_to_int(s):
    """Turn ANES data entry into an integer.
    
    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    """
    try:
        return int(s.partition('.')[0])
    except ValueError:
        warnings.warn("Couldn't convert: "+s)
        return np.nan
    except AttributeError:
        return s

def negative_to_nan(value):
    """Convert negative values to missing.
    
    ANES codes various non-answers as negative numbers.
    For instance, if a question does not pertain to the 
    respondent.
    """
    return value if value >= 0 else np.nan

def lib1_cons2_neutral3(x):
    """Rearrange questions where 3 is neutral."""
    return -3 + x if x != 1 else x

def liblow_conshigh(x):
    """Reorder questions where the liberal response is low."""
    return -x

def dem_edu_special_treatment(x):
    """Eliminate negative numbers and {95. Other}"""
    return np.nan if x == 95 or x <0 else x

def pid_self_special_treatment(x):
    return np.nan if x not in {1, 2, 3} else lib1_cons2_neutral3(x)

transforms = {}
transforms['campfin_limcorp'] = lib1_cons2_neutral3
transforms['pid_self'] = pid_self_special_treatment
transforms['defsppr_self'] = liblow_conshigh
transforms['inspre_self'] = liblow_conshigh
transforms['gun_control'] = lib1_cons2_neutral3
transforms['guarpr_self'] = liblow_conshigh
transforms['aidblack_self'] = liblow_conshigh
transforms['envjob_self'] = liblow_conshigh
transforms['aa_uni'] = lib1_cons2_neutral3
transforms['fedspend_ss'] = lib1_cons2_neutral3
transforms['fedspend_schools'] = lib1_cons2_neutral3
transforms['fedspend_scitech'] = lib1_cons2_neutral3
transforms['fedspend_crime'] = lib1_cons2_neutral3
transforms['fedspend_welfare'] = lib1_cons2_neutral3
transforms['envir_gwarm'] = liblow_conshigh
transforms['gayrt_marry'] = liblow_conshigh
transforms['relig_churchoft'] = liblow_conshigh
transforms['budget_rdefctax'] = lib1_cons2_neutral3
transforms['budget_rdefmil'] = lib1_cons2_neutral3
transforms['milln_milltax'] = lib1_cons2_neutral3
transforms['fairjob_opin'] = lambda x: np.nan if x not in {1, 2} else -x
transforms['wiretap_warrant'] = lib1_cons2_neutral3
transforms['dem_edu'] = dem_edu_special_treatment
transforms['budget_rdef250K'] = lib1_cons2_neutral3
transforms['postvote_presvtwho'] = lambda x: np.nan if x not in {1, 2, 5} else (3 - 2 * x) if x in {1, 2} else 0
transforms['abortpre_4point'] = lambda x: np.nan if x not in {1, 2, 3, 4} else x

df = df_raw.applymap(convert_to_int)
df = df.applymap(negative_to_nan)

# Transform each column.
for column in df.columns:
    try:
        df[column] = df[column].map(transforms[column], na_action='ignore')
    except KeyError:
        pass

In [4]:
print("Variables now available: df")

Variables now available: df


In [5]:
df_rawest.pid_self.value_counts()

1. Democrat                 2361
3. Independent              1845
2. Republican               1389
5. Other party {SPECIFY}     161
0. No preference {VOL}        66
-9. Refused                   48
-8. Don't know                44
Name: pid_self, dtype: int64

In [6]:
df.head()

Unnamed: 0,campfin_limcorp,pid_self,spsrvpr_ssself,defsppr_self,inspre_self,gun_control,guarpr_self,immig_policy,aidblack_self,envjob_self,...,budget_rdefmil,patriot_amident,milln_milltax,budget_rdef250k,fairjob_opin,immigpo_jobs,wiretap_warrant,postvote_presvtwho,abortpre_4point,pid_x
0,1.0,1.0,5.0,,-5.0,1.0,,3.0,-1.0,,...,,,,,,,,,2.0,1.0
1,-1.0,1.0,5.0,-1.0,-1.0,1.0,-1.0,4.0,,,...,1.0,2.0,1.0,1.0,-1.0,4.0,-1.0,1.0,3.0,1.0
2,,1.0,6.0,,-2.0,0.0,-2.0,3.0,,-2.0,...,-1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0,4.0,1.0
3,-1.0,1.0,,,-1.0,0.0,-4.0,3.0,-1.0,,...,0.0,1.0,0.0,2.0,-1.0,1.0,1.0,1.0,4.0,1.0
4,1.0,0.0,5.0,-4.0,-1.0,0.0,-3.0,3.0,-1.0,-1.0,...,-1.0,1.0,1.0,1.0,-1.0,3.0,1.0,1.0,4.0,3.0


In [7]:
df.to_csv("../data/processed/2012.csv")