# Load and preprocess 2012 data

Over time we will look at other years as well. For now, our goal is to explore the features of a single year.

---

In [12]:
%pylab --no-import-all inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, make sure you have saved your own copy of the 2012 data at `../data/interim/2012data.dta` (the path used in the next cell), then retry.

In [13]:
# Relative path to the 2012 Stata (.dta) file.
file = "../data/interim/2012data.dta"
# Read the complete file into a DataFrame; later cells select a subset of its columns.
df_rawest = pd.read_stata(file)

In [14]:
# Raw column names to keep from the full file, grouped by topic.
good_columns = [#'campfin_limcorp', # "Should gov be able to limit corporate contributions"
    'pid_x',  # Your own party identification
    
    'abortpre_4point',  # Abortion
    'trad_adjust',  # Moral Relativism
    'trad_lifestyle',  # "Newer" lifestyles
    'trad_tolerant',  # Moral tolerance
    'trad_famval',  # Traditional Families
    'gayrt_discstd_x',  # Gay Job Discrimination
    'gayrt_milstd_x',  # Gay Military Service
    
    'inspre_self',  # National health insurance
    'guarpr_self',  # Guaranteed Job (renamed "StandardOfLiving" in the cleaning cell)
    'spsrvpr_ssself',  # Services/Spending
    
    'aa_work_x',  # Affirmative Action  ( Should this be aapost_hire_x? )
    'resent_workway',  # Renamed "RacialWorkWayUp" in the cleaning cell
    'resent_slavery',  # Renamed "RacialGenerational" in the cleaning cell
    'resent_deserve',  # Renamed "RacialDeserve" in the cleaning cell
    'resent_try',  # Renamed "RacialTryHarder" in the cleaning cell
]

# Keep only the selected columns; the cleaning cell works from this frame.
df_raw = df_rawest[good_columns]

## Clean the data
---

In [15]:
def convert_to_int(s):
    """Turn an ANES data entry into an integer code.

    The text before the first ``.`` in ``s`` is parsed as an integer, so a
    labelled answer such as ``"1. Some description"`` yields its leading code.

    Returns the integer code when parsing succeeds. If ``s`` is a string that
    does not start with an integer, a warning is issued and NaN is returned.
    If ``s`` has no ``partition`` method (it is not a string), it is returned
    unchanged.

    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    >>> convert_to_int("-2. Missing")
    -2
    """
    # Imported locally: the notebook's import cell does not bring in
    # `warnings`, so the ValueError branch used to fail with NameError
    # instead of warning and returning NaN.
    import warnings

    try:
        return int(s.partition('.')[0])
    except ValueError:
        warnings.warn("Couldn't convert: "+s)
        return np.nan
    except AttributeError:
        # Not a string: pass the value through untouched.
        return s

def negative_to_nan(value):
    """Replace a negative value with NaN and leave any other value alone.

    ANES uses negative numbers for non-answers (for example, when a
    question does not pertain to the respondent), so they are treated
    as missing.
    """
    if value >= 0:
        return value
    return np.nan

def lib1_cons2_neutral3(x):
    """Recode a question whose neutral answer is coded 3.

    A value of 1 is returned as is; every other value is shifted down
    by 3, so 2 becomes -1 and 3 becomes 0.
    """
    if x == 1:
        return x
    return x - 3

def liblow_conshigh(x):
    """Flip the sign of ``x`` to reverse the direction of a scale.

    Intended for questions where the liberal response has the low code.
    """
    flipped = -x
    return flipped

def dem_edu_special_treatment(x):
    """Turn negative codes and the code 95 ("Other") into NaN; keep the rest."""
    if x == 95 or x < 0:
        return np.nan
    return x

# Parse every cell into its integer code, then turn negative codes
# (ANES non-answers) into NaN.
df = df_raw.applymap(convert_to_int)
df = df.applymap(negative_to_nan)

# Abortion: keep only the answers coded 1-4, with the sign flipped;
# every other value becomes missing.
df['abortpre_4point'] = df.abortpre_4point.apply(lambda x: np.nan if x not in {1, 2, 3, 4} else -x)

# Columns whose sign is flipped with the liblow_conshigh helper
# (presumably so that all items point the same way; confirm against the
# ANES codebook).
# NOTE(review): earlier inline comments labelled trad_lifestyle as a 1-5
# "moral relativism" scale and trad_famval as a 1-7 "tolerance" scale. That
# does not match the descriptions given next to these names in good_columns,
# so the labels were dropped here. Verify the intended direction of each item.
reversed_columns = [
    'trad_lifestyle',
    'trad_famval',
    'spsrvpr_ssself',
    'resent_workway',
    'resent_try',
]
for column in reversed_columns:
    df.loc[:, column] = df[column].apply(liblow_conshigh)

# Readable names, keyed explicitly by raw column name. Pairing the two lists
# by position would silently mislabel every column if an entry were added,
# removed, or reordered in only one of them.
readable_names = {
    'pid_x': "PartyID",

    'abortpre_4point': "Abortion",
    'trad_adjust': "MoralRelativism",
    'trad_lifestyle': "NewerLifestyles",
    'trad_tolerant': "MoralTolerance",
    'trad_famval': "TraditionalFamilies",
    'gayrt_discstd_x': "GayJobDiscrimination",
    'gayrt_milstd_x': "GayMilitaryService",

    'inspre_self': "NationalHealthInsurance",
    'guarpr_self': "StandardOfLiving",
    'spsrvpr_ssself': "ServicesVsSpending",

    'aa_work_x': "AffirmativeAction",
    'resent_workway': "RacialWorkWayUp",
    'resent_slavery': "RacialGenerational",
    'resent_deserve': "RacialDeserve",
    'resent_try': "RacialTryHarder",
}

# Fail loudly if a selected column has no readable name yet.
unnamed_columns = [name for name in df.columns if name not in readable_names]
assert not unnamed_columns, "No readable name for: {}".format(unnamed_columns)

df = df.rename(columns=readable_names)

In [16]:
# Tell the reader that the cleaned frame is available as `df`.
print("Variables now available: df")

Variables now available: df


In [17]:
# Sanity check, part 1: label counts for party identification in the raw file.
df_rawest.pid_x.value_counts()

1. Strong Democrat               1485
2. Not very strong Democract      871
4. Independent                    792
7. Strong Republican              762
3. Independent-Democrat           747
6. Not very strong Republican     623
5. Independent-Republican         610
-2. Missing                        24
Name: pid_x, dtype: int64

In [18]:
# Sanity check, part 2: the same column after cleaning. Labels are now numeric
# codes, and the negative "Missing" code became NaN, which value_counts() omits.
df.PartyID.value_counts()

1.0    1485
2.0     871
4.0     792
7.0     762
3.0     747
6.0     623
5.0     610
Name: PartyID, dtype: int64

In [19]:
# Per-column summary statistics; sign-flipped items show up with negative ranges.
df.describe()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
count,5890.0,5855.0,5497.0,5492.0,5495.0,5490.0,2912.0,2942.0,5471.0,5476.0,5241.0,5839.0,5485.0,5485.0,5469.0,5484.0
mean,3.524278,-2.953373,3.134437,-2.461763,2.513558,-2.181967,2.083791,1.745751,3.998355,4.176954,-3.883419,5.004453,-2.298997,3.196171,3.522582,-2.681984
std,2.110186,1.090402,1.392147,1.241175,1.179452,1.190937,1.470371,1.259823,1.974286,1.839472,1.639783,1.850802,1.241282,1.358639,1.228953,1.275149
min,1.0,-4.0,1.0,-5.0,1.0,-5.0,1.0,1.0,1.0,1.0,-7.0,1.0,-5.0,1.0,1.0,-5.0
25%,1.0,-4.0,2.0,-3.0,2.0,-3.0,1.0,1.0,2.0,3.0,-5.0,4.0,-3.0,2.0,3.0,-4.0
50%,3.0,-3.0,3.0,-2.0,2.0,-2.0,1.0,1.0,4.0,4.0,-4.0,5.0,-2.0,3.0,4.0,-3.0
75%,5.0,-2.0,4.0,-1.0,3.0,-1.0,2.0,2.0,6.0,6.0,-3.0,7.0,-1.0,4.0,5.0,-2.0
max,7.0,-1.0,5.0,-1.0,5.0,-1.0,5.0,5.0,7.0,7.0,-1.0,7.0,-1.0,5.0,5.0,-1.0


In [20]:
# Look at the first few rows of the cleaned frame.
df.head()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
0,1.0,-2.0,,,,,1.0,1.0,5.0,,-5.0,4.0,,,,
1,1.0,-3.0,1.0,-5.0,1.0,-4.0,,,1.0,1.0,-5.0,1.0,-5.0,1.0,1.0,-5.0
2,1.0,-4.0,2.0,-4.0,1.0,-1.0,,,2.0,2.0,-6.0,2.0,-3.0,5.0,4.0,-4.0
3,1.0,-4.0,5.0,-1.0,2.0,-1.0,1.0,1.0,1.0,4.0,,4.0,-4.0,1.0,1.0,-5.0
4,3.0,-4.0,1.0,-4.0,2.0,-4.0,,,1.0,3.0,-5.0,2.0,-5.0,1.0,2.0,-4.0


In [21]:
# Save the cleaned frame. to_csv writes the row index by default, so the file
# has an extra unnamed first column unless it is read back with index_col=0.
df.to_csv("../data/processed/2012.csv")