# Load and preprocess 2016 data

We will, over time, look over other years. Our current goal is to explore the features of a single year.

---

In [1]:
%pylab --no-import-all inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, be sure that you've saved your own data in the prescribed location, then retry.

In [2]:
file = "../data/interim/2016data.dta"
df_rawest = pd.read_stata(file)

In [5]:
good_columns = [#'campfin_limcorp', # "Should gov be able to limit corporate contributions"
    'V161126',  # Your own party identification
    
    'V161232',  # Abortion
    'V162207',  # Moral Relativism
    'V162208',  # "Newer" lifetyles
    'V162209',  # Moral tolerance
    'V162210',  # Traditional Families
    'V161229x',  # Gay Job Discrimination
    'V161230',  # Gay Adoption
    
    'V161184',  # National health insurance
    'V161189',  # Guaranteed Job
    'V161178',  # Services/Spending
]

df_raw = df_rawest[good_columns]

## Clean the data
---

In [13]:
def convert_to_int(s):
    """Turn ANES data entry into an integer.
    
    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    """
    try:
        return int(s.partition('.')[0])
    except ValueError:
        warnings.warn("Couldn't convert: "+s)
        return np.nan
    except AttributeError:
        return s

def negative_to_nan(value):
    """Convert negative values to missing.
    
    ANES codes various non-answers as negative numbers.
    For instance, if a question does not pertain to the 
    respondent.
    """
    return value if value >= 0 else np.nan

def lib1_cons2_neutral3(x):
    """Rearrange questions where 3 is neutral."""
    return -3 + x if x != 1 else x

def liblow_conshigh(x):
    """Reorder questions where the liberal response is low."""
    return -x

df = df_raw.applymap(convert_to_int)
df = df.applymap(negative_to_nan)

df.rename(inplace=True, columns=dict(zip(
    good_columns,
    ["PartyID",
    
    "Abortion",
    "MoralRelativism",
    "NewerLifestyles",
    "MoralTolerance",
    "TraditionalFamilies",
    "GayJobDiscrimination",
    "GayAdoption",

    "NationalHealthInsurance",
    "StandardOfLiving",
    "ServicesVsSpending",

#    "AffirmativeAction",
#    "RacialResentment1",
#    "RacialResentment2",
#    "RacialResentment3",
#    "RacialResentment4",
    ]
)))

df.PartyID = df.PartyID.apply(lambda x: np.nan if x == 99 else -x)
df.Abortion = df.Abortion.apply(lambda x: np.nan if x not in {1, 2, 3, 4} else x)

df.loc[:, 'MoralRelativism'] = df.MoralRelativism.apply(lambda x: -x)  # 1: moral relativism, 5: no relativism
df.loc[:, 'MoralTolerance'] = df.MoralTolerance.apply(lambda x: -x)  # Tolerance. 1: tolerance, 7: not
df.loc[:, 'GayJobDiscrimination'] = df.GayJobDiscrimination.apply(lambda x: -x)  # protect gays against discrimination
df.loc[:, 'GayAdoption'] = df.GayAdoption.apply(lambda x: -x)  # support gay adoption


df.loc[:, 'NationalHealthInsurance'] = df.NationalHealthInsurance.apply(lambda x: -x)  # Gov't insurance?
df.loc[:, 'StandardOfLiving'] = df.StandardOfLiving.apply(lambda x: -x)  # Gov't guaranteed job?

# df.loc[:, 'aa_work_x'] = df.aa_work_x.apply(lambda x: -x)  # affirmative action
# df.loc[:, 'V045194'] = df.V045194.apply(lambda x: -x)  # Racial support
# df.loc[:, 'V045195'] = df.V045195.apply(lambda x: -x)  # Systemic factors?



In [14]:
print("Variables now available: df")

Variables now available: df


In [15]:
df_rawest.V161126.value_counts()

99. Haven't thought much about this (FTF ONLY: DO NOT PROBE)    944
4. Moderate, middle of the road                                 895
6. Conservative                                                 703
5. Slightly conservative                                        508
2. Liberal                                                      506
3. Slightly liberal                                             380
7. Extremely conservative                                       166
1. Extremely liberal                                            146
-9. Refused                                                      18
-8. Don't know (FTF only)                                         5
Name: V161126, dtype: int64

In [16]:
df.PartyID.value_counts()

-4.0    895
-6.0    703
-5.0    508
-2.0    506
-3.0    380
-7.0    166
-1.0    146
Name: PartyID, dtype: int64

In [18]:
df.head()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayAdoption,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending
0,,2.0,-4.0,1.0,-4.0,2.0,-4.0,-2.0,-6.0,-7.0,99.0
1,-5.0,4.0,-1.0,3.0,-1.0,3.0,,-2.0,-1.0,-4.0,4.0
2,,2.0,-2.0,1.0,-2.0,3.0,-1.0,-1.0,-6.0,-7.0,99.0
3,,3.0,-4.0,2.0,-2.0,1.0,-1.0,-2.0,-6.0,-5.0,3.0
4,-4.0,4.0,-2.0,2.0,-3.0,2.0,-4.0,-1.0,-1.0,-7.0,99.0


In [17]:
df.to_csv("../data/processed/2016.csv")