# Load and preprocess 2008 data

Over time we will look at other years as well. For now, the goal is to explore the features of a single year.

---

In [1]:
%pylab --no-import-all inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, make sure you have saved your own copy of the data at `../data/interim/2008data.dta` (relative to this notebook), then retry.

In [2]:
# Relative path to the Stata (.dta) file holding the 2008 data.
file = "../data/interim/2008data.dta"
# Load the file untouched; the cells below show that responses arrive as
# labelled entries such as "5. Against ..." rather than bare numbers.
df_rawest = pd.read_stata(file)

In [3]:
# Tally the raw responses for V085157 (labelled "Affirmative Action" in the
# column list below) to see how the labelled entries look before cleaning.
df_rawest["V085157"].value_counts()

5. Against preferential hiring and promotion of blacks    1442
1. For preferential hiring and promotion of blacks         474
-2. No Post-election IW                                    220
7. Other {SPECIFY}                                         121
-8. Don't know                                              38
-9. Refused                                                 27
Name: V085157, dtype: int64

In [4]:
# Variables to keep from the raw frame.
# NOTE: the order matters -- the cleaning cell pairs this list positionally
# (via zip) with a list of readable names, so any entry added or commented out
# here must be mirrored there.
good_columns = [#'campfin_limcorp', # "Should gov be able to limit corporate contributions"
    'V083098x',  # Your own party identification
    
    # Moral / social questions
    'V085086',  # Abortion
    'V085139',  # Moral Relativism
    'V085140',  # "Newer" lifestyles
    'V085141',  # Moral tolerance
    'V085142',  # Traditional Families
    'V083211x',  # Gay Job Discrimination
    'V083213',  # Gay Adoption
    'V083212x',  # Gay Military Service
    
    # Economic / role-of-government questions
    'V083119',  # National health insurance
    'V083128',  # Guaranteed Job
    'V083105',  # Services/Spending
    
    # Racial-attitude questions (readable names taken from the rename step)
#    'V085157',  # Affirmative Action  ( Should this be aapost_hire_x? )
    'V085143',  # renamed to RacialWorkWayUp
    'V085144',  # renamed to RacialGenerational
    'V085145',  # renamed to RacialDeserve
    'V085146',  # renamed to RacialTryHarder
]

# Restrict the raw frame to the selected columns only.
df_raw = df_rawest[good_columns]

## Clean the data
---

In [5]:
def convert_to_int(s):
    """Turn ANES data entry into an integer.

    Entries look like ``"5. Against ..."`` or ``"-2. No Post-election IW"``;
    the numeric code is whatever precedes the first ``.``.

    Parameters
    ----------
    s : str or any
        A single raw cell value.

    Returns
    -------
    int
        The leading integer code, when ``s`` is a string that starts with one.
    float
        ``np.nan`` when ``s`` is a string whose prefix is not an integer; a
        warning naming the offending value is emitted first.
    same as input
        ``s`` itself when it has no ``partition`` method (i.e. it is not a
        string), so already-numeric or missing cells pass through unchanged.

    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    """
    # Imported here because the notebook's import cell does not bring
    # `warnings` into scope; without it the ValueError branch below raised
    # NameError instead of warning and returning NaN.
    import warnings

    try:
        return int(s.partition('.')[0])
    except ValueError:
        # A string, but not one that starts with an integer code.
        warnings.warn("Couldn't convert: "+s)
        return np.nan
    except AttributeError:
        # Not a string at all: leave the value as it is.
        return s

def negative_to_nan(value):
    """Replace a negative code with NaN; return anything else unchanged.

    Per the original note, ANES uses negative numbers for non-answers
    (for example, a question that does not apply to the respondent).
    A value that fails the ``>= 0`` comparison -- which includes NaN
    itself -- comes back as ``np.nan``.
    """
    if value >= 0:
        return value
    return np.nan

def lib1_cons2_neutral3(x):
    """Recode a three-way item so that the neutral answer lands on zero.

    A value of 1 is returned as is; every other value is shifted down by 3,
    so 2 becomes -1 and 3 becomes 0.  (Going by the function name, 1 is the
    liberal answer, 2 the conservative one and 3 the neutral one.)
    """
    if x == 1:
        return x
    return x - 3

def liblow_conshigh(x):
    """Flip the sign of a response code, reversing the direction of its scale.

    Intended (per the original docstring) for questions where the liberal
    response is the low one.
    """
    flipped = -x
    return flipped

def dem_edu_special_treatment(x):
    """Blank out the code 95 ("Other") and any negative code.

    Returns ``np.nan`` for 95 and for values below zero; every other value
    is handed back untouched.
    """
    is_other = x == 95
    if is_other or x < 0:
        return np.nan
    return x

# Two element-wise passes over the selected columns:
#   1. strip each labelled entry down to its integer code;
#   2. turn the negative (non-answer) codes into NaN.
df = df_raw.applymap(convert_to_int).applymap(negative_to_nan)


# Readable names, listed in exactly the same order as `good_columns`.
# An entry commented out here must also be commented out there (and vice
# versa), because the two lists are paired by position.
readable_names = [
    "PartyID",

    "Abortion",
    "MoralRelativism",
    "NewerLifestyles",
    "MoralTolerance",
    "TraditionalFamilies",
    "GayJobDiscrimination",
    "GayAdoption",
    "GayMilitaryService",

    "NationalHealthInsurance",
    "StandardOfLiving",
    "ServicesVsSpending",

#    "AffirmativeAction",
    "RacialWorkWayUp",
    "RacialGenerational",
    "RacialDeserve",
    "RacialTryHarder",
]

# zip() stops at the shorter list without complaint, which would silently
# leave columns unrenamed or misnamed -- fail loudly instead.
assert len(readable_names) == len(good_columns), (
    "good_columns and readable_names are out of sync"
)

# Rebind rather than rename in place, so the step reads as a plain
# transformation producing a new frame.
df = df.rename(columns=dict(zip(good_columns, readable_names)))

# Abortion: keep only response codes 1-4 (anything else becomes missing),
# then flip the sign so the scale runs the other way.
df["Abortion"] = -df["Abortion"].where(df["Abortion"].isin([1, 2, 3, 4]))

# Items whose sign is flipped so their scale is reversed.
# NOTE(review): presumably this makes every item point the same ideological
# way -- confirm the direction of each question against the ANES codebook.
flipped_columns = [
    "NewerLifestyles",
    "TraditionalFamilies",

    "ServicesVsSpending",

    "RacialTryHarder",
    "RacialWorkWayUp",
]
# Vectorized negation; missing values stay missing.
df[flipped_columns] = -df[flipped_columns]

In [6]:
# Tell whoever runs this notebook which name holds the cleaned frame.
print("Variables now available: df")

Variables now available: df


In [7]:
# Raw party-identification entries, for comparison with the cleaned
# PartyID column in the next cell.
df_rawest["V083098x"].value_counts()

0. Strong Democrat (1;1;-1)                       580
1. Weak Democrat (1;5;-1)                         393
2. Independent-Democrat (3,4,5,-8;-1;5)           392
3. Independent-Independent (3,4,5,-8;-1;3)        264
6. Strong Republican (2;1;-1)                     230
4. Independent-Republican (3,4,5,-8;-1;1)         223
5. Weak Republican (2;5;-1)                       200
-1. INAP, -9 in J1; -8,-9 in J1a; -8,-9 in J1b     40
Name: V083098x, dtype: int64

In [8]:
# Cleaned party identification: the counts should match the raw tally
# above, except that the negative (non-answer) code is now missing.
df["PartyID"].value_counts()

0.0    580
1.0    393
2.0    392
3.0    264
6.0    230
4.0    223
5.0    200
Name: PartyID, dtype: int64

In [9]:
# Summary statistics for every cleaned column; a quick check on each
# column's range and sign after the recoding above.
df.describe()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayAdoption,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
count,2282.0,1031.0,2096.0,2091.0,2090.0,2093.0,2223.0,2240.0,2228.0,1044.0,1024.0,953.0,2077.0,2088.0,2084.0,2079.0
mean,2.29667,-2.805044,2.857347,-2.370636,2.369378,-1.986622,2.19928,2.996429,1.997307,3.456897,3.961914,-4.658972,-2.21377,2.967433,3.248081,-2.517557
std,2.002646,1.128541,1.46988,1.244271,1.20372,1.109585,1.555849,2.000443,1.454285,2.026919,1.946377,1.686004,1.223385,1.36662,1.280741,1.283101
min,0.0,-4.0,1.0,-5.0,1.0,-5.0,1.0,1.0,1.0,1.0,1.0,-7.0,-5.0,1.0,1.0,-5.0
25%,0.0,-4.0,2.0,-3.0,1.0,-3.0,1.0,1.0,1.0,1.0,2.0,-6.0,-3.0,2.0,2.0,-4.0
50%,2.0,-3.0,2.0,-2.0,2.0,-2.0,1.0,1.0,1.0,3.0,4.0,-5.0,-2.0,3.0,3.0,-2.0
75%,4.0,-2.0,4.0,-1.0,3.0,-1.0,4.0,5.0,2.0,5.0,5.0,-4.0,-1.0,4.0,4.0,-1.0
max,6.0,-1.0,5.0,-1.0,5.0,-1.0,5.0,5.0,5.0,7.0,7.0,-1.0,-1.0,5.0,5.0,-1.0


In [10]:
# Peek at the first five cleaned rows.
df.head(5)

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayAdoption,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
0,4.0,-4.0,5.0,-4.0,1.0,-3.0,1.0,1.0,1.0,7.0,7.0,-1.0,-1.0,5.0,3.0,-1.0
1,0.0,,,,,,4.0,5.0,2.0,5.0,,-5.0,,,,
2,5.0,-2.0,5.0,-1.0,2.0,-1.0,1.0,5.0,1.0,7.0,5.0,-5.0,-3.0,1.0,3.0,-2.0
3,3.0,-3.0,2.0,-2.0,3.0,-3.0,2.0,5.0,2.0,7.0,5.0,-4.0,-2.0,2.0,4.0,-2.0
4,6.0,-2.0,4.0,-1.0,4.0,-2.0,5.0,5.0,5.0,4.0,,-3.0,-2.0,2.0,3.0,-4.0


In [11]:
# Persist the cleaned frame (row index included) for downstream notebooks.
df.to_csv(path_or_buf="../data/processed/2008.csv")