# Load and preprocess 1996 data

For now, the goal is to explore the features of a single year (1996). Other years will be examined over time.

---

In [1]:
%pylab --no-import-all inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, make sure you have saved your own copy of the data at `../data/interim/1996data.dta` (the path used in the cell below), then re-run the cell.

In [2]:
# Relative path to the 1996 ANES extract in Stata (.dta) format.
file = "../data/interim/1996data.dta"
# Load the complete raw file; the columns of interest are selected in a later cell.
df_rawest = pd.read_stata(file)

In [3]:
# ANES variable ids to keep, grouped by topic. The trailing comment on each
# line is the short label of the survey item.
good_columns = [
    'V960420',  # Party Identification

    'V960503',  # Abortion
    'V961248',  # Moral Relativism
    'V961247',  # "Newer Lifestyles"
    'V961250',  # Moral Tolerance
    'V961249',  # Traditional Families
    'V961194',  # Gay Job Discrimination
    'V961196',  # Gay and Armed Forces

    'V960479',  # National Health Insurance
    'V960483',  # Guaranteed Job
    'V960450',  # Services/Spending

    'V961268',  # Affirmative Action
    # The racial-resentment items below are disabled.
    # NOTE(review): their ids carry a V04 prefix rather than the V96 prefix used
    # by every active entry -- presumably left over from another year's
    # notebook; confirm the 1996 ids before re-enabling them.
#    'V045193',  # Racial Resentment 1
#    'V045194',  # Racial Resentment 2
#    'V045195',  # Racial Resentment 3
#    'V045196',  # Racial Resentment 4
]
# Restrict the raw frame to the selected variables, in the order listed above.
df_raw = df_rawest[good_columns]

## Clean the data
---

In [4]:
def convert_to_int(s):
    """Turn ANES data entry into an integer.

    Entries look like ``"<code>. <label>"``; the text before the first
    ``'.'`` is parsed as the integer code.

    Returns the parsed integer. If the leading text is not an integer, a
    warning is issued and ``np.nan`` is returned. If ``s`` has no
    ``partition`` method (e.g. a float used for a missing cell), ``s`` is
    returned unchanged.

    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    """
    # Imported here because the notebook's import cell never brings in
    # `warnings`; without it the ValueError branch below raised NameError
    # instead of warning and returning NaN.
    import warnings

    try:
        return int(s.partition('.')[0])
    except ValueError:
        # The code prefix is not numeric: report it and treat as missing.
        warnings.warn("Couldn't convert: " + s)
        return np.nan
    except AttributeError:
        # Not a string-like value; pass it through untouched.
        return s
    
def not_informative_to_nan(x):
    """Map ANES non-answer codes to a missing value.

    The codes 0, 8 and 9 stand for various non-answers (for example, a
    question that does not pertain to the respondent); those become
    ``np.nan``. Any other value is handed back untouched.
    """
    non_answer_codes = {0, 8, 9}
    if x in non_answer_codes:
        return np.nan
    return x

# Reduce every raw entry to its leading integer code and give the columns
# readable names in one pass.
# NOTE: zip() stops at the shorter input, so only as many names are used as
# there are entries in good_columns. The trailing Racial* names line up with
# the racial-resentment variables that are commented out of good_columns.
df = df_raw.applymap(convert_to_int).rename(columns=dict(zip(
    good_columns,
    [
        "PartyID",

        "Abortion",
        "MoralRelativism",
        "NewerLifestyles",
        "MoralTolerance",
        "TraditionalFamilies",
        "GayJobDiscrimination",
        "GayMilitaryService",

        "NationalHealthInsurance",
        "StandardOfLiving",
        "ServicesVsSpending",

        "AffirmativeAction",
        "RacialWorkWayUp",
        "RacialDeserve",
        "RacialTryHarder",
        "RacialGenerational",
    ],
)))

# 0, 8 and 9 are non-answers everywhere except PartyID, where 0 is a real
# category, so PartyID is left out of this pass.
non_pid_columns = list(df.columns)
non_pid_columns.remove('PartyID')
df[non_pid_columns] = df[non_pid_columns].applymap(not_informative_to_nan)

# Code so that liberal is lower numbers

# PartyID: keep 0-6; 7 (other/minor party), 8 (apolitical) and 9 (NA) become
# missing.
df.loc[:, 'PartyID'] = df['PartyID'].apply(
    lambda code: np.nan if code >= 7 else code
)

# Abortion: code 7 is dropped in addition to the generic non-answers, then the
# scale is reversed.
df.loc[:, 'Abortion'] = df['Abortion'].apply(
    lambda code: np.nan if code in {7, 8, 9, 0} else -code
)

# Reverse the sign of the remaining scales that are flipped in this notebook.
# NOTE(review): which raw direction counts as "liberal" for each item is taken
# on trust from the original cell, whose inline notes did not match the column
# names -- verify against the ANES codebook.
for flipped in ('NewerLifestyles', 'TraditionalFamilies', 'ServicesVsSpending'):
    df.loc[:, flipped] = df[flipped].apply(lambda code: -code)

In [5]:
# Announce the name this notebook leaves behind.
# NOTE(review): presumably meant for notebooks that run this one -- confirm.
print("Variables now available: df")

Variables now available: df


In [6]:
# Raw distribution of the party-identification variable, for comparison with
# the cleaned PartyID counts in the next cell.
df_rawest.V960420.value_counts()

1. Weak Democrat (1,5/8/9,0 in K1, K1a/b     334
0. Strong Democrat (1,1,0 in K1, K1a/b,      329
5. Weak Republican (2,5/8/9,0 in K1, K1a     257
2. Independent-Democrat (3/4/5,0,5 in K1     233
6. Strong Republican (2,1,0 in K1, K1a/b     214
4. Independent-Republican (3/4/5,0,1 in      183
3. Independent-Independent (3,0,3/8/9 in     145
8. Apolitical (5,0,3/8/9 in K1, K1a/b, K      14
9. NA (8/9,0,0 in K1, K1a/b, K1c)              4
7. Other; minor party; refuses to say (4       1
Name: V960420, dtype: int64

In [7]:
# Cleaned counts: codes 7, 8 and 9 were set to NaN during cleaning, and
# value_counts() leaves NaN out by default.
df.PartyID.value_counts()

1.0    334
0.0    329
5.0    257
2.0    233
6.0    214
4.0    183
3.0    145
Name: PartyID, dtype: int64

In [10]:
# Summary statistics per cleaned column; all-negative ranges mark the columns
# whose sign was flipped during cleaning.
df.describe()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction
count,1695.0,1679.0,1526.0,1519.0,1522.0,1524.0,1436.0,1480.0,1536.0,1553.0,1466.0,1412.0
mean,2.676106,-2.889815,3.292267,-2.221198,2.621551,-1.729659,2.552925,2.42973,3.965495,4.462331,-3.892224,2.934136
std,2.102619,1.08808,1.336413,1.1603,1.211389,0.937516,1.639023,1.651662,1.877897,1.73726,1.514204,1.737336
min,0.0,-4.0,1.0,-5.0,1.0,-5.0,1.0,1.0,1.0,1.0,-7.0,1.0
25%,1.0,-4.0,2.0,-3.0,2.0,-2.0,1.0,1.0,2.0,3.0,-5.0,1.0
50%,2.0,-3.0,4.0,-2.0,2.0,-1.0,2.0,2.0,4.0,5.0,-4.0,2.0
75%,5.0,-2.0,4.0,-1.0,4.0,-1.0,4.0,4.0,5.0,6.0,-3.0,5.0
max,6.0,-1.0,5.0,-1.0,5.0,-1.0,5.0,5.0,7.0,7.0,-1.0,5.0


In [8]:
# Peek at the first rows of the cleaned frame.
df.head()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayMilitaryService,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction
0,3.0,-2.0,5.0,-1.0,2.0,-2.0,1.0,5.0,,5.0,-3.0,1.0
1,2.0,-4.0,1.0,-1.0,1.0,-1.0,1.0,2.0,4.0,4.0,-4.0,4.0
2,1.0,-4.0,2.0,-3.0,3.0,-2.0,4.0,4.0,3.0,3.0,-4.0,4.0
3,1.0,-2.0,2.0,-2.0,2.0,-2.0,,4.0,3.0,4.0,-5.0,2.0
4,3.0,-4.0,2.0,-4.0,3.0,-3.0,2.0,1.0,7.0,6.0,-2.0,5.0


In [9]:
# Persist the cleaned frame. The row index is written as well (pandas'
# default index=True), so readers should load the file with index_col=0.
df.to_csv("../data/processed/1996.csv")