# Load and preprocess 2016 data

Over time we will look at other years as well. For now, the goal is to explore the features of a single year (2016).

---

In [2]:
%pylab --no-import-all inline
import warnings

import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## Load the data.

---

If this fails, make sure you have saved your own copy of the data at the path used in the cell below (`../data/interim/2016data.dta`, relative to this notebook), then retry.

In [3]:
# Relative path of the Stata (.dta) file that holds the 2016 data.
file = "../data/interim/2016data.dta"
# Read the whole file into a DataFrame; later cells select columns from it.
df_rawest = pd.read_stata(file)

In [5]:
# Raw variable codes to keep, grouped by the topic each one covers.
good_columns = [
    # Church attendance
    'V161244',
    'V161245',
    'V161245a',
    # Education
    'V161270',
    # Gender
    'V161342',
    # Age
    'V161267',
    # Income
    'V161361x',
    # Race
    # 'V161310',  # KeyError: "['V161310'] not in index"
    # NOTE(review): race is excluded because the code above is not a column of
    # the loaded file; the intended variable may carry a suffix (possibly
    # 'V161310x') -- TODO confirm against the codebook before enabling.

]

# Restrict the raw frame to the selected columns only.
df_raw = df_rawest[good_columns]

In [6]:
# Summarise the still-uncleaned columns (counts, unique values, most frequent entry).
df_raw.describe()

Unnamed: 0,V161244,V161245,V161245a,V161270,V161342,V161267,V161361x
count,4271,4271,4271.0,4271,4271,4271,4271
unique,4,7,4.0,19,4,75,30
top,1. Yes,"-1. INAP, 2,-8,-9 in V161244",-1.0,"13. Bachelor's degree (for example: BA, AB, BS)",2. Female,-9. RF (year of birth),"01. Under $5,000"
freq,2552,1719,3482.0,955,2232,120,276


## Clean the data
---

In [7]:
def convert_to_int(s):
    """Turn an ANES data entry into an integer.

    String entries are expected to look like ``"<code>. <label>"`` (or just
    ``"<code>"``); only the text before the first ``.`` is parsed.

    Parameters
    ----------
    s : object
        A single raw cell value.

    Returns
    -------
    int
        The parsed code, when ``s`` is a string whose leading part is an
        integer.
    float
        ``np.nan`` when ``s`` is a string that cannot be parsed; a warning
        naming the offending value is emitted in that case.
    object
        ``s`` itself, unchanged, when it has no ``partition`` method
        (e.g. values that are already numeric).

    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    >>> convert_to_int(-1.0)
    -1.0
    """
    try:
        code_text = s.partition('.')[0]
    except AttributeError:
        # Not a string (no ``partition``): leave the value untouched.
        return s

    try:
        return int(code_text)
    except ValueError:
        # Relies on the module-level ``import warnings``; without it this
        # branch raised NameError instead of degrading to NaN.
        warnings.warn("Couldn't convert: " + s)
        return np.nan

def negative_to_nan(value):
    """Replace a negative code with a missing value.

    ANES marks several kinds of non-answer (for example, a question
    that does not apply to the respondent) with negative numbers.
    Anything that is not ``>= 0`` comes back as ``np.nan``; every
    other value is returned as-is.
    """
    if value >= 0:
        return value
    return np.nan

def lib1_cons2_neutral3(x):
    """Recode answers for questions whose neutral option is coded 3.

    A value equal to 1 is returned unchanged; any other value is
    shifted down by 3 (so 2 becomes -1 and 3 becomes 0).
    """
    if x != 1:
        return -3 + x
    return x

def liblow_conshigh(x):
    """Flip the sign of an answer.

    Intended for questions where the liberal response has the low code,
    so that the ordering of the scale is reversed.
    """
    flipped = -x
    return flipped

# Two element-wise passes over the selected columns:
#   1. parse each entry's leading numeric code,
#   2. turn negative (non-answer) codes into NaN.
df = (
    df_raw
    .applymap(convert_to_int)
    .applymap(negative_to_nan)
)

# df.rename(inplace=True, columns=dict(zip(
#     good_columns,
#     ["PartyID",
    
#     "Abortion",
#     "MoralRelativism",
#     "NewerLifestyles",
#     "MoralTolerance",
#     "TraditionalFamilies",
#     "GayJobDiscrimination",
#     "GayAdoption",

#     "NationalHealthInsurance",
#     "StandardOfLiving",
#     "ServicesVsSpending",

#     "AffirmativeAction",
#     "RacialWorkWayUp",
#     "RacialGenerational",
#     "RacialDeserve",
#     "RacialTryHarder",
#     ]
# )))

# df.PartyID = df.PartyID.apply(lambda x: np.nan if x == 99 else x)
# df.Abortion = df.Abortion.apply(lambda x: np.nan if x not in {1, 2, 3, 4} else -x)

# df.loc[:, 'ServicesVsSpending'] = df.ServicesVsSpending.apply(lambda x: x if x != 99 else np.nan)
# df.loc[:, 'NationalHealthInsurance'] = df.NationalHealthInsurance.apply(lambda x: x if x != 99 else np.nan)
# df.loc[:, 'StandardOfLiving'] = df.StandardOfLiving.apply(lambda x: x if x != 99 else np.nan)


# df.loc[:, 'NewerLifestyles'] = df.NewerLifestyles.apply(lambda x: -x)  # Tolerance. 1: tolerance, 7: not
# df.loc[:, 'TraditionalFamilies'] = df.TraditionalFamilies.apply(lambda x: -x)  # 1: moral relativism, 5: no relativism

# df.loc[:, 'ServicesVsSpending'] = df.ServicesVsSpending.apply(lambda x: -x)  # Gov't insurance?

# df.loc[:, 'RacialTryHarder'] = df.RacialTryHarder.apply(lambda x: -x)  # Racial support
# df.loc[:, 'RacialWorkWayUp'] = df.RacialWorkWayUp.apply(lambda x: -x)  # Systemic factors?

In [8]:
# Numeric summary of the cleaned frame; `count` excludes the entries that were turned into NaN.
df.describe()

Unnamed: 0,V161244,V161245,V161245a,V161270,V161342,V161267,V161361x
count,4252.0,2549.0,788.0,4256.0,4230.0,4150.0,4069.0
mean,1.399812,2.485288,1.459391,11.735902,1.532861,49.575663,15.386827
std,0.489917,1.239825,0.498665,7.210176,0.504164,17.580882,8.080488
min,1.0,1.0,1.0,1.0,1.0,18.0,1.0
25%,1.0,1.0,1.0,9.0,1.0,34.0,9.0
50%,1.0,2.0,1.0,11.0,2.0,50.0,16.0
75%,2.0,4.0,2.0,13.0,2.0,63.0,22.0
max,2.0,5.0,2.0,95.0,3.0,90.0,28.0


In [5]:
# Announce which variable downstream cells should use.
print("Variables now available: df")

Variables now available: df


In [9]:
# Tally the raw V161158x (party identification) labels before any cleaning.
df_rawest.V161158x.value_counts()

1. Strong Democrat                              890
7. Strong Republican                            721
4. Independent                                  579
2. Not very strong Democract                    560
6. Not very strong Republican                   508
5. Independent-Republican                       500
3. Independent-Democrat                         490
-9.0                                             12
-8. DK (-8) in V161156 or V161157 (FTF only)     11
Name: V161158x, dtype: int64

In [7]:
# `df` only holds the demographic columns listed in `good_columns`, and the
# rename that used to create `PartyID` is commented out, so `df.PartyID`
# raises AttributeError on a fresh kernel. Derive the cleaned party-ID series
# directly from the raw V161158x column with the same two cleaning passes
# that build `df`.
(
    df_rawest[["V161158x"]]
    .applymap(convert_to_int)
    .applymap(negative_to_nan)
    ["V161158x"]
    .rename("PartyID")
    .value_counts()
)

1.0    890
7.0    721
4.0    579
2.0    560
6.0    508
5.0    500
3.0    490
Name: PartyID, dtype: int64

In [8]:
# NOTE(review): the saved output under this cell lists columns (PartyID,
# Abortion, ...) that the `df` built earlier in this notebook does not
# contain; it looks like a leftover from an earlier run -- re-execute to refresh.
df.describe()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayAdoption,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
count,4248.0,4208.0,3640.0,3634.0,3632.0,3635.0,4186.0,4166.0,3766.0,3774.0,3628.0,3546.0,3630.0,3635.0,3631.0,3627.0
mean,3.858522,-2.935361,2.978571,-2.729224,2.413546,-2.34663,1.616579,1.268363,4.06718,4.287228,-3.899118,3.9185,-2.536088,3.000275,3.262462,-2.968018
std,2.15242,1.112457,1.386775,1.339398,1.186639,1.269563,1.020414,0.443161,2.046862,1.810872,1.701706,1.46202,1.388133,1.428251,1.318325,1.363811
min,1.0,-4.0,1.0,-5.0,1.0,-5.0,1.0,1.0,1.0,1.0,-7.0,1.0,-5.0,1.0,1.0,-5.0
25%,2.0,-4.0,2.0,-4.0,2.0,-3.0,1.0,1.0,2.0,3.0,-5.0,4.0,-4.0,2.0,2.0,-4.0
50%,4.0,-3.0,3.0,-2.0,2.0,-2.0,1.0,1.0,4.0,4.0,-4.0,5.0,-2.0,3.0,3.0,-3.0
75%,6.0,-2.0,4.0,-2.0,3.0,-1.0,2.0,2.0,6.0,6.0,-3.0,5.0,-1.0,4.0,4.0,-2.0
max,7.0,-1.0,5.0,-1.0,5.0,-1.0,4.0,2.0,7.0,7.0,-1.0,5.0,-1.0,5.0,5.0,-1.0


In [9]:
# Peek at the first rows of the cleaned frame.
# NOTE(review): the saved output under this cell shows columns that the
# current `df` does not contain -- re-execute to refresh.
df.head()

Unnamed: 0,PartyID,Abortion,MoralRelativism,NewerLifestyles,MoralTolerance,TraditionalFamilies,GayJobDiscrimination,GayAdoption,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction,RacialWorkWayUp,RacialGenerational,RacialDeserve,RacialTryHarder
0,7.0,-2.0,4.0,-1.0,4.0,-2.0,4.0,2.0,6.0,7.0,,5.0,-1.0,5.0,5.0,-1.0
1,6.0,-4.0,1.0,-3.0,1.0,-3.0,,2.0,1.0,4.0,-4.0,5.0,-3.0,2.0,4.0,-3.0
2,3.0,-2.0,2.0,-1.0,2.0,-3.0,1.0,1.0,6.0,7.0,,5.0,-1.0,5.0,5.0,-1.0
3,5.0,-3.0,4.0,-2.0,2.0,-1.0,1.0,2.0,6.0,5.0,-3.0,5.0,-2.0,4.0,4.0,-2.0
4,3.0,-4.0,2.0,-2.0,3.0,-2.0,4.0,1.0,1.0,7.0,,,-2.0,3.0,4.0,-3.0


In [10]:
# Persist the cleaned frame for downstream notebooks.
df.to_csv("../data/processed/2016_demography.csv")

In [13]:
# Save the raw V160102 column on its own (presumably the survey weights,
# going by the output filename -- TODO confirm against the codebook).
df_rawest.V160102.to_csv("../data/processed/2016_weights.csv")