In [1]:
%pylab --no-import-all inline

Populating the interactive namespace from numpy and matplotlib


# Load and preprocess 1976 data

Time to start looking at other years!

---

In [2]:
import pandas as pd

## Load the data.

---

If this fails, make sure you have saved your own copy of the data at the path used in the next cell (`../data/interim/1976data.dta`), then retry.

In [3]:
# Relative path to the 1976 ANES data in Stata (.dta) format.
file = "../data/interim/1976data.dta"  
# Matt Wilson converted the older Stata file to the one we use.
# Read the whole file into a DataFrame; everything below is derived from it.
df_rawest = pd.read_stata(file)

In [4]:
# ANES 1976 variable codes to keep. The order matters: the rename step further
# down zips this list against a list of readable names, position by position.
good_columns = [
    # Demographic
    'V763174',  # SUMMARY-R'S PARTY ID (0: strong Dem ... 6: strong Rep; 7: other/minor party, 8: apolitical, 9: NA)
    
    'V763796',  # OPIN:WHEN ALLOW ABORTION (1: never ... 4: never forbid)

    'V763273',  # Private vs public insurance
    'V763241',  # GOVT GUAR JOB/S.L  (1: guarantee)
    'V763353',  # Gov't should spend less, even if cutting health and education. (1: agree, 5: disagree)

    'V763264',  # MNRTY GRP AID SCL (1: help)
#    'V763757',  # THE POOR ARE POOR BECAUSE THE AMERICAN WAY OF LIFE DOESN'T GIVE ALL PEOPLE AN EQUAL CHANCE? (1: agree)
]
# Narrow the raw frame down to just the selected variables.
df_raw = df_rawest[good_columns]

In [5]:
def convert_to_int(s):
    """Turn ANES data entry into an integer.

    ANES entries look like ``"<code>. <label>"``; only the text before the
    first ``'.'`` is parsed.

    Parameters
    ----------
    s : str or any other object
        A raw cell value.

    Returns
    -------
    int
        The leading numeric code, when ``s`` is a string that starts with one.
    float
        ``np.nan`` (after emitting a warning) when ``s`` is a string whose
        leading part is not an integer.
    object
        ``s`` itself, unchanged, when it has no ``partition`` method
        (e.g. it is already a number or NaN).

    >>> convert_to_int("1. Govt should provide many fewer services")
    1
    >>> convert_to_int("2")
    2
    >>> convert_to_int(7)
    7
    """
    # Imported here because nothing else in the notebook brings `warnings`
    # into scope; without it the ValueError branch died with a NameError.
    import warnings

    try:
        return int(s.partition('.')[0])
    except ValueError:
        # A string, but with no leading integer: report it and treat as missing.
        warnings.warn("Couldn't convert: "+s)
        return np.nan
    except AttributeError:
        # Not a string at all (no .partition): pass it through untouched.
        return s


def not_informative_to_nan(x):
    """Map ANES non-answer codes to a missing value.

    The codes 0, 8 and 9 mark responses that carry no information
    (for example, the question did not apply to the respondent).
    Those become ``np.nan``; every other value is returned as-is.
    """
    non_answer_codes = {0, 8, 9}
    if x in non_answer_codes:
        return np.nan
    return x


# Reduce every raw ANES entry to its numeric code.
df = df_raw.applymap(convert_to_int)

# Replace the ANES variable codes with readable names. The names are paired
# with good_columns by position, so both lists must stay in the same order.
# Names not used for 1976: MoralRelativism, NewerLifestyles, MoralTolerance,
# TraditionalFamilies, GayJobDiscrimination, GayMilitaryService,
# RacialResentment1-4.
df = df.rename(columns=dict(zip(good_columns, [
    "PartyID",
    "Abortion",
    "NationalHealthInsurance",
    "StandardOfLiving",
    "ServicesVsSpending",
    "AffirmativeAction",
])))

# PartyID is kept out of the generic non-answer clean-up because its coding is
# different: there 0 is a real answer (strong Democrat), not a non-answer.
non_pid_columns = [col for col in df.columns if col != 'PartyID']
df[non_pid_columns] = df[non_pid_columns].applymap(not_informative_to_nan)

# Code so that liberal is lower numbers.

# PartyID: 7 (other/minor party), 8 (apolitical) and 9 (NA) become missing.
df.loc[:, 'PartyID'] = df['PartyID'].apply(
    lambda code: code if not code >= 7 else np.nan
)

# Abortion: drop the non-answers (plus 7, "other") and flip the sign, since the
# raw scale runs from 1 ("never allow") upwards.
df.loc[:, 'Abortion'] = df['Abortion'].apply(
    lambda code: -code if code not in {0, 7, 8, 9} else np.nan
)

# ServicesVsSpending: flip the sign so that "disagree" (5) sorts below "agree" (1).
df.loc[:, 'ServicesVsSpending'] = df['ServicesVsSpending'].apply(lambda code: -code)

In [6]:
# Eyeball the last few rows of the cleaned frame.
df.tail()

Unnamed: 0,PartyID,Abortion,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction
2243,5.0,,,,-5.0,3.0
2244,5.0,,3.0,7.0,-5.0,5.0
2245,5.0,-1.0,2.0,7.0,-5.0,6.0
2246,0.0,-4.0,1.0,4.0,-5.0,7.0
2247,3.0,-4.0,7.0,7.0,-5.0,6.0


In [7]:
# Report the name of the variable this notebook produces.
print("Variables now available: df")

Variables now available: df


In [8]:
# Sanity check: raw response counts for the abortion question, to compare with the recoded column.
df_rawest.V763796.value_counts()

2. PERMI IF HE DANG    832
4. NEVER FORBI ABOR    488
0. INAP.               339
3. FOR PERS REA ONL    297
1. ABORT SHLD NEVER    205
8. DON-T KNOW           42
7. OTHER                25
9. NOT ASCERTAINED      20
Name: V763796, dtype: int64

In [9]:
# Recoded abortion answers: codes 0, 7, 8 and 9 should be gone and 1-4 negated.
df.Abortion.value_counts()

-2.0    832
-4.0    488
-3.0    297
-1.0    205
Name: Abortion, dtype: int64

In [10]:
# Sanity check: raw response counts for party identification.
df_rawest.V763174.value_counts()

1. WEAK DEM 5 5 0      547
0. STRONG DEM 5 1 0    338
5. WEAK REP 1 5 0      327
3. IND-IND 2 0 3       315
2. IND-DEM 2 0 5       260
4. IND-REP 2 0 1       218
6. STRONG REP 1 1 0    208
8. APOLITICAL 3 0 3     20
7. OTH,MINOR PARTY,      9
9. NA,DK 8 0 0           6
Name: V763174, dtype: int64

In [11]:
# Cleaned party ID: codes 7-9 should be gone, while 0 (strong Democrat) is kept.
df.PartyID.value_counts()

1.0    547
0.0    338
5.0    327
3.0    315
2.0    260
4.0    218
6.0    208
Name: PartyID, dtype: int64

In [12]:
# Save the cleaned frame; with the default settings the row index is written as the first, unnamed column.
df.to_csv("../data/processed/1976.csv")

In [13]:
# Summary statistics of the cleaned columns (count excludes missing values).
df.describe()

Unnamed: 0,PartyID,Abortion,NationalHealthInsurance,StandardOfLiving,ServicesVsSpending,AffirmativeAction
count,2213.0,1822.0,1769.0,1790.0,2116.0,1851.0
mean,2.605965,-2.586169,3.983041,4.43352,-4.10586,4.290654
std,1.965388,1.001912,2.366992,2.006361,1.666849,1.983667
min,0.0,-4.0,1.0,1.0,-5.0,1.0
25%,1.0,-4.0,1.0,3.0,-5.0,3.0
50%,2.0,-2.0,4.0,4.0,-5.0,4.0
75%,4.0,-2.0,7.0,6.0,-5.0,6.0
max,6.0,-1.0,7.0,7.0,-1.0,7.0


In [19]:
# Raw counts behind ServicesVsSpending: only codes 1, 5, 8 and 9 occur, so this is an agree/disagree item.
df_rawest.V763353.value_counts()

5. DISAGREE            1643
1. AGREE                473
8. DK;DEPENDS;CAN-T      93
9. NOT ASCERTAINED       39
Name: V763353, dtype: int64