# 1. Import libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create path
path = r'/Users/dana/Documents/Pew Research Reading Project/'

# import data
df19 = pd.read_spss(os.path.join(path,'02 Data','Original Data','January 8-February 7, 2019 - Core Trends Survey - SPSS.sav'))
df18 = pd.read_spss(os.path.join(path,'02 Data','Original Data','January 3-10, 2018 - Core Trends Survey - SPSS.sav'))

# 2. Remove and rename columns

In [3]:
df19.shape

(1502, 74)

In [4]:
df18.shape

(2002, 70)

In [5]:
df19.columns

Index(['respid', 'sample', 'comp', 'int_date', 'lang', 'cregion', 'state',
       'density', 'sfips', 'usr', 'qs1', 'sex', 'eminuse', 'intmob', 'intfreq',
       'snsint2', 'home4nw', 'bbhome1', 'bbhome2', 'device1a', 'smart2', 'q20',
       'bbsmart1', 'bbsmart2', 'bbsmart3a', 'bbsmart3b', 'bbsmart3c',
       'bbsmart3d', 'bbsmart3e', 'bbsmart3f', 'bbsmart3foe@', 'bbsmart4',
       'web1a', 'web1b', 'web1c', 'web1d', 'web1e', 'web1f', 'web1g', 'web1h',
       'web1i', 'sns2a', 'sns2b', 'sns2c', 'sns2d', 'sns2e', 'device1b',
       'device1c', 'device1d', 'books1', 'books2a', 'books2b', 'books2c',
       'age', 'marital', 'educ2', 'emplnw', 'hisp', 'racem1', 'racem2',
       'racem3', 'racem4', 'racecmb', 'birth_hisp', 'inc', 'party', 'partyln',
       'hh1', 'hh3', 'ql1', 'ql1a', 'qc1', 'weight', 'cellweight'],
      dtype='object')

In [6]:
# get rid of unneeded columns
df19 = df19[['respid', 'int_date', 'lang', 'state','sex','books1', 'books2a', 'books2b',
       'books2c', 'age', 'marital', 'educ2', 'emplnw','hisp', 'racecmb', 'inc', 'party']]

In [7]:
# same for df18
df18 = df18[['respid', 'int_date', 'lang', 'state', 'sex','books1', 'books2a', 'books2b',
       'books2c', 'age', 'marital', 'educ2', 'emplnw', 'hisp', 'racecmb', 'inc', 'party']]

In [8]:
# rename columns
df19 = df19.rename(columns = {'respid':'response_id','lang':'language','books1':'number_of_books_read',
                          'books2a':'read_printed_books','books2b':'read_audiobooks','books2c':'read_e-books',
                          'educ2':'level_of_education','emplnw':'employment',
                         'hisp':'hispanic','racecmb':'race','inc':'income'})

In [9]:
df18 = df18.rename(columns = {'respid':'response_id','lang':'language','books1':'number_of_books_read',
                          'books2a':'read_printed_books','books2b':'read_audiobooks','books2c':'read_e-books',
                          'educ2':'level_of_education','emplnw':'employment',
                         'hisp':'hispanic','racecmb':'race','inc':'income'})

In [10]:
df19.head()

Unnamed: 0,response_id,int_date,language,state,sex,number_of_books_read,read_printed_books,read_audiobooks,read_e-books,age,marital,level_of_education,employment,hispanic,race,income,party
0,4.0,190108.0,English,NC,Female,,,,,75.0,Married,High school graduate (Grade 12 with diploma or...,Not employed for pay,No,Black or African-American,"20 to under $30,000",Democrat
1,7.0,190108.0,English,OH,Male,,,,,56.0,Married,"Some college, no degree (includes some communi...",Employed full-time,No,White,"100 to under $150,000, OR",Republican
2,9.0,190108.0,English,NJ,Female,3.0,Yes,No,No,65.0,Married,Four year college or university degree/Bachelo...,Retired,No,White,"75 to under $100,000",Republican
3,10.0,190108.0,English,IN,Female,20.0,Yes,No,Yes,67.0,Married,High school graduate (Grade 12 with diploma or...,Retired,No,Mixed Race,"50 to under $75,000",Independent
4,11.0,190114.0,English,MI,Male,20.0,Yes,No,Yes,64.0,Married,Two year associate degree from a college or un...,Employed full-time,No,White,"75 to under $100,000",Independent


In [11]:
df18.head()

Unnamed: 0,response_id,int_date,language,state,sex,number_of_books_read,read_printed_books,read_audiobooks,read_e-books,age,marital,level_of_education,employment,hispanic,race,income,party
0,1.0,180103.0,English,PA,Female,1.0,Yes,No,No,33.0,Living with a partner,High school graduate (Grade 12 with diploma or...,Employed full-time,No,White,"50 to under $75,000",Democrat
1,2.0,180103.0,English,SC,Female,5.0,Yes,No,No,76.0,Married,(VOL) Don't know,Retired,No,White,"30 to under $40,000",Independent
2,3.0,180103.0,English,NJ,Female,0.0,,,,(VOL) Refused,Widowed,Two year associate degree from a college or un...,(VOL) Have own business/self-employed,No,White,"30 to under $40,000",Republican
3,4.0,180103.0,English,MD,Female,2.0,Yes,No,No,60.0,Living with a partner,Two year associate degree from a college or un...,(VOL) Other,No,White,"10 to under $20,000",Democrat
4,5.0,180103.0,English,NH,Male,6.0,Yes,No,Yes,55.0,Married,"Some college, no degree (includes some communi...",Employed full-time,No,White,"75 to under $100,000",Republican


# 3. Fix mixed types

In [12]:
# check for mixed types
for col in df19.columns.tolist():
  weird = (df19[[col]].applymap(type) != df19[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df19[weird]) > 0:
    print (col)

number_of_books_read
read_printed_books
read_audiobooks
read_e-books
age


In [13]:
# first change to string rather than cat so that I can change strings to integers
df19['number_of_books_read'] = df19['number_of_books_read'].astype('str')

In [14]:
# change "none" to 0
df19.loc[df19['number_of_books_read'] == 'None', 'number_of_books_read'] = 0

In [15]:
# change "97 or more" to 97
df19.loc[df19['number_of_books_read'] == '97 or more', 'number_of_books_read'] = 97

In [16]:
# change don't know and refused to code 99
df19.loc[df19['number_of_books_read'] == "(VOL) Don't know", 'number_of_books_read'] = 99
df19.loc[df19['number_of_books_read'] == "(VOL) Refused", 'number_of_books_read'] = 99

In [17]:
# then change to integer
df19['number_of_books_read'] = df19['number_of_books_read'].astype('float')
df19['number_of_books_read'] = df19['number_of_books_read'].astype('int')

In [18]:
# make subset of under 99
dfunder = df19[df19['number_of_books_read']<99]

In [19]:
# find range
dfunder['number_of_books_read'].describe()

count    1475.000000
mean       12.985763
std        21.152748
min         0.000000
25%         1.000000
50%         5.000000
75%        15.000000
max        97.000000
Name: number_of_books_read, dtype: float64

In [20]:
# impute don't know and refused with interquartile range
df19.loc[df19['number_of_books_read'] == 99, 'number_of_books_read'] = np.random.randint(1,15,27)

In [21]:
# describe
df19['number_of_books_read'].describe()

count    1502.000000
mean       12.888149
std        20.981182
min         0.000000
25%         1.000000
50%         5.000000
75%        15.000000
max        97.000000
Name: number_of_books_read, dtype: float64

In [22]:
# do the same procedure to age column

In [23]:
# first change to string rather than cat so that I can change strings to integers
df19['age'] = df19['age'].astype('str')

In [24]:
# change don't know and refused to code 99
df19.loc[df19['age'] == "(VOL) Don't know", 'age'] = 99
df19.loc[df19['age'] == "(VOL) Refused", 'age'] = 99

In [25]:
# then change to integer
df19['age'] = df19['age'].astype('float')
df19['age'] = df19['age'].astype('int')

In [26]:
# make subset of under 99
dfage = df19[df19['age']<99]

In [27]:
# find range
dfage['age'].describe()

count    1446.000000
mean       50.942600
std        18.170439
min        18.000000
25%        35.000000
50%        52.000000
75%        65.000000
max        96.000000
Name: age, dtype: float64

In [28]:
# impute don't know and refused with interquartile range
df19.loc[df19['age'] == 99, 'age'] = np.random.randint(18,65,56)

In [29]:
df19['age'].describe()

count    1502.000000
mean       50.615180
std        18.077109
min        18.000000
25%        35.000000
50%        52.000000
75%        65.000000
max        96.000000
Name: age, dtype: float64

In [30]:
# check read printed books
df19['read_printed_books'].value_counts(dropna=False)

Yes                 1020
NaN                  358
No                   121
(VOL) Don't know       3
Name: read_printed_books, dtype: int64

In [31]:
# change don't know to No
df19.loc[df19['read_printed_books'] == "(VOL) Don't know", 'read_printed_books'] = 'No'

In [32]:
df19['read_printed_books'].value_counts(dropna=False)

Yes                 1020
NaN                  358
No                   124
(VOL) Don't know       0
Name: read_printed_books, dtype: int64

In [33]:
# do the same for audiobooks
df19['read_audiobooks'].value_counts(dropna=False)

No                  829
NaN                 358
Yes                 313
(VOL) Don't know      2
Name: read_audiobooks, dtype: int64

In [34]:
# change don't know to No
df19.loc[df19['read_audiobooks'] == "(VOL) Don't know", 'read_audiobooks'] = 'No'

In [35]:
df19['read_audiobooks'].value_counts(dropna=False)

No                  831
NaN                 358
Yes                 313
(VOL) Don't know      0
Name: read_audiobooks, dtype: int64

In [36]:
# do the same for e-books
df19['read_e-books'].value_counts(dropna=False)

No                  724
Yes                 418
NaN                 358
(VOL) Don't know      2
Name: read_e-books, dtype: int64

In [37]:
# change don't know to No
df19.loc[df19['read_e-books'] == "(VOL) Don't know", 'read_e-books'] = 'No'

In [38]:
df19['read_e-books'].value_counts(dropna=False)

No                  726
Yes                 418
NaN                 358
(VOL) Don't know      0
Name: read_e-books, dtype: int64

In [39]:
# check how many non-readers there are
df19['number_of_books_read'].value_counts()

0     351
2     117
3     105
1      87
10     77
4      72
5      72
6      70
12     68
20     63
15     56
30     44
8      38
25     33
50     33
97     26
96     24
7      23
24     21
40     18
13     13
14     11
60     10
9       8
11      8
36      6
45      6
80      6
48      4
35      4
22      3
18      3
52      3
16      3
75      2
23      2
88      2
90      1
70      1
27      1
17      1
26      1
84      1
66      1
95      1
72      1
34      1
Name: number_of_books_read, dtype: int64

In [40]:
# it's about equal to nulls so change nulls to No
df19['read_printed_books'].fillna('No',inplace = True)
df19['read_audiobooks'].fillna('No',inplace = True)
df19['read_e-books'].fillna('No',inplace = True)

In [41]:
# remove unused categories
df19['read_printed_books'] = df19['read_printed_books'].cat.remove_unused_categories()
df19['read_audiobooks'] = df19['read_audiobooks'].cat.remove_unused_categories()
df19['read_e-books'] = df19['read_e-books'].cat.remove_unused_categories()

In [42]:
# check again for mixed types
for col in df19.columns.tolist():
  weird = (df19[[col]].applymap(type) != df19[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df19[weird]) > 0:
    print (col)

In [43]:
# do the same as above to the 2018 df

In [44]:
# check data types
df18.dtypes

response_id              float64
int_date                 float64
language                category
state                   category
sex                     category
number_of_books_read    category
read_printed_books      category
read_audiobooks         category
read_e-books            category
age                     category
marital                 category
level_of_education      category
employment              category
hispanic                category
race                    category
income                  category
party                   category
dtype: object

In [45]:
# check for mixed types
for col in df18.columns.tolist():
  weird = (df18[[col]].applymap(type) != df18[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df18[weird]) > 0:
    print (col)

number_of_books_read
read_printed_books
read_audiobooks
read_e-books
age


In [46]:
df18['number_of_books_read'].value_counts()

0.0                 432
3.0                 168
2.0                 136
6.0                 117
12.0                116
4.0                 116
5.0                 107
10.0                105
1.0                  98
20.0                 79
15.0                 55
8.0                  50
97 or more           47
7.0                  42
30.0                 36
50.0                 32
96.0                 31
(VOL) Don't know     30
25.0                 29
24.0                 25
40.0                 21
(VOL) Refused        15
9.0                  10
35.0                  9
70.0                  9
36.0                  8
45.0                  7
60.0                  7
13.0                  6
18.0                  6
75.0                  5
26.0                  4
11.0                  4
14.0                  4
17.0                  4
16.0                  3
80.0                  3
32.0                  3
52.0                  3
23.0                  3
65.0                  2
55.0            

In [47]:
# first change to string rather than cat so that I can change strings to integers
df18['number_of_books_read'] = df18['number_of_books_read'].astype('str')

In [48]:
# change "none" to 0
df18.loc[df18['number_of_books_read'] == 'None', 'number_of_books_read'] = 0

In [49]:
# change "97 or more" to 97
df18.loc[df18['number_of_books_read'] == '97 or more', 'number_of_books_read'] = 97

In [50]:
# change don't know and refused to code 99
df18.loc[df18['number_of_books_read'] == "(VOL) Don't know", 'number_of_books_read'] = 99
df18.loc[df18['number_of_books_read'] == "(VOL) Refused", 'number_of_books_read'] = 99

In [51]:
# then change to integer
df18['number_of_books_read'] = df18['number_of_books_read'].astype('float')
df18['number_of_books_read'] = df18['number_of_books_read'].astype('int')

In [52]:
# make subset of under 99
dfunders = df18[df18['number_of_books_read']<99]

In [53]:
# find total rows
df18.shape

(2002, 17)

In [54]:
# find range
dfunders['number_of_books_read'].describe()

count    1957.000000
mean       12.772611
std        21.665115
min         0.000000
25%         1.000000
50%         5.000000
75%        12.000000
max        97.000000
Name: number_of_books_read, dtype: float64

In [55]:
# impute don't know and refused with interquartile range
df18.loc[df18['number_of_books_read'] == 99, 'number_of_books_read'] = np.random.randint(1,12,45)

In [56]:
# describe
df18['number_of_books_read'].describe()

count    2002.000000
mean       12.620380
std        21.448979
min         0.000000
25%         1.000000
50%         5.000000
75%        12.000000
max        97.000000
Name: number_of_books_read, dtype: float64

In [57]:
# do the same procedure to age column

In [58]:
# first change to string rather than cat so that I can change strings to integers
df18['age'] = df18['age'].astype('str')

In [59]:
# change don't know and refused to code 99, and "97 or older" to 97
df18.loc[df18['age'] == "(VOL) Don't know", 'age'] = 99
df18.loc[df18['age'] == "(VOL) Refused", 'age'] = 99
df18.loc[df18['age'] == "97 or older", 'age'] = 97

In [60]:
# then change to integer
df18['age'] = df18['age'].astype('float')
df18['age'] = df18['age'].astype('int')

In [61]:
# make subset of under 99
dfages = df18[df18['age']<99]

In [62]:
# find range
dfages['age'].describe()

count    1953.000000
mean       50.601639
std        18.717788
min        18.000000
25%        34.000000
50%        52.000000
75%        65.000000
max        97.000000
Name: age, dtype: float64

In [63]:
# impute don't know and refused with interquartile range
df18.loc[df18['age'] == 99, 'age'] = np.random.randint(18,65,49)

In [64]:
df18['age'].describe()

count    2002.000000
mean       50.391109
std        18.659376
min        18.000000
25%        34.000000
50%        52.000000
75%        65.000000
max        97.000000
Name: age, dtype: float64

In [65]:
# check read printed books
df18['read_printed_books'].value_counts(dropna=False)

Yes                 1385
NaN                  447
No                   164
(VOL) Don't know       5
(VOL) Refused          1
Name: read_printed_books, dtype: int64

In [66]:
# change don't know and refused to No
df18.loc[df18['read_printed_books'] == "(VOL) Don't know", 'read_printed_books'] = 'No'
df18.loc[df18['read_printed_books'] == "(VOL) Refused", 'read_printed_books'] = 'No'

In [67]:
df18['read_printed_books'].value_counts(dropna=False)

Yes                 1385
NaN                  447
No                   170
(VOL) Don't know       0
(VOL) Refused          0
Name: read_printed_books, dtype: int64

In [68]:
# do the same for audiobooks
df18['read_audiobooks'].value_counts(dropna=False)

No                  1165
NaN                  447
Yes                  386
(VOL) Don't know       3
(VOL) Refused          1
Name: read_audiobooks, dtype: int64

In [69]:
# change don't know and refused to No
df18.loc[df18['read_audiobooks'] == "(VOL) Don't know", 'read_audiobooks'] = 'No'
df18.loc[df18['read_audiobooks'] == "(VOL) Refused", 'read_audiobooks'] = 'No'

In [70]:
df18['read_audiobooks'].value_counts(dropna=False)

No                  1169
NaN                  447
Yes                  386
(VOL) Don't know       0
(VOL) Refused          0
Name: read_audiobooks, dtype: int64

In [71]:
# do the same for e-books
df18['read_e-books'].value_counts(dropna=False)

No                  1005
Yes                  538
NaN                  447
(VOL) Don't know       9
(VOL) Refused          3
Name: read_e-books, dtype: int64

In [72]:
# change don't know and refused to No
df18.loc[df18['read_e-books'] == "(VOL) Don't know", 'read_e-books'] = 'No'
df18.loc[df18['read_e-books'] == "(VOL) Refused", 'read_e-books'] = 'No'

In [73]:
df18['read_e-books'].value_counts(dropna=False)

No                  1017
Yes                  538
NaN                  447
(VOL) Don't know       0
(VOL) Refused          0
Name: read_e-books, dtype: int64

In [74]:
# check how many non-readers there are
df18['number_of_books_read'].value_counts()

0     432
3     172
2     142
4     120
6     118
12    116
5     112
10    108
1     101
20     79
8      59
15     55
97     47
7      45
30     36
50     32
96     31
25     29
24     25
40     21
9      12
70      9
11      9
35      9
36      8
45      7
60      7
13      6
18      6
75      5
17      4
26      4
14      4
80      3
52      3
23      3
16      3
32      3
22      2
55      2
65      2
90      2
19      1
85      1
39      1
73      1
28      1
84      1
66      1
54      1
88      1
Name: number_of_books_read, dtype: int64

In [75]:
# it's about equal to nulls so change nulls to No
df18['read_printed_books'].fillna('No',inplace = True)
df18['read_audiobooks'].fillna('No',inplace = True)
df18['read_e-books'].fillna('No',inplace = True)

In [76]:
# remove unused categories
df18['read_printed_books'] = df18['read_printed_books'].cat.remove_unused_categories()
df18['read_audiobooks'] = df18['read_audiobooks'].cat.remove_unused_categories()
df18['read_e-books'] = df18['read_e-books'].cat.remove_unused_categories()

In [77]:
# check again for mixed types
for col in df18.columns.tolist():
  weird = (df18[[col]].applymap(type) != df18[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df18[weird]) > 0:
    print (col)

# 4. Prepare dataframes for merging

In [78]:
# check if columns are the same
df18.columns

Index(['response_id', 'int_date', 'language', 'state', 'sex',
       'number_of_books_read', 'read_printed_books', 'read_audiobooks',
       'read_e-books', 'age', 'marital', 'level_of_education', 'employment',
       'hispanic', 'race', 'income', 'party'],
      dtype='object')

In [79]:
df19.columns

Index(['response_id', 'int_date', 'language', 'state', 'sex',
       'number_of_books_read', 'read_printed_books', 'read_audiobooks',
       'read_e-books', 'age', 'marital', 'level_of_education', 'employment',
       'hispanic', 'race', 'income', 'party'],
      dtype='object')

In [80]:
# check if datatypes are the same
df18.dtypes

response_id              float64
int_date                 float64
language                category
state                   category
sex                     category
number_of_books_read       int64
read_printed_books      category
read_audiobooks         category
read_e-books            category
age                        int64
marital                 category
level_of_education      category
employment              category
hispanic                category
race                    category
income                  category
party                   category
dtype: object

In [81]:
df19.dtypes

response_id              float64
int_date                 float64
language                category
state                   category
sex                     category
number_of_books_read       int64
read_printed_books      category
read_audiobooks         category
read_e-books            category
age                        int64
marital                 category
level_of_education      category
employment              category
hispanic                category
race                    category
income                  category
party                   category
dtype: object

In [82]:
# add year
df18['year']='2018'
df19['year']='2019'

In [83]:
# remove int_date
df18 = df18.drop(columns=['int_date'])
df19 = df19.drop(columns=['int_date'])

In [113]:
df18.head()

In [85]:
df19.head()

Unnamed: 0,response_id,language,state,sex,number_of_books_read,read_printed_books,read_audiobooks,read_e-books,age,marital,level_of_education,employment,hispanic,race,income,party,year
0,4.0,English,NC,Female,0,No,No,No,75,Married,High school graduate (Grade 12 with diploma or...,Not employed for pay,No,Black or African-American,"20 to under $30,000",Democrat,2019
1,7.0,English,OH,Male,0,No,No,No,56,Married,"Some college, no degree (includes some communi...",Employed full-time,No,White,"100 to under $150,000, OR",Republican,2019
2,9.0,English,NJ,Female,3,Yes,No,No,65,Married,Four year college or university degree/Bachelo...,Retired,No,White,"75 to under $100,000",Republican,2019
3,10.0,English,IN,Female,20,Yes,No,Yes,67,Married,High school graduate (Grade 12 with diploma or...,Retired,No,Mixed Race,"50 to under $75,000",Independent,2019
4,11.0,English,MI,Male,20,Yes,No,Yes,64,Married,Two year associate degree from a college or un...,Employed full-time,No,White,"75 to under $100,000",Independent,2019


In [86]:
# remove response id
df18 = df18.drop(columns=['response_id'])
df19 = df19.drop(columns=['response_id'])

In [87]:
# create new id based on index (+2002 so that ids will be different from 2018 when merged)
df18['id'] = df18.index + 1
df19['id'] = df19.index + 2003

In [88]:
df18['id'].duplicated().sum()

0

In [89]:
df19['id'].duplicated().sum()

0

In [90]:
df18['id'].describe()

count    2002.000000
mean     1001.500000
std       578.071939
min         1.000000
25%       501.250000
50%      1001.500000
75%      1501.750000
max      2002.000000
Name: id, dtype: float64

In [91]:
df19['id'].describe()

count    1502.000000
mean     2753.500000
std       433.734366
min      2003.000000
25%      2378.250000
50%      2753.500000
75%      3128.750000
max      3504.000000
Name: id, dtype: float64

# 5. Concatenate

In [94]:
# use concat because the data has the same columns, just different time periods
dfconcat = pd.concat([df18, df19])

In [93]:
# check for differing categories in columns

In [95]:
dfconcat.columns

Index(['language', 'state', 'sex', 'number_of_books_read',
       'read_printed_books', 'read_audiobooks', 'read_e-books', 'age',
       'marital', 'level_of_education', 'employment', 'hispanic', 'race',
       'income', 'party', 'year', 'id'],
      dtype='object')

In [97]:
dfconcat['language'].value_counts()

English    3242
Spanish     262
Name: language, dtype: int64

In [98]:
dfconcat['state'].value_counts()

CA    377
TX    278
FL    238
NY    226
IL    134
PA    127
OH    120
GA    119
MI    116
NC    116
VA     97
WA     87
NJ     86
MD     78
TN     76
SC     70
WI     70
MA     69
MN     68
MO     65
IN     63
AZ     61
CO     61
OR     60
KY     56
LA     50
CT     48
UT     43
OK     40
AL     38
IA     38
KS     31
NM     27
AR     26
AK     25
MS     23
NV     21
ID     18
MT     16
ME     16
NH     15
NE     15
HI     15
WV     15
VT     14
DC     13
RI     12
SD     11
ND      8
DE      4
WY      4
Name: state, dtype: int64

In [99]:
dfconcat['sex'].value_counts()

Male      1909
Female    1595
Name: sex, dtype: int64

In [100]:
dfconcat['read_printed_books'].value_counts()

Yes    2405
No     1099
Name: read_printed_books, dtype: int64

In [101]:
dfconcat['read_audiobooks'].value_counts()

No     2805
Yes     699
Name: read_audiobooks, dtype: int64

In [102]:
dfconcat['read_e-books'].value_counts()

No     2548
Yes     956
Name: read_e-books, dtype: int64

In [103]:
dfconcat['marital'].value_counts()

Married                  1689
Never been married        759
Divorced                  374
Widowed                   296
Living with a partner     211
Separated                 102
(VOL) Refused              64
(VOL) Don't know            9
Name: marital, dtype: int64

In [104]:
dfconcat['level_of_education'].value_counts()

Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)                                                 875
High school graduate (Grade 12 with diploma or GED certificate)                                                             798
Some college, no degree (includes some community college)                                                                   535
Two year associate degree from a college or university                                                                      368
Some postgraduate or professional schooling, no postgraduate degree (e.g. some graduate school)                             361
High school incomplete (Grades 9-11 or Grade 12 with NO diploma)                                                            183
Postgraduate or professional degree, including master's, doctorate, medical or law degree                                   147
Less than high school (Grades 1-8 or no formal schooling)                                               

In [105]:
dfconcat['employment'].value_counts()

Employed full-time                       1539
Retired                                   887
Not employed for pay                      401
Employed part-time                        376
(VOL) Have own business/self-employed     106
(VOL) Disabled                            100
(VOL) Refused                              37
(VOL) Student                              31
(VOL) Other                                24
(VOL) Don't know                            3
Name: employment, dtype: int64

In [106]:
dfconcat['hispanic'].value_counts()

No                  2905
Yes                  539
(VOL) Refused         52
(VOL) Don't know       8
Name: hispanic, dtype: int64

In [107]:
dfconcat['race'].value_counts()

White                        2437
Black or African-American     414
Or some other race            282
Don't know/Refused (VOL.)     158
Asian or Asian-American       120
Mixed Race                     93
Name: race, dtype: int64

In [108]:
dfconcat['income'].value_counts()

$150,000 or more?            428
50 to under $75,000          412
(VOL) Refused                390
100 to under $150,000, OR    366
75 to under $100,000         355
20 to under $30,000          327
10 to under $20,000          295
30 to under $40,000          269
Less than $10,000            261
40 to under $50,000          224
(VOL) Don't know             177
Name: income, dtype: int64

In [109]:
dfconcat['party'].value_counts()

Independent            1139
Democrat               1060
Republican              820
(VOL) No preference     239
(VOL) Refused           153
(VOL) Don't know         69
(VOL) Other party        24
Name: party, dtype: int64

In [110]:
dfconcat['year'].value_counts()

2018    2002
2019    1502
Name: year, dtype: int64

In [111]:
# all looks good, ready to merge with 2021 data in next notebook

In [112]:
# export
dfconcat.to_pickle(os.path.join(path, '02 Data','Prepared Data','2018-2019.pkl'))