# Pew Research Center public opinion on science survey from 2014 - analysis

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Pew Research Center 2014 science survey responses into the raw frame `df`.
df = pd.read_csv("PRC_2014_sciencesurvey.csv")
# Preview the first rows, then list every column that is available.
df.head()
df.columns

Unnamed: 0,caseid,weight,sample,int_date,lang,cregion,state,usr,density,form,...,party,partyln,ideo,hh1,hh3,eminuse,intmob,ql1,ql1a,qc1
0,100003,3.285714,1,81514,1,3,37,U,3,2,...,3,1.0,2,9,9.0,1,1,1,,
1,100004,3.214286,1,81814,1,3,51,U,5,1,...,2,,5,5,2.0,1,1,1,,
2,100014,1.857143,1,81514,1,1,36,U,5,1,...,1,,2,1,,1,2,1,,
3,100020,3.5,1,81914,1,1,9,S,4,2,...,9,1.0,2,3,2.0,1,1,1,,
4,100022,3.642857,1,81814,1,2,39,S,5,1,...,1,,3,2,2.0,1,1,1,,


Index(['caseid', 'weight', 'sample', 'int_date', 'lang', 'cregion', 'state',
       'usr', 'density', 'form', 'q1', 'q2a', 'q2b', 'q2c', 'q2e', 'q2f',
       'q2gf1', 'q2hf2', 'q3', 'q4', 'q5a', 'q5b', 'q5c', 'q6', 'q7', 'q8',
       'q9f1', 'q9f1_code1', 'q9f1_code2', 'q9f1_code3', 'q12a', 'q12b', 'q13',
       'q16', 'q17', 'q18', 'q20f1', 'q21af2', 'q21bf2', 'q21cf2', 'q23',
       'q24a', 'q24b', 'q24c', 'q24d', 'q24e', 'q24f', 'q25', 'q27', 'q28',
       'q29', 'q30', 'q32', 'q33', 'q34', 'q35', 'q37', 'q38', 'q39', 'q40',
       'q41', 'knosct14', 'knosct15', 'knosct16', 'knosct17', 'knosct18',
       'knosct19', 'knosct_count', 'sexz', 'agerec', 'educ2', 'scideg', 'hisp',
       'racecmb', 'race3m1', 'race3m2', 'race3m3', 'race3m4', 'racethn',
       'birth_hisp', 'usborn', 'marital', 'parent', 'citizen', 'relig', 'chr',
       'born', 'attend', 'income', 'reg', 'party', 'partyln', 'ideo', 'hh1',
       'hh3', 'eminuse', 'intmob', 'ql1', 'ql1a', 'qc1'],
      dtype='object')

In [258]:
# which questions do I want to focus on? (~12 questions)
# q7, q16, (q20f1, q21af2, q21bf2, q21cf2), q24a,b,c,d,e,f, q25, q33-35, q38 
# I will choose one question first to keep the target space small (q16)
# I can increase target space later

In [259]:
# which demographics do I want to use in model? (10 demographics)
# cregion, usr, density, sexz, agerec, educ2, racecmb, relig, party, ideo

# Clean dataframe

#### Clean demographics

In [260]:
# The raw frame is too wide to work with comfortably, so narrow it down to
# the demographic columns and the survey questions of interest.
demographic_cols = ['cregion', 'usr', 'sexz', 'agerec', 'density', 'racecmb',
                    'party', 'partyln', 'educ2', 'relig', 'ideo', 'income']
question_cols = ['q7', 'q16', 'q20f1', 'q21af2', 'q21bf2', 'q21cf2',
                 'q24a', 'q24b', 'q24c', 'q24d', 'q24e', 'q24f', 'q25',
                 'q33', 'q34', 'q35', 'q38']
df2 = df[demographic_cols + question_cols]
df2.head()

Unnamed: 0,cregion,usr,sexz,agerec,density,racecmb,party,partyln,educ2,relig,...,q24b,q24c,q24d,q24e,q24f,q25,q33,q34,q35,q38
0,3,U,2,65,3,1,3,1.0,6,1,...,1,1,1,1,1,2,2,2,2,1
1,3,U,2,24,5,1,2,,8,5,...,2,2,1,2,1,1,2,1,2,2
2,1,U,2,99,5,1,1,,4,2,...,9,1,1,9,1,1,2,9,1,1
3,1,S,2,45,4,1,9,1.0,8,2,...,1,2,2,2,1,1,2,1,2,1
4,2,S,1,21,5,1,1,,3,2,...,2,2,2,2,1,2,2,1,2,9


In [261]:
# check values of 'cregion'
feature = 'cregion'
df2[feature].value_counts().sort_index()

1    285
2    334
3    897
4    486
Name: cregion, dtype: int64

In [262]:
# df2 was built by selecting columns from df; take an explicit copy so the
# in-place assignments in the following cells do not raise pandas' warning
# about setting values on a copy of a DataFrame slice.
df2 = df2.copy()

In [264]:
# One-hot encode 'usr' into the indicator columns 'R', 'S' and 'U'.
# The dummy columns are picked by label rather than by position, so each
# indicator is guaranteed to sit under the matching name even if 'usr' ever
# holds an unexpected category (a missing category becomes an all-zero column).
usr_dummies = pd.get_dummies(df2['usr']).reindex(columns=['R', 'S', 'U'], fill_value=0)
# cast to int so the indicators are 0/1 whatever dtype get_dummies returns
df2[['R', 'S', 'U']] = usr_dummies.astype(int)
df2.head()

Unnamed: 0,cregion,usr,sexz,agerec,density,racecmb,party,partyln,educ2,relig,...,q24e,q24f,q25,q33,q34,q35,q38,R,S,U
0,3,U,2,65,3,1,3,1.0,6,1,...,1,1,2,2,2,2,1,0.0,0.0,1.0
1,3,U,2,24,5,1,2,,8,5,...,2,1,1,2,1,2,2,0.0,0.0,1.0
2,1,U,2,99,5,1,1,,4,2,...,9,1,1,2,9,1,1,0.0,0.0,1.0
3,1,S,2,45,4,1,9,1.0,8,2,...,2,1,1,2,1,2,1,0.0,1.0,0.0
4,2,S,1,21,5,1,1,,3,2,...,2,1,2,2,1,2,9,0.0,1.0,0.0


#### The entry '99' has been used when no answer is provided. We can use this for the time being instead of NaN.

In [265]:
# Codes above 2 are non-answers for 'sexz'; fold them into the shared 99 marker.
feature = 'sexz'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 2, 99)

1    1007
2     991
3       1
8       2
9       1
Name: sexz, dtype: int64

In [266]:
# agerec is fine

In [267]:
# check values of 'density'
feature = 'density'
df2[feature].value_counts().sort_index()

1    318
2    335
3    374
4    424
5    551
Name: density, dtype: int64

In [268]:
# Codes above 5 are non-answers for 'racecmb'; fold them into the shared 99 marker.
feature = 'racecmb'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 5, 99)

1    1358
2     275
3      72
4      78
5     184
9      35
Name: racecmb, dtype: int64

In [269]:
# Only the R, D and I answers of 'party' (codes 1-3) are kept;
# every other code is folded into the shared 99 marker.
feature = 'party'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 3, 99)

1    454
2    666
3    737
4     82
5      7
9     56
Name: party, dtype: int64

In [270]:
# ignore 'partyln'
feature = 'partyln'
df2[feature].value_counts().sort_index()

     1120
1     283
2     293
9     306
Name: partyln, dtype: int64

In [271]:
# Codes above 8 are non-answers for 'educ2'; fold them into the shared 99 marker.
feature = 'educ2'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 8, 99)

1     74
2     87
3    537
4    283
5    199
6    425
7     32
8    356
9      9
Name: educ2, dtype: int64

In [272]:
# 'relig' is left untouched for now -- come back to it later.
# Only inspect how often each code occurs.
feature = 'relig'
df2[feature].value_counts().sort_index()

1     746
2     448
3      23
4      11
5      54
6      17
7      14
8      18
9      88
10     75
11     34
12    256
13    165
14      6
15     12
99     35
Name: relig, dtype: int64

In [273]:
# Codes above 5 are non-answers for 'ideo'; fold them into the shared 99 marker.
feature = 'ideo'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 5, 99)

1    127
2    571
3    721
4    371
5    125
9     87
Name: ideo, dtype: int64

In [274]:
# Codes above 9 are non-answers for 'income'; fold them into the shared 99 marker.
feature = 'income'
df2[feature].value_counts().sort_index()
df2[feature] = df2[feature].mask(df2[feature] > 9, 99)

1     166
2     226
3     209
4     200
5     150
6     255
7     186
8     199
9     191
10    220
Name: income, dtype: int64

#### Clean questions

In [275]:
df2.columns

Index(['cregion', 'usr', 'sexz', 'agerec', 'density', 'racecmb', 'party',
       'partyln', 'educ2', 'relig', 'ideo', 'income', 'q7', 'q16', 'q20f1',
       'q21af2', 'q21bf2', 'q21cf2', 'q24a', 'q24b', 'q24c', 'q24d', 'q24e',
       'q24f', 'q25', 'q33', 'q34', 'q35', 'q38', 'R', 'S', 'U'],
      dtype='object')

In [276]:
# Recode 'q7' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q7'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    1100
2     830
9      72
Name: q7, dtype: int64

In [277]:
# Response 9 means 'don't know'; it is only about 4% of responses, so it is
# marked as 99 here and those rows are filtered out when a model is fitted.
# Responses of 2 become 0, which leaves a binary classification problem.
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q16'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    1326
2     595
9      81
Name: q16, dtype: int64

In [278]:
# combine 'q20f1', 'q21af2', 'q21bf2', 'q21cf2'

In [279]:
# Recode the two questions to 1/0 ahead of combining them into 'q20_21',
# where 1 = solid evidence for global warming, 0 = not.
#   q20f1 : '1' and '2' -> 1, '3' -> 0
#   q21af2: '1' -> 1, '2' -> 0
# Every other listed code ('9', blank ' ', and '3' for q21af2) becomes 99.
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping the already recoded integers in df2 would give NaN for
# every row and make astype(int) fail.
df[['q20f1', 'q21af2']].head()
df2['q20f1'] = df['q20f1'].map({'1':1, '2':1, '3':0, '9':99, ' ':99}).astype(int)
df2['q21af2'] = df['q21af2'].map({'1':1, '2':0, '3':99, '9':99,' ':99}).astype(int)
df2[['q20f1', 'q21af2']].head()

Unnamed: 0,q20f1,q21af2
0,,2.0
1,2.0,
2,3.0,
3,,2.0
4,3.0,


Unnamed: 0,q20f1,q21af2
0,99,0
1,1,99
2,0,99
3,99,0
4,0,99


In [280]:
# Combine the two recoded columns: keep q21af2 wherever it is not 99 and
# fall back to q20f1 otherwise. A column-wise `where` gives the same result
# as a row-by-row apply without building a Series for every row.
df2['q20_21'] = df2['q21af2'].where(df2['q21af2'] != 99, df2['q20f1'])
df2[['q20f1', 'q21af2', 'q20_21']].head()

Unnamed: 0,q20f1,q21af2,q20_21
0,99,0,0
1,1,99,1
2,0,99,0
3,99,0,0
4,0,99,0


In [281]:
# Recode 'q24a' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24a'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    996
2    933
9     73
Name: q24a, dtype: int64

In [282]:
# Recode 'q24b' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24b'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    960
2    956
9     86
Name: q24b, dtype: int64

In [283]:
# Recode 'q24c' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24c'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1     818
2    1002
9     182
Name: q24c, dtype: int64

In [284]:
# Recode 'q24d' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24d'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    1375
2     512
9     115
Name: q24d, dtype: int64

In [285]:
# Recode 'q24e' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24e'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    1045
2     877
9      80
Name: q24e, dtype: int64

In [286]:
# Recode 'q24f' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q24f'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    1092
2     832
9      78
Name: q24f, dtype: int64

In [287]:
# Recode 'q25' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q25'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1     564
2    1404
9      34
Name: q25, dtype: int64

In [288]:
# Recode 'q33' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q33'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1     306
2    1653
9      43
Name: q33, dtype: int64

In [289]:
# Recode 'q34' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q34'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1    928
2    983
9     91
Name: q34, dtype: int64

In [290]:
# Recode 'q35' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q35'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1     574
2    1366
9      62
Name: q35, dtype: int64

In [291]:
# Recode 'q38' from the raw survey codes: 1 stays 1, 2 becomes 0 and
# 9 becomes 99 (the notebook's no-answer marker).
# The mapping reads from the untouched raw frame `df`, so the cell is safe to
# re-run; re-mapping df2 in place would turn the recoded 0 and 99 into NaN.
question = 'q38'
df2[question] = df[question].map({1: 1, 2: 0, 9: 99})
# raw code counts, shown as the cell output
df[question].value_counts().sort_index()

1     786
2    1098
9     118
Name: q38, dtype: int64

# Logistic Regression

In [292]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

df2.head()

Unnamed: 0,cregion,usr,sexz,agerec,density,racecmb,party,partyln,educ2,relig,...,q24f,q25,q33,q34,q35,q38,R,S,U,q20_21
0,3,U,2,65,3,1,3,1.0,6,1,...,1,0,0,0,0,1,0.0,0.0,1.0,0
1,3,U,2,24,5,1,2,,8,5,...,1,1,0,1,0,0,0.0,0.0,1.0,1
2,1,U,2,99,5,1,1,,4,2,...,1,1,0,99,1,1,0.0,0.0,1.0,0
3,1,S,2,45,4,1,99,1.0,8,2,...,1,1,0,1,0,1,0.0,1.0,0.0,0
4,2,S,1,21,5,1,1,,3,2,...,1,0,0,1,0,99,0.0,1.0,0.0,0


In [293]:
df2.columns

Index(['cregion', 'usr', 'sexz', 'agerec', 'density', 'racecmb', 'party',
       'partyln', 'educ2', 'relig', 'ideo', 'income', 'q7', 'q16', 'q20f1',
       'q21af2', 'q21bf2', 'q21cf2', 'q24a', 'q24b', 'q24c', 'q24d', 'q24e',
       'q24f', 'q25', 'q33', 'q34', 'q35', 'q38', 'R', 'S', 'U', 'q20_21'],
      dtype='object')

##### choose features relevant to question (iterative process)

In [294]:
# Predictor columns passed to the model; the list is adjusted by hand
# from one iteration to the next.
features = [
    'cregion', 'U', 'S', 'R', 'sexz', 'agerec',
    'educ2', 'ideo', 'income',
]

In [3]:
# define function for evaluating models
# the function performs a grid search over model parameters
def eval_model(question, features, df):
    """Grid-search a logistic regression that predicts one survey question.

    Parameters
    ----------
    question : list of str
        One-element list holding the target column name. The last entry of
        ``features + question`` is taken as the target, so any additional
        names in ``question`` would be used as predictors.
    features : list of str
        Names of the predictor columns.
    df : pandas.DataFrame
        Frame containing every column named in ``features`` and ``question``.
        Rows holding 99 or NaN in any of those columns are left out; this
        applies to every selected column, so a predictor whose genuine value
        is 99 is dropped as well.

    Returns
    -------
    tuple
        ``("best_params", <dict>, "best_score", <float>, "y_mean", <float>)``:
        the best parameter combination, its mean cross-validated score, and
        the mean of the target. For a 0/1 target ``y_mean`` is the accuracy of
        always predicting 1; the majority-class baseline is
        ``max(y_mean, 1 - y_mean)``.
    """
    items = features + question
    # Blank out the 99 marker, keep only fully answered rows, cast to int.
    answered = df[items].where(df[items] != 99).dropna().astype(int)
    X = answered[items[:-1]]
    y = answered[items[-1]]

    parameters = {
        'penalty': ('l1', 'l2'),
        'C': np.logspace(0, 5, 20),
        'class_weight': (None, 'balanced'),
    }
    # The grid contains the 'l1' penalty, which the lbfgs solver (the default
    # in current scikit-learn) rejects; liblinear supports both 'l1' and 'l2'.
    model = LogisticRegression(solver='liblinear')

    clf = GridSearchCV(model, parameters, cv=4, n_jobs=-1)
    clf.fit(X, y)

    return ("best_params", clf.best_params_,
            "best_score", clf.best_score_,
            "y_mean", y.mean())

### We can now evaluate the model for different questions. Each time we will compare the result to the mean of the response as a measure of how well the model is performing. The mean of the response (a given question) would be the score obtained from always choosing yes (1); when that mean is below 0.5, always choosing no (0) scores 1 − mean, which is the harder baseline to beat.

In [296]:
# q20_21
question = ['q20_21']
eval_model(question, features, df2)

('best_params',
 {'C': 6.1584821106602643, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.78988561107766408,
 'y_mean',
 0.7802528597230584)

In [297]:
# q24a
question = ['q24a']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l1'},
 'best_score',
 0.6396614268440145,
 'y_mean',
 0.5235792019347038)

In [298]:
# q24b
question = ['q24b']
eval_model(question, features, df2)

('best_params',
 {'C': 3.3598182862837818, 'class_weight': 'balanced', 'penalty': 'l2'},
 'best_score',
 0.63432383262583381,
 'y_mean',
 0.49787750151607035)

In [299]:
# q24c
question = ['q24c']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': None, 'penalty': 'l2'},
 'best_score',
 0.66517571884984028,
 'y_mean',
 0.4536741214057508)

In [300]:
# q24d
question = ['q24d']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.73469387755102045,
 'y_mean',
 0.7346938775510204)

In [301]:
# q24e
question = ['q24e']
eval_model(question, features, df2)

('best_params',
 {'C': 11.28837891684689, 'class_weight': 'balanced', 'penalty': 'l1'},
 'best_score',
 0.67757575757575761,
 'y_mean',
 0.5442424242424242)

In [302]:
# q24f
question = ['q24f']
eval_model(question, features, df2)

('best_params',
 {'C': 1.8329807108324359, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.60766423357664234,
 'y_mean',
 0.5736009732360098)

In [303]:
# q25
question = ['q25']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.71300715990453456,
 'y_mean',
 0.2852028639618138)

In [304]:
# q33
question = ['q33']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.83751493428912782,
 'y_mean',
 0.16248506571087215)

In [305]:
# q34
question = ['q34']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l1'},
 'best_score',
 0.56908094948265364,
 'y_mean',
 0.5015216068167986)

In [306]:
# q35
question = ['q35']
eval_model(question, features, df2)

('best_params',
 {'C': 1.0, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.70319084888621308,
 'y_mean',
 0.30222757375075254)

In [307]:
# q38
question = ['q38']
eval_model(question, features, df2)

('best_params',
 {'C': 11.28837891684689, 'class_weight': None, 'penalty': 'l1'},
 'best_score',
 0.63574520717377858,
 'y_mean',
 0.4199134199134199)

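#### For each question, the model does at least as well as the mean score. Where the mean is below 0.5, however, the fairer baseline is always choosing no (1 − mean): by that measure q33 (0.838 vs. 0.838) only matches the baseline and q25 (0.713 vs. 0.715) falls slightly short of it.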