In [1]:
import pandas as pd
import numpy as np
import re
import docx
from docx import Document

# Part 1: Extract variable descriptions

In [2]:
# Read the survey response CSV (path is relative to the notebook's working directory).
survey_csv_path = "Dataset - 2014 Pew Research Center Science Survey (SPSS).csv"
df = pd.read_csv(survey_csv_path)

In [3]:
# Preview the first five rows (5 is the default row count for head()).
df.head(n=5)

Unnamed: 0,caseid,weight,sample,int_date,lang,cregion,state,usr,density,form,...,party,partyln,ideo,hh1,hh3,eminuse,intmob,ql1,ql1a,qc1
0,100003,3.285714,1,81514,1,3,37,U,3,2,...,3,1.0,2,9,9.0,1,1,1,,
1,100004,3.214286,1,81814,1,3,51,U,5,1,...,2,,5,5,2.0,1,1,1,,
2,100014,1.857143,1,81514,1,1,36,U,5,1,...,1,,2,1,,1,2,1,,
3,100020,3.5,1,81914,1,1,9,S,4,2,...,9,1.0,2,3,2.0,1,1,1,,
4,100022,3.642857,1,81814,1,2,39,S,5,1,...,1,,3,2,2.0,1,1,1,,


In [4]:
# Open the codebook .docx so its paragraphs can be read below.
codebook_path = 'Codebook - 2014 Pew Research Center Science Survey.docx'
document = Document(codebook_path)

In [5]:
# Collect the text of every codebook paragraph, in document order.
p_list = [paragraph.text for paragraph in document.paragraphs]

In [7]:
# Clean the data, accounting for subquestions within question.
def _strip_label(text, label):
    """Return `text` without a leading `label` and without outer whitespace.

    `text.strip(label)` is deliberately avoided: `str.strip` treats its
    argument as a *set of characters*, so it would also remove any question
    text that happens to start or end with one of the label's characters.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def extract_question_tuples(paragraphs, max_lookahead=6):
    """Build (variable name, question text) pairs from codebook paragraphs.

    Parameters
    ----------
    paragraphs : list of str
        Paragraph texts in document order.
    max_lookahead : int, default 6
        How many paragraphs after a question are scanned for sub-items.

    Returns
    -------
    list of (str, str)
        One tuple per paragraph containing a ``Q<number>...`` identifier and
        one per paragraph containing a ``KNOSCT<number>`` identifier. The
        first identifier found in the paragraph is used as the variable name.

    Notes
    -----
    Sub-items are only looked for when the question text, or the paragraph
    right after it, contains "[". Lookahead is clipped at the end of
    `paragraphs`, so a question near the end of the document no longer
    raises IndexError.
    """
    tuples = []
    for i, paragraph in enumerate(paragraphs):
        # NOTE(review): [f1]*, [F1]*, [_]* and [code]* are character classes,
        # so they match any run of those characters rather than the literal
        # suffixes "f1"/"F1"/"_code". Left unchanged so the extracted variable
        # names stay exactly as before.
        Q_num = re.findall('(Q[0-9]+[A-Z]*[f1]*[F1]*[_]*[code]*[0-9]*)', paragraph)
        if Q_num:
            question = _strip_label(paragraph, Q_num[0])
            # Slicing never runs past the end of the list.
            following = paragraphs[i + 1:i + 1 + max_lookahead]
            # If there is a subquestion (upon manual inspection, these all have
            # [] in them), find the subquestion information.
            if "[" in question or (following and "[" in following[0]):
                # Once a lettered item ("a." ... "f.") has been seen, numbered
                # items are no longer collected for this question.
                a_cat = False
                for candidate in following:
                    # NOTE(review): this pattern is not anchored, so it also
                    # matches a letter a-f followed by "." mid-sentence.
                    Q_subpart = re.findall(r'([a-f]\.)(.*)', candidate)
                    if Q_subpart:
                        for part in Q_subpart[0]:
                            question = question + " " + part.strip()
                        a_cat = True
                    if not a_cat:
                        # Fallback: items that start with (optional tabs and) a digit.
                        Q_subpart2 = re.findall(r'(^[\t]*\d)(.*)', candidate)
                        if Q_subpart2:
                            for part in Q_subpart2[0]:
                                question = question + " " + part.strip()
            tuples.append((Q_num[0], question))
        Q_know = re.findall('(KNOSCT[0-9]+)', paragraph)
        if Q_know:
            # The original assigned the stripped text to a misspelled name
            # ("qustion"), so the un-stripped text was stored; fixed here.
            tuples.append((Q_know[0], _strip_label(paragraph, Q_know[0])))
    return tuples


# Not used by this cell; kept in case another cell still references it.
q_subparts = []
question_tuples = extract_question_tuples(p_list)

In [8]:
# Display the extracted (variable name, question text) tuples for inspection.
question_tuples

[('Q1',
  'All in all, are you satisfied or dissatisfied with the way things are going in this country today?'),
 ('Q2',
  'We’d like you to compare the United States to other industrialized countries in a few different areas. (First,) what about... [INSERT ITEM; READ AND RANDOMIZE]? [READ FOR FIRST ITEM, THEN AS NECESSARY: Do you think the U.S. is the BEST IN THE WORLD, above average, average or below average in [ITEM] compared to other industrialized countries?] a. Its scientific achievements b. Its military c. Its economy e. Science, technology, engineering and math education for grades K to 12'),
 ('Q3',
  'How much do you ENJOY keeping up with news about science – a lot, some, not much, or not at all?'),
 ('Q4',
  'Overall, has science made life easier or more difficult for most people?'),
 ('Q5',
  'Has science had a mostly positive or mostly negative effect on the quality of [INSERT ITEM; RANDOMIZE] in the U.S.? What about [NEXT ITEM]? [IF NECESSARY: Has science had a mostly pos

In [9]:
# Start from an empty frame (no rows, no columns).
cleaned_data = pd.DataFrame(data=None)

In [10]:
# Build the frame in one step from a list of [variable name, question text]
# rows. DataFrame.append (used previously, once per row) was deprecated in
# pandas 1.4 and removed in pandas 2.0, and re-copied the frame on every row.
# Columns are left as the default 0 and 1, exactly as the row-wise append
# produced them.
question_rows = [[tup[0], tup[1]] for tup in question_tuples]
cleaned_data = pd.DataFrame(question_rows)

In [11]:
# Name the two columns, then show how many rows were extracted.
column_labels = ["Var_Name", "Var_Text"]
cleaned_data.columns = column_labels
cleaned_data.shape[0]

48

In [13]:
# Write the variable descriptions to CSV without the row index.
cleaned_csv_path = "PewScience_cleaned.csv"
cleaned_data.to_csv(cleaned_csv_path, index=False)

# Part 2: Create Survey Detail Description

In [19]:
# Survey-level metadata. Every value is a one-element list, i.e. a single row
# per key.
d = {
    "Survey_Number": [4],
    "Survey_Name": ["Pew Research Center 2014 GP Survey on Science"],
    "Num_Participants": [2002],
    "Org_Conduct": ["Pew Research"],
    "Num_Questions": [48],
    "Data_Link": ["http://bit.ly/2FbEzIP"],
    "Documentation_Link": ["http://bit.ly/2t7EVM6"],
    "Source_Link": ["http://pewrsr.ch/2owjlvS"],
    "Summary": ["A 2014 survey that included a number of open-ended questions to gauge what respondents had in mind when thinking about the positive and negative effects of science on society. "],
}

In [20]:
# Spell out the column order explicitly instead of relying on dict ordering.
detail_columns = [
    "Survey_Number",
    "Survey_Name",
    "Num_Participants",
    "Org_Conduct",
    "Num_Questions",
    "Data_Link",
    "Documentation_Link",
    "Source_Link",
    "Summary",
]
detail = pd.DataFrame(data=d, columns=detail_columns)

In [21]:
# Display the survey detail frame for a visual check before it is written out.
detail

Unnamed: 0,Survey_Number,Survey_Name,Num_Participants,Org_Conduct,Num_Questions,Data_Link,Documentation_Link,Source_Link,Summary
0,4,Pew Research Center 2014 GP Survey on Science,2002,Pew Research,48,http://bit.ly/2FbEzIP,http://bit.ly/2t7EVM6,http://pewrsr.ch/2owjlvS,A 2014 survey that included a number of open-e...


In [22]:
# Write the survey detail row to CSV without the row index.
detail_csv_path = "PewScience_Survey_Detail.csv"
detail.to_csv(detail_csv_path, index=False)