## Part 1: Extract the variable descriptions

In [2]:
# Code Ownership: Original (for this page)
import pandas as pd
import re

In [3]:
# Read the whole RTF data dictionary into memory as a single string.
rtf_path = "ols_animal_research_survey_data_2016_ukda_data_dictionary.rtf"
with open(rtf_path) as rtf_file:
    data = rtf_file.read()

In [4]:
# Find the variable text. Each match is a (prefix, label) tuple, so the
# label itself is the second element of every tuple.
# The pattern is a raw string so that the regex escapes (\s, \d, \\ ...) reach
# the regex engine unchanged instead of being parsed as invalid string escapes.
# NOTE(review): the label character class stops at the first character that is
# not a word character, whitespace or one of ? . : , -- a label containing any
# other punctuation would be cut short there; confirm against the data dictionary.
question_tuples = re.findall(r'(Variable\slabel\s=\s\}\{\\cf4\s)([\d\w\s\?\.\:\,]*)', data)
question_tuples[:5]

[('Variable label = }{\\cf4 ', 'Respondent serial'),
 ('Variable label = }{\\cf4 ',
  'How well informed do you feel about the use of animals in scientific research in the UK?'),
 ('Variable label = }{\\cf4 ',
  'Interested in finding out more about the ongoing work to find alternatives to using animals in research?'),
 ('Variable label = }{\\cf4 ',
  'Interested in finding out more about the ongoing work to improve the welfare of animals in scientific research?'),
 ('Variable label = }{\\cf4 ',
  'I can accept the use if animals in research as long as it is for medical research purposes and there is no alternative')]

In [8]:
# Find the variable names. Each match is a (prefix, name) tuple, so the
# name itself is the second element of every tuple.
# The pattern is a raw string so that the regex escapes (\s, \d, \\ ...) reach
# the regex engine unchanged instead of being parsed as invalid string escapes.
var_name_tuples = re.findall(r'(Variable\s=\s\}\{\\f2\\fs20\\cf4\s)([\d\w]*)', data)
var_name_tuples[:5]

[('Variable = }{\\f2\\fs20\\cf4 ', 'ID'),
 ('Variable = }{\\f2\\fs20\\cf4 ', 'Q1'),
 ('Variable = }{\\f2\\fs20\\cf4 ', 'Q2a'),
 ('Variable = }{\\f2\\fs20\\cf4 ', 'Q2b'),
 ('Variable = }{\\f2\\fs20\\cf4 ', 'Q3a')]

In [None]:
# Create the dataframe (we can do this because
# I checked that both lists were in the same order).
# Names and labels are paired by position; every row starts out as survey 1.
# The rows are collected first and the frame is built once, because
# DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
# row by row is quadratic anyway. Indexing var_name_tuples by position still
# raises IndexError if there are fewer names than labels.
cleaned_data = pd.DataFrame(
    [
        [1, var_name_tuples[position][1], label]
        for position, (_prefix, label) in enumerate(question_tuples)
    ],
    columns=["Survey_Number", "Var_Name", "Var_Text"],
)
cleaned_data.head()

In [None]:
# The data is split into three for better comparison (this is the first
# data set we are cleaning). Find a third of the number of rows.
cleaned_data.shape[0] / 3

In [None]:
# Change the survey numbers of the two "new" surveys.
# .loc slices are label-based and include BOTH endpoints, so the two ranges
# must not share row 165 (previously it was bumped twice and ended up as 4).
# Values are assigned rather than incremented so re-running this cell gives
# the same result.
cleaned_data.loc[82:164, "Survey_Number"] = 2
cleaned_data.loc[165:248, "Survey_Number"] = 3

In [None]:
def make_csv(survey_num):
    '''
    Write the rows of cleaned_data that belong to one survey to a csv file.
    Only the second and third columns of cleaned_data are kept, and the
    index is not written. The file is named "OLSAnimal_<survey_num>.csv".
    Used in GSS notebook as well.
    Inputs:
        - survey_num (int): value matched against the "Survey_Number" column
    Output:
        - None, creates a csv
    '''
    belongs_to_survey = cleaned_data["Survey_Number"] == survey_num
    selected = cleaned_data.loc[belongs_to_survey].iloc[:, 1:3]
    out_path = "".join(["OLSAnimal_", str(survey_num), ".csv"])
    selected.to_csv(out_path, index=False)

In [None]:
# Write one csv per survey number (1, 2 and 3).
for survey_id in (1, 2, 3):
    make_csv(survey_id)

In [None]:
# Save the full table (all survey numbers together), without the index.
joined_csv_path = "OLSAnimal_cleaned_joined.csv"
cleaned_data.to_csv(joined_csv_path, index=False)

## Part 2: Create the survey CSV

In [9]:
# Helper that bundles the per-survey value lists into a single dictionary.
# This is used in other notebooks to create the detail csv.
def create_d_list(survey_num, survey_name, num_part, org_conduct, num_questions, data_link,
                  doc_link, source_link, summary):
    '''
    Collect the survey detail lists into a dictionary keyed by column name.
    The values are stored as passed in (no copying or validation).
    Inputs:
        - survey_num: A number for the survey (list of ints)
        - survey_name: Name of survey (list of strings)
        - num_part: Number of participants in the survey (list of ints)
        - org_conduct: Name of organization (list of strings)
        - num_questions: Number of questions in the survey (list of ints)
        - data_link: Link to data (list of strings)
        - doc_link: Link to documentation (list of strings)
        - source_link: Link to source (list of strings)
        - summary: Summary of survey (list of strings)
    Output:
        - python dictionary mapping each column name to its list of values.
    '''
    column_names = ("Survey_Number", "Survey_Name", "Num_Participants",
                    "Org_Conduct", "Num_Questions", "Data_Link",
                    "Documentation_Link", "Source_Link", "Summary")
    column_values = (survey_num, survey_name, num_part,
                     org_conduct, num_questions, data_link,
                     doc_link, source_link, summary)
    return dict(zip(column_names, column_values))

In [10]:
# Helper that writes a detail dictionary out as a csv. This is used in other
# notebooks to create the detail csv.
def create_detail_csv(d, name):
    '''
    Turn a detail dictionary into a dataframe with a fixed column order and
    save it, without the index, as "<name>_detail.csv".
    Inputs:
        - d: a python dictionary (as built by create_d_list)
        - name: prefix of the output file name (string)
    Output:
        - None, but creates a csv.
    '''
    detail_columns = ["Survey_Number", "Survey_Name", "Num_Participants",
                      "Org_Conduct", "Num_Questions", "Data_Link",
                      "Documentation_Link", "Source_Link", "Summary"]
    detail = pd.DataFrame(d, columns=detail_columns)
    out_path = name + "_detail.csv"
    detail.to_csv(out_path, index=False)

In [11]:
# Detail values for the three surveys; every list has one entry per survey,
# in survey-number order.
survey_num = [1, 2, 3]
survey_name = ["Public Attitudes towards Animal Research 2016-1",
               "Public Attitudes towards Animal Research 2016-2",
               "Public Attitudes towards Animal Research 2016-3"]
num_part = [987, 987, 987]
org_conduct = ["IPSOS", "IPSOS", "IPSOS"]
num_questions = [82, 83, 83]
data_link = ["http://bit.ly/2F7fNuv", "http://bit.ly/2F7fNuv",
             "http://bit.ly/2F7fNuv"]
doc_link = ["http://bit.ly/2CUZWsE", "http://bit.ly/2CUZWsE",
            "http://bit.ly/2CUZWsE"]
source_link = ["http://bit.ly/2CsBSlr", "http://bit.ly/2CsBSlr",
               "http://bit.ly/2CsBSlr"]
# The summaries differ only in the part number. Adjacent string literals are
# joined by the parser, so (unlike a backslash continuation inside a string)
# no source indentation ends up inside the text.
summary = [
    "Part {} of findings of a 2016 survey on current public awareness and "
    "attitudes towards the use of animals in research. The survey also "
    "examines attitudes towards, and trust in, the regulatory system and "
    "the people who work with animals in research.".format(part)
    for part in survey_num
]

In [12]:
# Bundle the per-survey lists into the detail dictionary.
d = create_d_list(
    survey_num=survey_num,
    survey_name=survey_name,
    num_part=num_part,
    org_conduct=org_conduct,
    num_questions=num_questions,
    data_link=data_link,
    doc_link=doc_link,
    source_link=source_link,
    summary=summary,
)

In [13]:
# Write the detail dictionary out under the "OLSAnimal" file-name prefix.
create_detail_csv(d=d, name="OLSAnimal")