In [None]:
# Code Ownership: Original (for this page)
import pandas as pd
import docx2txt
import re

In [None]:
# Read the questionnaire .docx via docx2txt; the result is kept in `text`
# (the next cell calls .splitlines() on it, so it is treated as one string).
text = docx2txt.process('2016 CCS - Leadership Survey - Final Questionnaire.docx')

In [3]:
# Rebind `text` from a single string to a list of its lines.
# Not idempotent: running this cell a second time fails, because a list
# has no .splitlines() method.
text = text.splitlines()

In [4]:
def is_valid(txt):
    '''
    Decide whether a line may be appended to the question currently being
    collected, i.e. whether it does NOT mark a stopping point.

    A line is rejected when, anchored at its start, it matches any of:
        - a question identifier such as "Q12" or "NQ3_a"
        - an "NA-<digits>" tag followed by at least one more character
        - the literal "Display This Question:"
        - the literal "Multilateral"
    Input:
        - txt: a string of text
    Output:
        - False if the text starts with one of the stop patterns, else True.
    '''
    stop_patterns = (
        '(N*Q[0-9]+_*[A-Za-z]*)(.*)',
        '(NA-[0-9]+)(.)',
        'Display This Question:',
        'Multilateral',
    )
    # re.match only matches at the beginning of the string, so a stop
    # pattern appearing mid-line does not invalidate the line.
    return not any(re.match(stop, txt) for stop in stop_patterns)

In [5]:
def find_questions(text):
    '''
    Given the lines of a document, find the tuples of var_name and var_text.

    A line containing a question identifier (e.g. "Q12", "NQ3_a") starts a
    question; the non-empty lines that follow are concatenated onto it (with
    no separator) until is_valid() rejects a line or the input ends. A line
    containing an "NA-<digits>" tag yields a tuple from that line alone.
    Input:
        - text: a list of strings, one per line of the document
    Output:
        - question_tuples: a list of tuples of var_name, var_text pairs
    '''
    # Compile once up front rather than once per line.
    question_pattern = re.compile('(N*Q[0-9]+_*[A-Za-z]*)(.*)')
    na_pattern = re.compile('(NA-[0-9]+)(.*)')

    question_tuples = []
    num_lines = len(text)
    for i, line in enumerate(text):
        # search() returns the first match anywhere in the line, which is the
        # same match the first element of re.findall() would describe.
        question_match = question_pattern.search(line)
        na_match = na_pattern.search(line)
        if question_match:
            parts = [question_match.group(2)]
            # Scan forward to the end of the input; there is no fixed cap on
            # the number of continuation lines a question may have.
            for k in range(i + 1, num_lines):
                # Stop at the first line that marks a new item.
                if not is_valid(text[k]):
                    break
                # Blank lines are skipped but do not end the question.
                if text[k]:
                    parts.append(text[k])
            question_tuples.append((question_match.group(1), ''.join(parts)))
        # NA-style items are taken from their own line only. This is a plain
        # `if`, so a line matching both patterns produces two tuples.
        if na_match:
            question_tuples.append((na_match.group(1), na_match.group(2)))
    return question_tuples

In [6]:
# Extract the (var_name, var_text) pairs from the questionnaire lines.
question_tuples = find_questions(text)

In [7]:
# Drop the last extracted tuple.
# NOTE(review): the reason is not recorded here - presumably the final match
# is not a real survey question; confirm against the document.
# Not idempotent: each re-run of this cell removes one more entry.
question_tuples = question_tuples[0:-1]

In [9]:
def to_csv(tuple_list, name):
    '''
    Given a name and tuple_list, convert the data to a csv of cleaned data.
    Also used in Pew notebook.
    Inputs:
        - tuple_list: a list of (var_name, var_text) tuples; only the first
          two items of each tuple are used
        - name: the survey name; the file is written to name + ".csv"
    Output:
        - None, creates csv with the columns Var_Name and Var_Text. An empty
          tuple_list produces a csv containing only the header row.
    '''
    # Read from the tuple_list argument (not a notebook-level global) so the
    # function writes exactly what the caller passed in.
    rows = [(tup[0], tup[1]) for tup in tuple_list]
    # Build the frame in one step; DataFrame.append was removed in pandas 2.0
    # and appending row by row is quadratic.
    cleaned_data = pd.DataFrame(rows, columns=["Var_Name", "Var_Text"])
    cleaned_data.to_csv(path_or_buf=name + ".csv", index=False)

In [10]:
# Write the extracted questions to 2016CCSLeadershipSurvey.csv.
to_csv(question_tuples, "2016CCSLeadershipSurvey")

## Part 2: Create the Detail CSV

In [16]:
def create_d(survey_num, survey_name, num_part, org_conduct, num_questions, data_link,
             doc_link, source_link, summary):
    '''
    Bundle one survey's metadata into a dict of single-element lists.
    Inputs:
        - survey_num, survey_name, num_part, org_conduct, num_questions,
          data_link, doc_link, source_link, summary: the metadata values
    Output:
        - d: a dict mapping each detail column name to a one-item list
          holding the corresponding argument.
    '''
    column_names = ("Survey_Number", "Survey_Name", "Num_Participants",
                    "Org_Conduct", "Num_Questions", "Data_Link",
                    "Documentation_Link", "Source_Link", "Summary")
    column_values = (survey_num, survey_name, num_part, org_conduct,
                     num_questions, data_link, doc_link, source_link, summary)
    # Each value is wrapped in a list so the dict describes exactly one row.
    return {column: [value] for column, value in zip(column_names, column_values)}

In [17]:
def create_detail_csv(d, name):
    '''
    Write the survey detail dict to a csv file.
    Inputs:
        - d: a dict mapping detail column names to lists of values
        - name: the file name prefix; output goes to name + "_detail.csv"
    Output:
        - None, creates csv.
    '''
    # Fixed column order for the detail file.
    column_order = [
        "Survey_Number", "Survey_Name", "Num_Participants",
        "Org_Conduct", "Num_Questions", "Data_Link",
        "Documentation_Link", "Source_Link", "Summary",
    ]
    output_path = name + "_detail.csv"
    pd.DataFrame(data=d, columns=column_order).to_csv(path_or_buf=output_path, index=False)

In [18]:
# Metadata for the single row of the detail CSV.
survey_num = 9
survey_name = "2015/2016 Chicago Council Leadership Survey"
num_part = 422
org_conduct = "The Chicago Council on Global Affairs"
# Counted from the extracted question tuples rather than hard-coded.
num_questions = len(question_tuples)
data_link = "http://bit.ly/2HerrQE"
doc_link = "http://bit.ly/2oZAbCD"
source_link = "NA"
# NOTE(review): this summary describes a Facebook survey run at the National
# University of Singapore, which does not match survey_name / org_conduct
# above - it looks copied from another notebook; confirm and replace.
# Adjacent string literals are used instead of a backslash continuation inside
# the string, which embedded the next line's indentation in the summary text.
summary = ("A 2015 survey on attitudes towards homosexuality on Facebook, conducted as part "
           "of an undergraduate honors class research project in the National University of Singapore.")

In [19]:
# Bundle the survey metadata defined above into a single dict.
d = create_d(survey_num, survey_name, num_part, org_conduct, num_questions, data_link, 
             doc_link, source_link, summary)

In [20]:
# Write the metadata row to 2016CCS_detail.csv.
create_detail_csv(d, "2016CCS")