# Part 1: Extract variable descriptions

In [2]:
# Code Ownership: Original (for this page)
import pandas as pd
import PyPDF2 as Pypdf
import re

In [3]:
# Used in def getPDFContent(path, first_page, last_page):
    '''
    Given a pdf path and first and last page, return the content as a list of lines in the pdf.
    Inputs:
        - pdf path (string)
        - first page, last page (ints)
    Output:
        - content (list of string lineslines)
    '''
    content = []
    pdf = Pypdf.PdfFileReader(path)
    for i in range(first_page, last_page):
        content += pdf.getPage(i).extractText().splitlines()
    return content

In [4]:
text = getPDFContent('GSS_Codebook_index.pdf', 0, 116)
text[:10]

['Page 1Index',
 'INDEX TO DATA SET (by Data Order)MnemonicMnemonic description',
 'YEARGSS YEAR FOR THIS RESPONDENT                       ',
 'IDRESPONDENT ID NUMBER                                ',
 '',
 'WRKSTATLABOR FORCE STATUS',
 '',
 'HRS1NUMBER OF HOURS WORKED LAST WEEK',
 '',
 'HRS2NUMBER OF HOURS USUALLY WORK A WEEK']

In [5]:
df1 = pd.read_csv("GSS2012.csv")
df2 = pd.read_csv("GSS2014.csv")
df3 = pd.read_csv("GSS2016.csv")

In [6]:
def get_matches(df, text_list):
    '''
    Given dataframe and text list, gets the matches of variable name 
    and text for the variables that are in the data.
    Inputs:
        - df: dataframe of csv data
        - text_list: content (list of string lineslines)
    '''
    question_tuples = set()
    for col in df.columns:
        for text in text_list:
            if text[0:len(col)] == col:
                atuple = (text[0:len(col)], text[len(col):])
                question_tuples.add(atuple)
    return question_tuples

In [7]:
gss2012 = get_matches(df1, text)
gss2014 = get_matches(df2, text)
gss2016 = get_matches(df3, text)

In [8]:
# Create the dataframe for all three surveys, incrementing the survey count by 1 each time.
cleaned_data = pd.DataFrame()
survey_list = [(gss2012, "General Social Survey 2012"), (gss2014, "General Social Survey 2014"), \
               (gss2016, "General Social Survey 2016")]
survey_num = 4
for survey in survey_list:
    survey_num += 1
    for tup in survey[0]:
        var_name = tup[0]
        question = tup[1]
        cleaned_data = cleaned_data.append([[survey_num, var_name, \
                                             question]], ignore_index=True)
cleaned_data.columns = ["Survey_Number", "Var_Name", "Var_Text"]
cleaned_data.head()

Unnamed: 0,Survey_Number,Var_Name,Var_Text
0,5,AGE,4AGE OF PERSON #4
1,5,FAIR,5PEOPLE FAIR OR NOT
2,5,FEJOBAFF,FOR OR AGAINST PREFERENTIAL HIRING OF WOMEN
3,5,HAPPY7,HOW HAPPY R IS
4,5,AGE,2AGE OF PERSON #2


In [9]:
def make_csv(survey_num, year):
    '''
    Given a survey number, make a csv of var_name and var_text for just that survey number.
    Used in OLS Animal notebook as well.
    Inputs:
        - survey_num (int)
    Output:
        - None, creates a csv
    '''
    df = cleaned_data[cleaned_data["Survey_Number"] == survey_num]
    df = df.iloc[:, 1:3]
    name = "GSS_cleaned_" + str(year) + ".csv" 
    df.to_csv(path_or_buf=name, index=False)

In [10]:
make_csv(5, 2012)
make_csv(6, 2014)
make_csv(7, 2016)

In [12]:
# Create a csv of all the data for reference.
cleaned_data.to_csv(path_or_buf="GSS_cleaned.csv", index=False)

# Part 2: Create Survey Detail Description

In [13]:
# Create a function to create a dictionary from a list of variables. This is used in other notebooks
# to create the detail csv.
def create_d_list(survey_num, survey_name, num_part, org_conduct, num_questions, data_link, 
             doc_link, source_link, summary):
    '''
    Given information for a dictionary with list values, make the detail dictionary.
    Inputs:
        - survey_num: A number for the survey (list of ints)
        - survey_name: Name of survey (list of strings)
        - num_part: Number of participants in the survey (list of ints)
        - org_conduct: Name of organization (list of strings)
        - num_questions: Number of questions in the survey (list of ints)
        - data_link: Link to data (list of strings)
        - doc_link: Link to documentation (list of strings)
        - source_link: Link to source (list of strings)
        - summary: Summary of survey (list of strings)
    Output:
        - python dictionary that stores the values.
    '''
    d = {"Survey_Number": survey_num,
         "Survey_Name": survey_name, 
         "Num_Participants": num_part,
         "Org_Conduct": org_conduct, 
         "Num_Questions": num_questions,
         "Data_Link": data_link,
         "Documentation_Link": doc_link,
         "Source_Link": source_link,
         "Summary": summary}
    return d

In [14]:
# Create a function to create a csv given a dictionaries. This is used in other notebooks
# to create the detail csv.
def create_detail_csv(d, name):
    '''
    Given a dictionary (created above) and a survey name, create an unindexed csv 
    with those details.
    Inputs:
        - d: a python dictionary
        - name: name of the survey (string)
    Output:
        - None, but creates a csv.
    '''
    detail = pd.DataFrame(data=d, columns = ["Survey_Number", "Survey_Name", "Num_Participants",
                            "Org_Conduct", "Num_Questions", "Data_Link", "Documentation_Link",
                            "Source_Link", "Summary"])
    detail.to_csv(path_or_buf=name + "_detail.csv", index=False)

In [16]:
survey_num = cleaned_data["Survey_Number"].unique()
survey_name = ["General Social Survey 2012", "General Social Survey 2014", \
            "General Social Survey 2016"]
num_part = [1500, 1500, 1500]
org_conduct = ["NORC", "NORC", "NORC"]
num_questions = [len(cleaned_data[cleaned_data["Survey_Number"] == 5]), 
                 len(cleaned_data[cleaned_data["Survey_Number"] == 6]),
                 len(cleaned_data[cleaned_data["Survey_Number"] == 7])]
data_link = ["http://bit.ly/2I1BhXu", "http://bit.ly/2I1BhXu", "http://bit.ly/2I1BhXu"]
doc_link = ["http://bit.ly/2F4F1cG", "http://bit.ly/2F4F1cG", "http://bit.ly/2F4F1cG"]
source_link = ["http://gss.norc.org", "http://gss.norc.org", "http://gss.norc.org"]
summary = ["The GSS gathers data on contemporary American society in order to monitor and explain \
    trends and constants in attitudes, behaviors, and attributes. The GSS contains a standard core \
    of demographic, behavioral, and attitudinal questions, plus topics of special interest. Among \
    the topics covered are civil liberties, crime and violence, intergroup tolerance, morality, \
    national spending priorities, psychological well-being, social mobility, and stress and traumatic \
    events. Altogether the GSS is the single best source for sociological and attitudinal trend data \
    covering the United States. It allows researchers to examine the structure and functioning of \
    society in general as well as the role played by relevant subgroups and to compare the United \
    States to other nations."] * 3

In [18]:
d = create_d_list(survey_num, survey_name, num_part, org_conduct, num_questions, data_link, 
             doc_link, source_link, summary)

In [19]:
create_detail_csv(d, "GSS")