# Part 1: Extract variable descriptions

In [1]:
import pandas as pd
import PyPDF2 as Pypdf
import re

In [2]:
def getPDFContent(path, first_page, last_page):
    content = []
    pdf = Pypdf.PdfFileReader(path)
    for i in range(first_page, last_page):
        content += pdf.getPage(i).extractText().splitlines()
    return content

In [3]:
text = getPDFContent('GSS_Codebook_index.pdf', 0, 116)
text

['Page 1Index',
 'INDEX TO DATA SET (by Data Order)MnemonicMnemonic description',
 'YEARGSS YEAR FOR THIS RESPONDENT                       ',
 'IDRESPONDENT ID NUMBER                                ',
 '',
 'WRKSTATLABOR FORCE STATUS',
 '',
 'HRS1NUMBER OF HOURS WORKED LAST WEEK',
 '',
 'HRS2NUMBER OF HOURS USUALLY WORK A WEEK',
 '',
 'EVWORKEVER WORK AS LONG AS ONE YEAR',
 '',
 'OCCRS CENSUS OCCUPATION CODE (1970)',
 '',
 'PRESTIGERS OCCUPATIONAL PRESTIGE SCORE  (1970)',
 '',
 'WRKSLFR SELF-EMP OR WORKS FOR SOMEBODY',
 '',
 'WRKGOVTGOVT OR PRIVATE EMPLOYEE',
 '',
 'COMMUTETRAVEL TIME TO WORK',
 '',
 'INDUSTRYRS INDUSTRY CODE   (1970)',
 '',
 'OCC80RS CENSUS OCCUPATION CODE (1980)',
 '',
 'PRESTG80RS OCCUPATIONAL PRESTIGE SCORE  (1980)',
 '',
 'INDUS80RS INDUSTRY CODE   (1980)',
 '',
 'INDUS07INDUSTRY CODE BASED ON NAICS 2007OCCONETOCCUPATION CODE BASED ON O*NETFOUNDCASE WAS RETRIEVABLE IN 2012',
 'OCC10RS CENSUS OCCUPATION CODE (2010)',
 '',
 "OCCINDVR'S OCC/IND VERBATIM RETRIEVED",
 

In [4]:
df1 = pd.read_csv("GSS2012.csv")
df2 = pd.read_csv("GSS2014.csv")
df3 = pd.read_csv("GSS2016.csv")

In [5]:
def get_matches(df, text_list):
    '''
    Gets the matches of variable name and text for the variables that are in the data passed.
    '''
    question_tuples = set()
    for col in df.columns:
        for text in text_list:
            if text[0:len(col)] == col:
                atuple = (text[0:len(col)], text[len(col):])
                question_tuples.add(atuple)
    return question_tuples

In [6]:
gss2012 = get_matches(df1, text)
gss2014 = get_matches(df2, text)
gss2016 = get_matches(df3, text)

In [7]:
len(gss2012)

1196

In [8]:
cleaned_data = pd.DataFrame()
survey_list = [(gss2012, "General Social Survey 2012"), (gss2014, "General Social Survey 2014"), \
               (gss2016, "General Social Survey 2016")]

survey_num = 4
for survey in survey_list:
    survey_num += 1
    for tup in survey[0]:
        var_name = tup[0]
        question = tup[1]
        cleaned_data = cleaned_data.append([[survey_num, var_name, \
                                             question]], ignore_index=True)

In [9]:
cleaned_data.columns = ["Survey_Number", "Var_Name", "Var_Text"]

In [10]:
cleaned_data.head()

Unnamed: 0,Survey_Number,Var_Name,Var_Text
0,5,OTHCREDT,OTHER PEOPLE TAKE CREDIT FOR RS WORK OR IDEAS
1,5,FRNDSEX,R HAD SEX WITH FRIEND LAST YEAR
2,5,SEX,SEX5SEX OF SEX PARTNERS LAST FIVE YEARS
3,5,WORDB,WORD B
4,5,WKAGEISM,R FEELS DISCRIMINATED BECAUSE OF AGE


In [14]:
def make_csv(survey_num, year):
    df = cleaned_data[cleaned_data["Survey_Number"] == survey_num]
    df = df.iloc[:, 1:3]
    print(df)
    name = "GSS_cleaned_" + str(year) + ".csv" 
    df.to_csv(path_or_buf=name, index=False)

In [15]:
make_csv(5, 2012)
make_csv(6, 2014)
make_csv(7, 2016)

       Var_Name                                           Var_Text
0      OTHCREDT      OTHER PEOPLE TAKE CREDIT FOR RS WORK OR IDEAS
1       FRNDSEX                    R HAD SEX WITH FRIEND LAST YEAR
2           SEX            SEX5SEX OF SEX PARTNERS LAST FIVE YEARS
3         WORDB                                             WORD B
4      WKAGEISM               R FEELS DISCRIMINATED BECAUSE OF AGE
5      CONFINAN           CONFID IN BANKS & FINANCIAL INSTITUTIONS
6         ENGDA                         HAPPY IF DAUGHTER ENGINEER
7     MAOCCINDV                MOTHER'S OCC/IND VERBATIM RETRIEVED
8          RANK                      10RS SOCIAL RANK 10 YEARS AGO
9       ENGPROB                          ENGINEERS HELP SOLVE PROB
10       RELSP1    1RELATIONSHIP OF PERSON 11 TO SPOUSE OF HOUSEHO
11       RELSP1    3RELATIONSHIP OF PERSON 13 TO SPOUSE OF HOUSEHO
12      EVCRACK                           R EVER USE CRACK COCAINE
13      RELHHD1           RELATION OF 1ST PERSON TO HOUSEHOLD 

In [16]:
cleaned_data.to_csv(path_or_buf="GSS_cleaned.csv")

In [17]:
len(cleaned_data[cleaned_data["Survey_Number"] == 5])

1196

# Part 2: Create Survey Detail Description

In [18]:
summary_list = ["The GSS gathers data on contemporary American society in order to monitor and explain trends and constants in attitudes, behaviors, and attributes. The GSS contains a standard core of demographic, behavioral, and attitudinal questions, plus topics of special interest. Among the topics covered are civil liberties, crime and violence, intergroup tolerance, morality, national spending priorities, psychological well-being, social mobility, and stress and traumatic events. Altogether the GSS is the single best source for sociological and attitudinal trend data covering the United States. It allows researchers to examine the structure and functioning of society in general as well as the role played by relevant subgroups and to compare the United States to other nations."] * 3

In [19]:
d = {"Survey_Number": cleaned_data["Survey_Number"].unique(), 
    "Survey_Name": ["General Social Survey 2012", "General Social Survey 2014", \
                    "General Social Survey 2016"],
    "Num_Participants": [1500, 1500, 1500], 
    "Org_Conduct": ["NORC", "NORC", "NORC"], 
    "Num_Questions": [len(cleaned_data[cleaned_data["Survey_Number"] == 5]), 
                    len(cleaned_data[cleaned_data["Survey_Number"] == 6]),
                    len(cleaned_data[cleaned_data["Survey_Number"] == 7])], 
    "Data_Link": ["http://bit.ly/2I1BhXu", "http://bit.ly/2I1BhXu", "http://bit.ly/2I1BhXu"],
    "Documentation_Link": ["http://bit.ly/2F4F1cG", "http://bit.ly/2F4F1cG", "http://bit.ly/2F4F1cG"],\
    "Source_Link": ["http://gss.norc.org", "http://gss.norc.org", "http://gss.norc.org"],\
    "Summary": summary_list}

In [20]:
detail = pd.DataFrame(data=d, columns = ["Survey_Number", "Survey_Name", "Num_Participants", \
                      "Org_Conduct", "Num_Questions", "Data_Link", "Documentation_Link", "Source_Link", \
                      "Summary"])

In [21]:
detail

Unnamed: 0,Survey_Number,Survey_Name,Num_Participants,Org_Conduct,Num_Questions,Data_Link,Documentation_Link,Source_Link,Summary
0,5,General Social Survey 2012,1500,NORC,1196,http://bit.ly/2I1BhXu,http://bit.ly/2F4F1cG,http://gss.norc.org,The GSS gathers data on contemporary American ...
1,6,General Social Survey 2014,1500,NORC,1250,http://bit.ly/2I1BhXu,http://bit.ly/2F4F1cG,http://gss.norc.org,The GSS gathers data on contemporary American ...
2,7,General Social Survey 2016,1500,NORC,1315,http://bit.ly/2I1BhXu,http://bit.ly/2F4F1cG,http://gss.norc.org,The GSS gathers data on contemporary American ...


In [23]:
detail.to_csv(path_or_buf="GSS_Survey_Detail.csv", index=False)