## Scan through Chinese Clean Meat Survey Results

1. Import the data and do some light cleaning
2. Questions:
    * A general demographic overview of the respondents
    * Summary statistics on proportion of people for each clean meat attitude question

In [1]:
import os
import pandas as pd

In [2]:
# Project root. Defaults to the original author's local checkout, but can be
# redirected without editing the notebook by setting the CLEAN_MEAT_DIR
# environment variable before starting the kernel.
working_dir = os.environ.get(
    "CLEAN_MEAT_DIR",
    "/Users/angie/Desktop/Code/in_progress/small-projects/clean_meat",
)
# Path of the China survey export, stored under <working_dir>/data.
china_data_fp = os.path.join(working_dir, "data", "Cross+Country+Survey+-+China_October+25%2C+2018_07.11.csv")

In [3]:
# The export is not UTF-8; read it as latin1.
encoding_windows = "latin1"
# File rows 1 and 2 (the two lines right after the header row) are
# administrative rather than survey responses, so they are skipped.
df = pd.read_csv(
    china_data_fp,
    skiprows=[1, 2],
    encoding=encoding_windows,
)
df.head()

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,ExternalReference,DistributionChannel,...,Q9.6_10,Q9.7,Q9.9,Q9.10,Q10.1,Q10.2,assignment_id,test_link,year_of_birth,femaleAs2MaleAs1
0,02/10/2018 19:24,02/10/2018 19:26,0,100,87,1,02/10/2018 19:26,R_ZCPxAZMkmR8PQIx,,anonymous,...,,,,,,,f9ec531d-7d5b-808a-151b-65c74217d8cb,0,1985,2
1,02/10/2018 19:26,02/10/2018 19:31,0,100,284,1,02/10/2018 19:31,R_3sh0r3xRFDEOmU7,,anonymous,...,,20.0,3.0,7.0,,,f3582a8c-97eb-fa24-a18a-1c734120955a,0,1994,1
2,02/10/2018 19:33,02/10/2018 19:34,0,100,60,1,02/10/2018 19:34,R_1iqUVtDeT7Nproh,,anonymous,...,,,,,,,11919e9a-d8ec-440a-689c-a8baf33bd249,0,1987,2
3,02/10/2018 19:31,02/10/2018 19:34,0,100,175,1,02/10/2018 19:34,R_22YsRZfJho3OOwG,,anonymous,...,,,,,,,8e660072-7303-d4d5-9f74-364d2554d6a3,0,1980,2
4,02/10/2018 19:31,02/10/2018 19:35,0,100,284,1,02/10/2018 19:35,R_vCR9O62qw2IrrlT,,anonymous,...,,,,,,,53bae30e-3c00-37ec-4308-d591ffd7bed4,0,1990,1


In [4]:
# Show every column label as a plain list (the table preview elides the middle columns).
[column_label for column_label in df.columns]

['StartDate',
 'EndDate',
 'Status',
 'Progress',
 'Duration (in seconds)',
 'Finished',
 'RecordedDate',
 'ResponseId',
 'ExternalReference',
 'DistributionChannel',
 'UserLanguage',
 'Q1.2',
 'Q2.2_1',
 'Q2.2_2',
 'Q2.2_3',
 'Q2.2_4',
 'Q2.2_5',
 'Q2.2_6',
 'Q2.2_7',
 'Q2.2_8',
 'Q2.3_1',
 'Q2.3_2',
 'Q2.3_3',
 'Q3.1_1',
 'Q3.1_2',
 'Q3.1_3',
 'Q3.1_4',
 'Q3.1_5',
 'Q3.1_6',
 'Q3.1_7',
 'Q3.1_8',
 'Q3.1_9',
 'Q3.1_10',
 'Q3.2',
 'Q4.1_Q4.1_1',
 'Q4.1_Q4.1_2',
 'Q4.1_Q4.1_3',
 'Q4.1_Q4.1_4',
 'Q4.1_Q4.1_5',
 'Q4.1_Q4.1_6',
 'Q4.1_Q4.1_7',
 'Q4.1_Q4.1_8',
 'Q4.1_Q4.1_9',
 'Q4.1_Q4.1_10',
 'Q4.1_Q4.1_11',
 'Q4.1_Q4.1_12',
 'Q4.1_Q4.1_13',
 'Q4.1_Q4.1_14',
 'Q4.1_Q4.1_15',
 'Q4.1_Q4.1_16',
 'Q4.2',
 'Q5.1_1',
 'Q5.1_2',
 'Q5.1_3',
 'Q5.1_4',
 'Q5.1_5',
 'Q5.1_6',
 'Q5.1_7',
 'Q5.1_8',
 'Q5.1_9',
 'Q5.1_10',
 'Q5.1_11',
 'Q5.1_12',
 'Q5.1_13',
 'Q5.1_14',
 'Q5.1_15',
 'Q5.1_16',
 'Q6.1',
 'Q6.2',
 'Q6.4',
 'Q6.5_1',
 'Q6.5_2',
 'Q6.5_3',
 'Q6.5_4',
 'Q6.5_5',
 'Q6.5_6',
 'Q6.5_7',
 'Q6.5_

In [5]:
def get_unique_non_na_answers(df, colname):
    """Return the distinct non-missing values found in one column of ``df``.

    Args:
        df: pandas DataFrame to inspect.
        colname: Label of the column whose values are collected.

    Returns:
        list: Each distinct non-NA value exactly once, in order of first
        appearance in the column. Empty when the column holds no non-NA
        values.

    Raises:
        KeyError: If ``colname`` is not a column of ``df``.
    """
    # Drop missing values *before* de-duplicating: NaN never compares equal to
    # itself, so collecting the raw column into a set() can retain one entry
    # per missing row and yields a hash-dependent ordering.
    return df[colname].dropna().drop_duplicates().tolist()

# Spot-check which answer codes actually occur for one of the Q4.1 items.
get_unique_non_na_answers(df, "Q4.1_Q4.1_16")

[2.0, 3.0, 4.0, 5.0, 1.0]

Data Cleaning Tasks:
1. Clean sex column
2. Map question IDs onto meaningful values

In [6]:
# Recode the numeric sex column into a boolean flag: True where the code is 2,
# False for every other value (a missing value also compares unequal to 2).
sex_code_col = "femaleAs2MaleAs1"
# Guard on the column's presence so the cell can be re-run safely; without it a
# second run fails with KeyError because the source column is already dropped.
if sex_code_col in df.columns:
    df["isFemale"] = df[sex_code_col] == 2
    df = df.drop(columns=sex_code_col)
df.head(2)

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,ExternalReference,DistributionChannel,...,Q9.6_10,Q9.7,Q9.9,Q9.10,Q10.1,Q10.2,assignment_id,test_link,year_of_birth,isFemale
0,02/10/2018 19:24,02/10/2018 19:26,0,100,87,1,02/10/2018 19:26,R_ZCPxAZMkmR8PQIx,,anonymous,...,,,,,,,f9ec531d-7d5b-808a-151b-65c74217d8cb,0,1985,True
1,02/10/2018 19:26,02/10/2018 19:31,0,100,284,1,02/10/2018 19:31,R_3sh0r3xRFDEOmU7,,anonymous,...,,20.0,3.0,7.0,,,f3582a8c-97eb-fa24-a18a-1c734120955a,0,1994,False


In [7]:
# List the answer codes that actually occur for Q6.6 before mapping them to labels.
get_unique_non_na_answers(df, "Q6.6")

[4.0, 2.0, 3.0, 1.0, 5.0]

In [17]:
# Text labels for the numeric answer codes of the question summarised here.
resp_meanings = {
    1: "Not at all likely",
    2: "Somewhat likely",
    3: "Moderately likely",
    4: "Very likely",
    5: "Extremely likely",
}
# Width of the longest label; shorter labels are left-padded to it so the colons line up.
max_len = max(len(label) for label in resp_meanings.values())
clean_meat_q = "Q6.6"
# Respondents who gave a non-missing answer: the denominator for every percentage.
total_responses = sum(1 for answer in df[clean_meat_q] if not pd.isna(answer))

print("Imagine that clean meat has become widely available at grocery stores, restaurants, butchers, and markets. How likely are you to try clean meat?")
print("<Response>: <# respondents> (<Percentage of respondents>)")
print("======================================")
# Report the answer codes from the highest code down to the lowest.
answer_codes = sorted(get_unique_non_na_answers(df, clean_meat_q), reverse=True)
for code in answer_codes:
    label = resp_meanings[code]
    indent = " " * (max_len - len(label))
    n_answered = int((df[clean_meat_q] == code).sum())
    share = 100 * (n_answered / total_responses)
    print("{}{}: {} ({} %)".format(indent, label, n_answered, round(share, 2)))
    

Imagine that clean meat has become widely available at grocery stores, restaurants, butchers, and markets. How likely are you to try clean meat?
<Response>: <# respondents> (<Percentage of respondents>)
 Extremely likely: 188 (18.08 %)
      Very likely: 446 (42.88 %)
Moderately likely: 220 (21.15 %)
  Somewhat likely: 126 (12.12 %)
Not at all likely: 60 (5.77 %)


In [18]:
# Same table as the per-answer summary, but the percentage column accumulates
# from the highest answer code downwards.
print("Imagine that clean meat has become widely available at grocery stores, restaurants, butchers, and markets. How likely are you to try clean meat?")
print("<Response>: <# respondents> (<Cumulative Percentage of Respondents>)")
print("======================================")
ordered_codes = sorted(get_unique_non_na_answers(df, clean_meat_q), reverse=True)
total_perc = 0
for code in ordered_codes:
    n_answered = int((df[clean_meat_q] == code).sum())
    # Running share of respondents who chose this answer or a higher code.
    total_perc += 100 * (n_answered / total_responses)
    label = resp_meanings[code]
    indent = " " * (max_len - len(label))
    print("{}{}: {} ({} %)".format(indent, label, n_answered, round(total_perc, 2)))

Imagine that clean meat has become widely available at grocery stores, restaurants, butchers, and markets. How likely are you to try clean meat?
<Response>: <# respondents> (<Cumulative Percentage of Respondents>)
 Extremely likely: 188 (18.08 %)
      Very likely: 446 (60.96 %)
Moderately likely: 220 (82.12 %)
  Somewhat likely: 126 (94.23 %)
Not at all likely: 60 (100.0 %)
