### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]).lower(),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Something went wrong with the network call
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total o

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

162287


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df

Unnamed: 0,educational_level,gender,hit_answered_date,hit_creation_date,household_income,household_size,languages_spoken,location_city,location_country,location_region,marital_status,post_to_completion_secs,time_spent_on_mturk,weekly_income_from_mturk,worker_id,year_of_birth
0,"Graduate degree, Masters",female,2019-11-04 15:42:46.243,2019-11-04 15:42:01,"$40,000-$59,999",2,English,nelsonville,US,oh,married,45.0,1-2 hours per week,$10-$20 per week,6829a46cd608e93e001495e92d8e8e03,1991
1,"Some college, no degree",female,2019-11-04 15:29:09.796,2019-11-04 15:27:03,"$40,000-$59,999",2,English,muskegon,US,mi,married,126.0,20-40 hours per week,$100-$200 per week,c611d3dbe42440d5235b403b0992df41,1953
2,Bachelors degree,male,2019-11-04 15:28:41.148,2019-11-04 14:12:00,"$100,000 or more",5+,English,easton,US,ma,single,4601.0,2-4 hours per week,$5-$10 per week,f44efb390a33d953e3b4c2e2be61ec13,1998
3,Associates degree,male,2019-11-04 15:25:24.796,2019-11-04 14:27:01,"$75,000-$99,999",4,English,owensboro,US,ky,married,3503.0,4-8 hours per week,$20-$50 per week,9c48acb5defef619a4f8b110fad1a0b9,1980
4,Bachelors degree,male,2019-11-04 14:57:48.581,2019-11-04 14:57:02,"$40,000-$59,999",3,English,auburn,US,wa,single,46.0,More than 40 hours per week,More than $500 per week,5bb42cfb6cc0c9a89b2cca5dd55ab1cd,1990
5,"Graduate degree, Masters",female,2019-11-04 14:47:03.228,2019-11-04 14:42:01,"Less than $10,000",4,English,chennai,IN,tn,single,302.0,8-20 hours per week,$5-$10 per week,c2f2e12f9e1008a7d66a15f26f956e2e,1994
6,"Graduate degree, Masters",female,2019-11-04 14:05:24.898,2019-11-04 13:57:00,"$60,000-$74,999",4,English,elk grove village,US,il,married,504.0,20-40 hours per week,$20-$50 per week,392aa74246af8104909e71633fae54ff,1987
7,Bachelors degree,male,2019-11-04 13:59:16.621,2019-11-04 12:57:01,"Less than $10,000",5+,English,chennai,IN,tn,single,3735.0,2-4 hours per week,$1-$5 per week,17d78291ca22362431450dd85d3a6907,1989
8,Bachelors degree,female,2019-11-04 13:48:44.715,2019-11-04 13:42:02,"$25,000-$39,999",4,English,new york,US,ny,married,402.0,4-8 hours per week,$1-$5 per week,face85768001595516a12458cc9482b6,1969
9,Bachelors degree,male,2019-11-04 13:40:40.385,2019-11-04 13:27:02,"$25,000-$39,999",3,"English,Hindi,Malayalam",thrissur,IN,kl,single,818.0,2-4 hours per week,$5-$10 per week,b718cd512deb7f8c9080a74a4d08afec,1987


In [6]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

81220

In [7]:
df.location_country.value_counts()

US    117855
IN     30168
CA      2229
GB      1558
IT       697
DE       609
BR       601
PH       514
VE       464
FR       436
ES       340
ZZ       321
MX       301
AU       248
KE       208
AE       193
RO       178
NL       173
TR       171
JP       163
NG       162
TH       156
ID       151
RU       150
PT       146
GR       142
IE       137
MK       134
UA       119
NZ       115
       ...  
SR         2
DJ         2
AM         2
BS         2
PY         2
TM         2
AZ         2
LI         2
MO         2
RW         2
MV         2
SX         2
AI         1
HT         1
BM         1
LA         1
FJ         1
GN         1
PS         1
TZ         1
GM         1
AG         1
BU         1
IM         1
CD         1
FM         1
PF         1
UZ         1
NE         1
CN         1
Name: location_country, Length: 156, dtype: int64

In [8]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    7226
US    6674
CA     439
IT     375
BR     339
DE     297
GB     189
FR     187
ES     182
VE     146
MX     138
RO      66
NL      64
PT      55
KE      51
PH      48
CO      48
RU      44
JP      39
AE      38
GR      37
MK      34
EG      34
AU      33
TR      32
BE      31
BG      30
BD      30
NG      30
LT      29
      ... 
ZW       5
BA       5
HU       4
NO       4
MT       4
CH       4
KW       4
ZA       4
IQ       3
JO       3
LV       3
SK       3
CY       3
NI       3
JM       2
MD       2
GE       2
AL       2
MN       2
OM       2
CW       2
UG       1
NA       1
FM       1
RW       1
UY       1
MU       1
MV       1
ME       1
SR       1
Name: location_country, Length: 106, dtype: int64

In [9]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

9080

In [10]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

38412

In [11]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    56665
IN    15142
CA     1295
GB      971
IT      415
BR      377
DE      356
PH      220
VE      220
ES      204
FR      196
MX      163
ZZ      155
KE      132
AU      129
NL      100
NG       83
TH       80
JP       78
IE       76
RO       74
UA       71
ID       68
AE       65
PT       65
CO       60
TR       59
RU       51
BD       50
PK       47
      ...  
NA        2
TJ        2
GD        2
RW        2
GY        2
ZM        2
JE        2
KG        2
CM        2
ME        2
MD        2
TM        2
SX        2
AL        2
LU        1
BN        1
FM        1
UY        1
MU        1
UZ        1
DJ        1
NE        1
ET        1
MV        1
GN        1
SR        1
KN        1
BY        1
AM        1
MO        1
Name: location_country, Length: 134, dtype: int64

In [12]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    6176
US      96
AE      21
CA      12
GB      11
ZZ       9
SA       8
QA       4
AU       4
FR       4
DE       3
NO       3
BH       3
KW       2
SG       2
LK       2
UA       2
PL       2
OM       1
MV       1
SE       1
CO       1
HK       1
IT       1
IQ       1
PK       1
NG       1
MY       1
CZ       1
JP       1
Name: location_country, dtype: int64

In [13]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    3551
IN     192
ES     190
VE     168
BR     157
MX     139
IT      75
CO      49
CA      48
GB      44
FR      36
DE      30
PR      29
PT      26
AR      26
DO      18
CL      18
PE      17
EC      15
BE      14
JP      11
NL      10
ZZ       9
CR       8
GR       8
SV       7
PA       7
MA       6
AU       6
TH       6
      ... 
IE       3
KR       3
GH       3
RU       2
QA       2
UA       2
SE       2
TW       2
PH       2
JO       2
BG       2
CW       2
UY       2
EE       2
JM       2
LB       2
CH       2
SA       1
NP       1
CZ       1
FI       1
RS       1
BD       1
BH       1
AL       1
MY       1
AE       1
MK       1
PK       1
MD       1
Name: location_country, Length: 72, dtype: int64

In [14]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    2921
US     184
CA      31
AE      29
BD      20
GB      19
SA      11
NP       9
SE       9
BR       7
DE       7
PH       6
IT       5
PK       5
BH       4
UA       4
NZ       3
QA       3
KW       3
NO       3
KR       2
JP       2
HK       2
AU       2
SG       2
OM       2
ZZ       1
IQ       1
GH       1
DK       1
ID       1
Name: location_country, dtype: int64

In [15]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    874
CA    287
FR    208
IN    185
IT     64
DE     63
GB     63
BE     30
ES     26
KE     22
EG     20
NG     20
MA     18
BR     18
DZ     18
BH     17
NL     15
IE     12
PT     10
RO      8
TN      7
VE      5
TR      5
PA      5
IL      4
MX      4
CO      4
JP      4
GH      4
RS      4
     ... 
AR      2
BG      2
HU      2
FI      2
SA      2
KR      2
BD      2
CZ      1
IQ      1
AT      1
FM      1
JO      1
DO      1
NI      1
PL      1
AE      1
RW      1
ID      1
MU      1
UG      1
UA      1
CR      1
RU      1
AL      1
HK      1
AU      1
MD      1
BA      1
MT      1
PK      1
Name: location_country, Length: 66, dtype: int64

In [16]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    1540
AE      28
US      24
SA      10
CA       6
QA       4
KW       3
GB       3
BH       3
ZZ       2
OM       2
MV       1
CO       1
BN       1
GT       1
IQ       1
Name: location_country, dtype: int64

In [17]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    713
US     30
GB      7
SE      3
ZZ      2
PL      2
JP      1
BH      1
ID      1
CA      1
AU      1
FR      1
Name: location_country, dtype: int64

In [18]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    418
DE    304
IN     48
GB     30
IT     28
FR     19
BE     18
NL     17
ES     16
IE     12
LB     12
CA     11
FI     10
AT      9
MX      6
CO      6
DK      5
TR      5
BR      4
HR      4
AU      4
MK      3
TH      3
RO      2
GR      2
AR      2
VE      2
SI      2
SV      1
AL      1
BA      1
JP      1
PL      1
PH      1
ZA      1
GT      1
NO      1
ID      1
BO      1
VN      1
TN      1
RS      1
ZZ      1
CY      1
CH      1
KE      1
LT      1
EG      1
SE      1
Name: location_country, dtype: int64

In [19]:
df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts()

US    683
IN     83
CA     82
HK     16
MY     14
GB     11
TW      6
AU      5
IT      5
KR      4
PH      4
SG      4
DE      3
MX      3
DK      3
ES      3
JP      2
FR      2
ZZ      2
NL      2
ID      2
BR      2
LV      1
BE      1
AE      1
TH      1
GH      1
MN      1
AR      1
NZ      1
ZA      1
PK      1
Name: location_country, dtype: int64

In [20]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    39
RS     4
US     2
BA     2
IT     1
Name: location_country, dtype: int64

In [21]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [22]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

            lang  unique_workers
44       English           37837
25       Spanish            3051
67         Tamil            2491
68         Hindi            1488
27        French            1258
64         Malay             686
81        German             673
11     Malayalam             666
76       Chinese             568
32       Italian             535
97    Portuguese             476
91        Tegulu             327
40      Japanese             248
45       Russian             234
41        Arabic             227
17       Kannada             173
37        Korean             162
88       Tagalog             157
77          Urdu             131
87    Vietnamese             112
7        Marathi             109
54         Dutch             105
99      Gujarati             102
83       Bengali              96
4        Punjabi              90
89        Polish              84
65       Turkish              75
8       Romanian              65
19         Greek              61
49        

In [23]:
df_cnt.unique_workers.value_counts().sort_index()

1        14
2         7
3         4
4         2
5         4
6         6
7         1
8         1
9         1
10        1
11        1
12        1
13        3
14        1
15        2
16        2
17        1
19        1
20        1
21        3
22        1
23        1
24        4
25        2
27        1
30        1
33        1
36        1
39        1
41        1
43        1
61        1
65        1
75        1
84        1
90        1
96        1
102       1
105       1
109       1
112       1
131       1
157       1
162       1
173       1
227       1
234       1
248       1
327       1
476       1
535       1
568       1
666       1
673       1
686       1
1258      1
1488      1
2491      1
3051      1
37837     1
Name: unique_workers, dtype: int64

In [24]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

5021

In [25]:
1189 / 19268 * 170000

10490.450487855513

In [26]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [27]:
from scipy.stats import spearmanr

In [28]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.7419647128381073)