# Demos for analyzing World Color Survey (WCS)

COG 260: Data, Computation, and The Mind (Yang Xu)

Data source: http://www1.icsi.berkeley.edu/wcs/data.html

______________________________________________

Import helper function file for WCS data analysis.

In [1]:
from wcs_helper_functions import *

Import relevant Python libraries.

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from random import random
%matplotlib inline

## Demo 3: Import color naming data
    
> Each of the 330 color chips was named by speakers of 110 different languages.

______________________________________________

Load naming data. 

`namingData` is a hierarchical dictionary organized as follows:

**language _(1 - 110)_ &rarr; speaker _(1 - *range varies per language*)_ &rarr; chip index _(1 - 330)_ &rarr; color term**

In [3]:
namingData = readNamingData('term.txt')

For example, to obtain naming data from language 1 and speaker 1 for all 330 color chips:

In [4]:
#namingData[1]

In [73]:
naming_df = pd.DataFrame.from_dict(namingData, orient= 'index').reset_index()
naming_df = naming_df.rename(columns = {'index':'Language'})
naming_df.head(4)
#naming_df = naming_df.fillna(0)


Unnamed: 0,Language,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1,"{1: 'LB', 2: 'LB', 3: 'LE', 4: 'WK', 5: 'LF', ...","{1: 'LB', 2: 'F', 3: 'LE', 4: 'WK', 5: 'F', 6:...","{1: 'F', 2: 'F', 3: 'WK', 4: 'LB', 5: 'F', 6: ...","{1: 'LB', 2: 'LF', 3: 'LE', 4: 'WK', 5: 'LF', ...","{1: 'LF', 2: 'F', 3: 'LE', 4: 'WK', 5: 'LF', 6...","{1: 'G', 2: 'F', 3: 'LE', 4: 'WK', 5: 'F', 6: ...","{1: 'G', 2: 'G', 3: 'LE', 4: 'S', 5: 'G', 6: '...","{1: 'G', 2: 'WK', 3: 'LE', 4: 'WK', 5: 'F', 6:...","{1: 'G', 2: 'F', 3: 'LE', 4: 'LB', 5: 'F', 6: ...",...,,,,,,,,,,
1,2,"{1: 'YN', 2: 'YN', 3: 'NR', 4: 'TK', 5: 'YN', ...","{1: 'IT', 2: 'AT', 3: 'NR', 4: 'IR', 5: 'YN', ...","{1: 'PN', 2: 'PN', 3: 'NR', 4: 'PN', 5: 'YN', ...","{1: 'IT', 2: 'EP', 3: 'NR', 4: 'TK', 5: 'IT', ...","{1: 'IT', 2: 'YN', 3: 'NR', 4: 'TK', 5: 'IT', ...","{1: 'IT', 2: 'YN', 3: 'NR', 4: 'TK', 5: 'YN', ...","{1: 'TK', 2: 'EP', 3: 'NR', 4: 'IT', 5: 'IT', ...","{1: 'IT', 2: 'YN', 3: 'NR', 4: 'KR', 5: 'IT', ...","{1: 'YN', 2: 'MP', 3: 'NR', 4: 'TK', 5: 'TK', ...",...,,,,,,,,,,
2,3,"{1: '*', 2: '*', 3: 'ED', 4: '*', 5: '*', 6: '...","{1: 'ID', 2: 'EL', 3: 'AA', 4: 'NG', 5: 'ID', ...","{1: 'AA', 2: '*', 3: 'AT', 4: 'PA', 5: 'BA', 6...","{1: 'BA', 2: 'AA', 3: 'AT', 4: 'NG', 5: 'AA', ...","{1: 'LU', 2: 'EL', 3: 'AT', 4: 'NG', 5: 'GA', ...","{1: 'AA', 2: 'BA', 3: 'AT', 4: 'IN', 5: 'AA', ...","{1: 'BA', 2: 'ID', 3: 'ED', 4: 'PA', 5: '*', 6...","{1: 'AA', 2: 'ID', 3: 'AT', 4: 'BA', 5: 'IN', ...","{1: 'BA', 2: 'EL', 3: 'AT', 4: 'NG', 5: 'AA', ...",...,,,,,,,,,,
3,4,"{1: 'CE', 2: 'TA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'CE', 2: 'SA', 3: 'KA', 4: 'TX', 5: 'SA', ...","{1: 'TX', 2: 'SA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'CE', 2: 'TA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'SP', 2: 'SW', 3: 'KA', 4: 'CY', 5: 'SA', ...","{1: 'CE', 2: 'SA', 3: 'CY', 4: 'TX', 5: 'KA', ...","{1: 'SP', 2: 'KA', 3: 'LA', 4: 'TX', 5: 'KA', ...","{1: 'XE', 2: 'SA', 3: 'CY', 4: 'TX', 5: 'SA', ...","{1: 'SP', 2: 'SA', 3: 'LA', 4: 'KE', 5: 'SA', ...",...,"{1: 'CE', 2: 'TA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'CE', 2: 'SA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'XE', 2: 'SA', 3: 'CY', 4: 'KE', 5: 'TA', ...","{1: 'XE', 2: 'CE', 3: 'CY', 4: 'TA', 5: 'SA', ...","{1: 'CE', 2: 'TA', 3: 'CY', 4: 'CY', 5: 'TA', ...","{1: 'TA', 2: 'SP', 3: 'LA', 4: 'TX', 5: 'TA', ...","{1: '*', 2: 'TA', 3: 'CY', 4: 'TX', 5: 'TA', 6...","{1: 'TA', 2: 'TA', 3: 'CY', 4: 'TX', 5: 'TA', ...","{1: 'TA', 2: 'TA', 3: 'CY', 4: 'KA', 5: 'TA', ...","{1: 'TA', 2: 'SW', 3: 'CY', 4: 'CY', 5: 'TA', ..."


In [35]:
#the count includes the 1st column 
naming_df.count(axis =1)

0      26
1      25
2      26
3      36
4       7
       ..
105    26
106    26
107    26
108    26
109    26
Length: 110, dtype: int64

In [36]:
#so the actual sum of the total speaker would be  
sum(naming_df.count(axis =1))-110

2616

In [6]:
sp_of_each_lang = [] # includes corresponding language values with the colour codes 

for index, rows in naming_df.iterrows():
    #print(type(rows))
    row = pd.DataFrame(rows).dropna()
    for i, r in row.iterrows():
        #print(r) 
        sp_of_each_lang.append(r.iloc[0])
        
    #print("---{}---".format(index))


In [74]:
sp_of_each_lang[:4]

[1,
 {1: 'LB',
  2: 'LB',
  3: 'LE',
  4: 'WK',
  5: 'LF',
  6: 'LE',
  7: 'F',
  8: 'LE',
  9: 'LE',
  10: 'LB',
  11: 'LB',
  12: 'F',
  13: 'LB',
  14: 'LB',
  15: 'LF',
  16: 'LF',
  17: 'LE',
  18: 'LB',
  19: 'LF',
  20: 'LB',
  21: 'LE',
  22: 'LF',
  23: 'LF',
  24: 'LB',
  25: 'LB',
  26: 'LB',
  27: 'LB',
  28: 'LF',
  29: 'LE',
  30: 'LB',
  31: 'LE',
  32: 'LF',
  33: 'LE',
  34: 'LB',
  35: 'LB',
  36: 'LE',
  37: 'LB',
  38: 'LB',
  39: 'LE',
  40: 'LB',
  41: 'LB',
  42: 'LE',
  43: 'F',
  44: 'LB',
  45: 'LF',
  46: 'LB',
  47: 'LF',
  48: 'LB',
  49: 'LE',
  50: 'LB',
  51: 'F',
  52: 'LF',
  53: 'LE',
  54: 'LB',
  55: 'LB',
  56: 'LE',
  57: 'LB',
  58: 'F',
  59: 'LF',
  60: 'LB',
  61: 'LE',
  62: 'F',
  63: 'LE',
  64: 'LB',
  65: 'LE',
  66: 'LF',
  67: 'F',
  68: 'LE',
  69: 'F',
  70: 'LF',
  71: 'F',
  72: 'F',
  73: 'F',
  74: 'LF',
  75: 'LB',
  76: 'LE',
  77: 'LB',
  78: 'LF',
  79: 'F',
  80: 'LB',
  81: 'F',
  82: 'LB',
  83: 'LF',
  84: 'LE',
  85: 'LB'

In [18]:
len(sp_of_each_lang) 

2726

In [24]:
sp_of_each_lg = [] # only includes the colour code doesnt include corresponding language values
unique = []

for i in range(len(sp_of_each_lang)): 
    if isinstance(sp_of_each_lang[i], dict):
        #print(i)
        il = list(sp_of_each_lang[i].values())
        unqe = len(set(il))
        sp_of_each_lg.append(il) 
        unique.append(unqe)
        

print(sp_of_each_lang[-1])
print(unique[:10])

{1: 'G', 2: 'G', 3: 'R', 4: 'P', 5: 'G', 6: 'W', 7: 'G', 8: 'P', 9: 'P', 10: 'BL', 11: 'B', 12: 'Y', 13: 'G', 14: 'B', 15: 'P', 16: 'W', 17: 'R', 18: 'B', 19: 'G', 20: 'G', 21: 'R', 22: 'Y', 23: 'P', 24: 'B', 25: 'G', 26: 'BL', 27: 'W', 28: 'R', 29: 'G', 30: 'Y', 31: 'G', 32: 'P', 33: 'G', 34: 'G', 35: 'Y', 36: 'B', 37: 'BL', 38: 'R', 39: 'G', 40: 'G', 41: 'R', 42: 'Y', 43: 'B', 44: 'W', 45: 'G', 46: 'B', 47: 'G', 48: 'Y', 49: 'G', 50: 'P', 51: 'Y', 52: 'W', 53: 'R', 54: 'G', 55: 'G', 56: 'Y', 57: 'G', 58: 'Y', 59: 'W', 60: 'B', 61: 'R', 62: 'P', 63: 'Y', 64: 'G', 65: 'R', 66: 'W', 67: 'G', 68: 'P', 69: 'B', 70: 'W', 71: 'BL', 72: 'G', 73: 'BL', 74: 'G', 75: 'B', 76: 'Y', 77: 'G', 78: 'W', 79: 'B', 80: 'G', 81: 'Y', 82: 'G', 83: 'W', 84: 'Y', 85: 'G', 86: 'R', 87: 'B', 88: 'P', 89: 'B', 90: 'B', 91: 'Y', 92: 'W', 93: 'G', 94: 'B', 95: 'W', 96: 'Y', 97: 'P', 98: 'R', 99: 'P', 100: 'G', 101: 'R', 102: 'P', 103: 'Y', 104: 'G', 105: 'P', 106: 'Y', 107: 'W', 108: 'G', 109: 'BL', 110: 'Y', 1

In [8]:
#sp_of_each_lg is a list of list each list element includes 310 color responses  
print(len(sp_of_each_lg))
print(len(unique))

2616
2616


## Demo 5: Import speaker demographic information

> Most speakers' age _(integer)_ and gender _(M/F)_ information was recorded.

______________________________________________

Load speaker information.

`speakerInfo` is a hierarchical dictionary organized as follows:

**language &rarr; speaker &rarr; (age, gender)**

In [9]:
speakerInfo = readSpeakerData('spkr-lsas.txt')
#speakerInfo

In [77]:
gender = pd.DataFrame.from_dict(speakerInfo, orient = 'index').reset_index()
gender = gender.rename(columns = {'index':'Language'})
gender.head(5)
#gender.loc[0,13]

Unnamed: 0,Language,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1,"[(90, M)]","[(26, M)]","[(38, M)]","[(35, M)]","[(80, M)]","[(48, M)]","[(26, M)]","[(39, M)]","[(47, F)]",...,,,,,,,,,,
1,2,"[(20, F)]","[(40, F)]","[(45, F)]","[(45, F)]","[(50, F)]","[(50, F)]","[(50, F)]","[(55, F)]","[(55, F)]",...,,,,,,,,,,
2,3,"[(26, F)]","[(26, F)]","[(30, F)]","[(35, F)]","[(38, F)]","[(40, F)]","[(43, F)]","[(46, F)]","[(46, F)]",...,,,,,,,,,,
3,4,"[(15, F)]","[(17, F)]","[(18, F)]","[(20, F)]","[(20, F)]","[(22, F)]","[(23, F)]","[(24, F)]","[(24, F)]",...,"[(24, M)]","[(25, M)]","[(29, M)]","[(32, M)]","[(37, M)]","[(37, M)]","[(50, M)]","[(58, M)]","[(77, M)]","[(0, M)]"
4,5,"[(29, M)]","[(20, M)]","[(40, F)]","[(16, F)]","[(24, M)]","[(30, M)]",,,,...,,,,,,,,,,


In [40]:
#the total number of speakers for the gender data 
sum(gender.count(axis=1))

2728

In [72]:
gender[3]

0      [(38, M)]
1      [(45, F)]
2      [(30, F)]
3      [(18, F)]
4      [(40, F)]
         ...    
105    [(20, F)]
106    [(16, F)]
107    [(25, F)]
108    [(49, F)]
109    [(35, M)]
Name: 3, Length: 110, dtype: object

In [76]:
gd = [] #includes the corresponding language values 

for index, rows in gender.iterrows():
    #print(type(rows))
    row = pd.DataFrame(rows).dropna()
    for i, r in row.iterrows():
        #print(r.iloc[0]) 
        gd.append(r.iloc[0])

In [78]:
v = []
v= pd.DataFrame(gender.count(axis=1) == naming_df.count(axis =1), columns = ["matches?"])

index_list = v[(v["matches?"] == False)].index.tolist()

#these are the indices where the speakers count in gender dont match with naming's
index_list

[61, 87, 92, 94, 96]

In [91]:
#gender.shape[0]
for i in index_list: 
    print(gender.iloc[[i]])


    Language          1          2          3          4          5  \
61        62  [(48, F)]  [(25, M)]  [(18, F)]  [(60, F)]  [(25, M)]   

            6          7          8          9  ...   26   27   28   29   30  \
61  [(13, M)]  [(60, F)]  [(28, M)]  [(60, F)]  ...  NaN  NaN  NaN  NaN  NaN   

     31   32   33   34   35  
61  NaN  NaN  NaN  NaN  NaN  

[1 rows x 36 columns]
    Language          1          2          3          4          5  \
87        88  [(70, M)]  [(30, M)]  [(55, M)]  [(70, M)]  [(68, M)]   

            6          7          8          9  ...   26   27   28   29   30  \
87  [(35, M)]  [(75, M)]  [(50, M)]  [(40, M)]  ...  NaN  NaN  NaN  NaN  NaN   

     31   32   33   34   35  
87  NaN  NaN  NaN  NaN  NaN  

[1 rows x 36 columns]
    Language          1          2          3          4          5  \
92        93  [(15, F)]  [(20, F)]  [(28, F)]  [(17, M)]  [(18, M)]   

            6          7          8    9  ...   26   27   28   29   30   31  \
92  

In [20]:
len(gd)

2728

In [12]:
gd_age = [] # only includes the gender and age doesnt include corresponding language values

for i in range(len(gd)): 
    if isinstance(gd[i], list):
        #print(i)
        il = list(gd[i])
        gd_age.append(il)   

In [13]:
len(gd_age)

2618

In [63]:
profile = list(zip(gd_age, unique))
len(profile)

2616

In [92]:
profile[60:64]

[([('60', 'F')], 12),
 ([('61', 'F')], 13),
 ([('62', 'F')], 4),
 ([('19', 'M')], 14)]

In [16]:
#speaker profile with their age , gender and unique colour naming (done)
#create subplot for each language 

In [17]:
#three scatter subplots to visualize the gender distribution across languages, 
#gender vs number of unique colour names, 
#age vs number of unique colour names and identify the trends with linear regression lines across 110 languages. 
