# Demos for analyzing World Color Survey (WCS)

COG 260: Data, Computation, and The Mind (Yang Xu)

Data source: http://www1.icsi.berkeley.edu/wcs/data.html

______________________________________________

Import helper function file for WCS data analysis.

In [2]:
from wcs_helper_functions import *

Import relevant Python libraries.

In [3]:
import numpy as np
import pandas as pd
from scipy import stats
from random import random
%matplotlib inline

## Demo 3: Import color naming data
    
> Each of the 330 color chips was named by speakers of 110 different languages.

______________________________________________

Load naming data. 

`namingData` is a hierarchical dictionary organized as follows:

**language _(1 - 110)_ &rarr; speaker _(1 - *range varies per language*)_ &rarr; chip index _(1 - 330)_ &rarr; color term**

In [4]:
# Load the WCS naming responses from 'term.txt'. This notebook describes the
# result as a nested dict: language -> speaker -> chip index -> colour term.
namingData = readNamingData('term.txt')

For example, to count how many distinct color terms each speaker used across the 330 color chips (one count per speaker, language by language):

In [50]:
# Number of distinct colour terms each speaker used across the chips they
# named: one entry per speaker, in language-then-speaker iteration order.
# A set drops repeated terms, so its size is the count of unique terms.
unique_colour_list = [
    len(set(chip_terms.values()))
    for speakers in namingData.values()
    for chip_terms in speakers.values()
]

In [33]:
unique_colour_list

[6,
 7,
 6,
 6,
 7,
 6,
 7,
 7,
 7,
 7,
 8,
 6,
 8,
 8,
 6,
 7,
 7,
 6,
 7,
 5,
 7,
 7,
 6,
 7,
 6,
 8,
 10,
 9,
 8,
 15,
 7,
 10,
 15,
 8,
 6,
 5,
 8,
 9,
 11,
 17,
 14,
 11,
 8,
 8,
 12,
 11,
 8,
 8,
 13,
 8,
 10,
 11,
 10,
 11,
 23,
 11,
 32,
 9,
 13,
 4,
 12,
 13,
 4,
 14,
 22,
 13,
 18,
 18,
 14,
 25,
 18,
 13,
 28,
 20,
 9,
 9,
 10,
 9,
 13,
 9,
 9,
 10,
 10,
 9,
 11,
 11,
 10,
 10,
 10,
 8,
 13,
 9,
 9,
 10,
 8,
 10,
 10,
 9,
 11,
 8,
 11,
 9,
 8,
 12,
 14,
 10,
 6,
 5,
 13,
 9,
 7,
 7,
 7,
 7,
 7,
 8,
 10,
 11,
 8,
 13,
 10,
 8,
 11,
 9,
 7,
 10,
 8,
 16,
 7,
 8,
 17,
 22,
 13,
 11,
 10,
 6,
 7,
 11,
 12,
 10,
 16,
 11,
 16,
 17,
 15,
 14,
 13,
 12,
 15,
 14,
 15,
 14,
 15,
 13,
 16,
 12,
 15,
 16,
 14,
 15,
 15,
 11,
 12,
 13,
 13,
 13,
 11,
 15,
 10,
 8,
 9,
 8,
 7,
 7,
 8,
 6,
 7,
 8,
 6,
 7,
 8,
 7,
 7,
 6,
 7,
 8,
 7,
 8,
 7,
 8,
 7,
 7,
 8,
 8,
 5,
 9,
 9,
 7,
 6,
 6,
 6,
 6,
 7,
 7,
 9,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 6,
 7,
 8,
 7,
 9,
 10,
 10,
 9,

In [51]:
# Show the language identifiers present in the naming data.
print(list(namingData))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110]


## Demo 5: Import speaker demographic information

> Most speakers' age _(integer)_ and gender _(M/F)_ information was recorded.

______________________________________________

Load speaker information.

`speakerInfo` is a hierarchical dictionary organized as follows:

**language &rarr; speaker &rarr; (age, gender)**

In [9]:
# Load speaker demographics from 'spkr-lsas.txt'. This notebook describes the
# result as a nested dict: language -> speaker -> (age, gender).
# NOTE(review): other cells read each speaker's value as value[0][0] (age) and
# value[0][1] (gender), i.e. the tuple appears to be wrapped in a list — confirm.
speakerInfo = readSpeakerData('spkr-lsas.txt')

In [169]:
# Collect speaker demographics in language-then-speaker iteration order.
keys = list(namingData.keys())

speaker = []     # number of speakers recorded for each language
gender_age = []  # one (gender, age) pair per speaker
gender = []      # gender of each speaker
age = []         # age of each speaker

for speakers in speakerInfo.values():
    speaker.append(len(speakers))
    for records in speakers.values():
        # The first record holds age at index 0 and gender at index 1.
        profile = records[0]
        speaker_age, speaker_gender = profile[0], profile[1]
        gender_age.append((speaker_gender, speaker_age))
        gender.append(speaker_gender)
        age.append(speaker_age)

# NOTE(review): speakers with placeholder gender codes (e.g. '*' or 'X') are
# not filtered out here — confirm whether they should be excluded.
# TODO: assign language as the key to the list of genders

In [162]:
# Build, for every language, a list that repeats the language label once per
# speaker, so the labels can later be flattened into a per-speaker column.
s = [str(lang_id) for lang_id in range(1, 111)]  # language labels '1'..'110'
speaker_col = []

# Pair each language's speaker count with its label; pairing stops at the
# shorter of the two sequences.
nested = [[label] * count for count, label in zip(speaker, s)]


[['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2',
  '2'],
 ['3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3',
  '3'],
 ['4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4',
  '4'],
 ['5', '5', '5', '5', '5', '5'],
 ['6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6',
  '6'],
 ['7',
  '

In [170]:
# Flatten the per-language label lists into a single list with one language
# label per speaker.
language = []
for labels in nested:
    language.extend(labels)

In [174]:
# Assemble the per-speaker table: language label, gender, age, the combined
# (gender, age) pair, and the number of distinct colour terms used.
# NOTE: zip stops at the shortest input, so if the per-speaker lists differ in
# length the extra entries are dropped without any error.
profile_columns = ['language', 'gender', 'age', 'gender_age', 'unique']
profile_rows = list(
    zip(language, gender, age, gender_age, unique_colour_list)
)
full_df = pd.DataFrame(profile_rows, columns=profile_columns)

In [175]:
full_df

Unnamed: 0,language,gender,age,gender_age,unique
0,1,M,90,"(M, 90)",6
1,1,M,26,"(M, 26)",7
2,1,M,38,"(M, 38)",6
3,1,M,35,"(M, 35)",6
4,1,M,80,"(M, 80)",7
...,...,...,...,...,...
2611,110,F,29,"(F, 29)",7
2612,110,F,23,"(F, 23)",8
2613,110,F,29,"(F, 29)",8
2614,110,F,22,"(F, 22)",9


In [25]:
#speaker profile with their age , gender and unique colour naming (done)
#create subplot for each language and identify the trends with linear regression lines across 110 languages. 

In [26]:
#three scatter subplots to 
#visualize the gender distribution across languages, 
#gender vs number of unique colour names, 
#age vs number of unique colour names 


In [None]:
# One scatter panel per language: speaker age (x) against the number of
# distinct colour terms that speaker used (y), with a least-squares line
# overlaid wherever a fit is possible.
# NOTE(review): `plt` must be matplotlib.pyplot; no import of it is visible in
# this part of the notebook — confirm it is provided (e.g. by the helper-module
# star import).
fig = plt.figure(figsize=(30, 20), dpi=80, facecolor='w', edgecolor='k')

# Languages in order of first appearance in the speaker table.
language_ids = full_df['language'].unique()

# Size the subplot grid from the data so that every language gets a panel.
n_cols = 11
n_rows = max(1, -(-len(language_ids) // n_cols))  # ceiling division

# Fitted slope per language (NaN where no line could be fitted).
slopes = np.array([])

for panel, lang in enumerate(language_ids, start=1):
    rows = full_df[full_df['language'] == lang]

    # Values that are not numeric (e.g. a missing-age placeholder) are coerced
    # to NaN and the corresponding speakers are left out of this panel.
    x = pd.to_numeric(rows['age'], errors='coerce')
    y = pd.to_numeric(rows['unique'], errors='coerce')
    usable = x.notna() & y.notna()
    x = x[usable].to_numpy(dtype=float)
    y = y[usable].to_numpy(dtype=float)

    # Create the panel for this language and label it with the language id.
    plt.subplot(n_rows, n_cols, panel)
    plt.title('L' + str(lang))

    # Scatter plot: number of unique colour terms (y-axis) against age (x-axis).
    # TODO: distinguish speakers by gender within each panel (planned, not yet
    # implemented).
    plt.plot(x, y, 'ro')

    # A line needs at least two distinct ages; otherwise record NaN.
    m = np.nan
    if x.size >= 2 and np.ptp(x) > 0:
        m, b = np.polyfit(x, y, 1)  # m = slope, b = intercept
        # Juxtapose the fitted line onto the scatter plot.
        plt.plot(x, m * x + b)

    # Record the slope for this language.
    slopes = np.append(slopes, m)
