In [303]:
import pandas as pd
import string
import pickle
from colour.notation import munsell as munsell
from colour import XYZ_to_sRGB, Lab_to_XYZ
import numpy as np

In [304]:
def clip(rgb):
    """Clamp every component of ``rgb`` to the closed interval [0, 1].

    Values below 0 become 0, values above 1 become 1; everything else is
    returned unchanged.
    """
    floored = np.maximum(rgb, 0)
    return np.minimum(floored, 1)

In [333]:
# Load data/raw/langs_info.txt (tab-separated, no header row) and build a
# mapping from its first column to its second column.
langs_raw = pd.read_csv('data/raw/langs_info.txt', header=None, sep='\t')
lang_info = dict(zip(langs_raw[0], langs_raw[1]))
# Persist the mapping as a pickle.
with open('data/lang_info.pkl', 'wb') as out_file:
    pickle.dump(lang_info, out_file)

In [305]:
# The first ten uppercase letters ('A'..'J'); a letter's position is its row index.
ROWS = list(string.ascii_uppercase[:10])

In [306]:
# Load the chip table (tab-separated, no header row) with explicit column names.
chips = pd.read_csv(
    'data/raw/chip.txt',
    sep='\t',
    header=None,
    names=['id', 'row', 'col', 'conc'],
)
# Replace each row label with its position inside ROWS.
chips['row'] = chips['row'].map(lambda label: ROWS.index(label))

In [307]:
# Map every chip id to its (row, col) grid coordinates.
chip_dict = {
    record['id']: (record['row'], record['col'])
    for _, record in chips.iterrows()
}

In [308]:
# Collect the ids of chips whose *column* coordinate is 0. chip_dict values are
# (row, col) tuples, so the second element tested here is the column, despite
# the variable being called row_0.
row_0 = [chip_id for chip_id, (_, col_idx) in chip_dict.items() if col_idx == 0]

In [309]:
# Read data/raw/term.txt (tab-separated, no header; columns: lg, speaker, chip, term).
terms = pd.read_csv('data/raw/term.txt', sep='\t',
                    header=None, names=['lg', 'speaker', 'chip', 'term'])
# Drop responses for the chips listed in row_0 (the chip ids whose column
# coordinate in chip_dict is 0). The explicit .copy() makes `terms` an
# independent frame, so columns can be added to it afterwards without pandas
# raising SettingWithCopyWarning.
terms = terms[~terms['chip'].isin(row_0)].copy()

In [310]:
# Attach grid coordinates to every response by looking its chip id up in
# chip_dict (values are (row, col) tuples); an unknown chip id raises KeyError.
terms['row'] = terms['chip'].map(lambda chip_id: chip_dict[chip_id][0])
terms['col'] = terms['chip'].map(lambda chip_id: chip_dict[chip_id][1])

In [311]:
# Persist the chip id -> (row, col) mapping.
with open('data/chip_dict.pkl', 'wb') as out_file:
    pickle.dump(chip_dict, out_file)

In [312]:
# Responses recorded for language 32.
lg_1 = terms.loc[terms['lg'] == 32]

# Number of responses per (row, col) chip; the unnamed size column is labelled 0.
responses = lg_1.groupby(['row', 'col']).size().reset_index()
# Smallest per-chip response count (only the minimum is computed here).
min_responses = responses[0].min()
min_responses

25

In [313]:
# For language 2: count how often each term was used on each chip, order the
# counts from highest to lowest within a chip, and keep the top row per chip.
lg_1 = (
    terms[terms['lg'] == 2]
    .groupby(['row', 'col', 'term'])
    .size()
    .reset_index(name='count')
    .sort_values(['row', 'col', 'count'], ascending=[True, True, False])
    .groupby(['row', 'col'])
    .first()
    .reset_index()
)

In [314]:
# Smallest value of the 'count' column in lg_1, i.e. the weakest per-chip
# winning-term count for the language currently held in lg_1.
lg_1['count'].min()

3

In [315]:
def get_color_matrix(lg_id, terms=terms, min_count=2):
    """Build the row x col matrix of most-used colour terms for one language.

    Parameters
    ----------
    lg_id
        Value matched against the ``lg`` column to select the language.
    terms : pandas.DataFrame
        Response table with ``lg``, ``term``, ``row`` and ``col`` columns.
        The default is the notebook-level ``terms`` object as it was bound
        when this cell was executed.
    min_count : int, default 2
        Minimum number of times the winning term must have been used on
        every chip. The default reproduces the previously hard-coded value.

    Returns
    -------
    numpy.ndarray or None
        Integer matrix indexed by row (axis 0) and col (axis 1). Each entry is
        the code of the chip's most frequent term; codes are assigned in order
        of first appearance and -1 marks a chip with no usable response.
        ``None`` is returned when any chip's winning count is below
        ``min_count`` (or when the language has no usable responses at all).
    """
    responses = terms[terms['lg'] == lg_id]
    # '*' responses are excluded before counting.
    responses = responses[responses['term'] != '*']

    # Usage count of every term on every chip, highest count first within a chip.
    counts = (
        responses.groupby(['row', 'col', 'term'])
        .size()
        .reset_index(name='count')
        .sort_values(['row', 'col', 'count'], ascending=[True, True, False])
    )
    # The first row of each chip after sorting is its most frequently used term.
    modal = counts.groupby(['row', 'col']).first().reset_index()

    # Written as "not >=" so that a NaN minimum (empty frame) is also rejected.
    if not modal['count'].min() >= min_count:
        return None

    # Enumerate the winning terms in order of first appearance.
    term_codes = {term: code for code, term in enumerate(modal['term'].unique())}
    modal['term'] = modal['term'].map(term_codes)
    return (
        modal.pivot(index='row', columns='col', values='term')
        .fillna(-1)
        .astype(int)
        .values
    )

In [316]:
# Build the colour matrix for every language id present in `terms`, keyed by
# that id. get_color_matrix yields None for languages it rejects.
lg_color = {
    lg_id: get_color_matrix(lg_id, terms)
    for lg_id in terms['lg'].unique()
}

In [318]:
# Keep only the languages for which a matrix was actually produced.
lg_color = {
    lang: matrix
    for lang, matrix in lg_color.items()
    if matrix is not None
}
len(lg_color)

103

In [319]:
# Inspect the distinct raw term values recorded for language 19 (nothing has
# been filtered out of the 'term' column at this point).
terms[terms['lg'] == 19]['term'].unique()

array(['FC', 'B', 'CA', 'N', 'CF', 'TS', 'PI', 'F', 'FJ', 'A', 'M', 'R',
       '*', 'CE', 'SP', 'P', 'LI', 'LE', 'PL', 'T', 'J', 'BT', 'PT', 'CH',
       'BA', 'JU', 'I', 'CN', nan, 'C1', 'PE'], dtype=object)

In [320]:
# Persist the language id -> colour matrix mapping.
with open('data/lg_color.pkl', 'wb') as out_file:
    pickle.dump(lg_color, out_file)

In [321]:
# Load data/raw/cielab.txt (tab-separated); column names come from its first row.
cielab = pd.read_csv(
    'data/raw/cielab.txt',
    header=0,
    sep='\t',
)

In [322]:
# Look each '#cnum' value up in chip_dict to attach its (row, col) grid
# coordinates; an unknown chip number raises KeyError.
cielab['row'] = cielab['#cnum'].map(lambda cnum: chip_dict[cnum][0])
cielab['col'] = cielab['#cnum'].map(lambda cnum: chip_dict[cnum][1])

In [323]:
# Gather the L*, a* and b* columns into a single list-valued 'Lab' column,
# one [L*, a*, b*] list per row.
cielab['Lab'] = cielab.loc[:, ['L*', 'a*', 'b*']].values.tolist()

In [324]:
# Display the cielab frame, now carrying the row, col and Lab columns.
cielab

Unnamed: 0,#cnum,V,H,C,MunH,MunV,L*,a*,b*,row,col,Lab
0,141,A,0,0,10.00RP,9.5,96.00,-0.06,0.06,0,0,"[96.0, -0.06, 0.06]"
1,274,B,0,0,10.00RP,9.0,91.08,-0.05,0.06,1,0,"[91.08, -0.05, 0.06]"
2,129,B,1,2,2.50R,9.0,91.08,5.53,2.22,1,1,"[91.08, 5.53, 2.22]"
3,230,B,2,2,5.00R,9.0,91.08,5.51,3.28,1,2,"[91.08, 5.51, 3.28]"
4,302,B,3,2,7.50R,9.0,91.08,5.54,4.46,1,3,"[91.08, 5.54, 4.46]"
...,...,...,...,...,...,...,...,...,...,...,...,...
325,305,I,37,8,2.50RP,2.0,20.54,34.44,-14.69,8,37,"[20.54, 34.44, -14.69]"
326,267,I,38,8,5.00RP,2.0,20.54,35.44,-10.40,8,38,"[20.54, 35.44, -10.4]"
327,243,I,39,8,7.50RP,2.0,20.54,35.97,-6.33,8,39,"[20.54, 35.97, -6.33]"
328,182,I,40,8,10.00RP,2.0,20.54,36.42,-2.08,8,40,"[20.54, 36.42, -2.08]"


In [325]:
# Convert every Lab triple to sRGB in one pass: Lab -> XYZ -> sRGB, then clamp
# each component to [0, 1] with clip().
# NOTE(review): Lab_to_XYZ and XYZ_to_sRGB are called with their default
# arguments - confirm the default illuminant matches the one the values in
# cielab.txt were computed for.
cielab['RGB'] = cielab['Lab'].apply(
    lambda lab: clip(XYZ_to_sRGB(Lab_to_XYZ(lab)))
)

In [326]:
# Display the cielab frame again, now including the RGB column.
cielab

Unnamed: 0,#cnum,V,H,C,MunH,MunV,L*,a*,b*,row,col,Lab,RGB
0,141,A,0,0,10.00RP,9.5,96.00,-0.06,0.06,0,0,"[96.0, -0.06, 0.06]","[0.954387358159, 0.954892003224, 0.954295174125]"
1,274,B,0,0,10.00RP,9.0,91.08,-0.05,0.06,1,0,"[91.08, -0.05, 0.06]","[0.899336833427, 0.899727729263, 0.899163183654]"
2,129,B,1,2,2.50R,9.0,91.08,5.53,2.22,1,1,"[91.08, 5.53, 2.22]","[0.94991554582, 0.88551665423, 0.88388302406]"
3,230,B,2,2,5.00R,9.0,91.08,5.51,3.28,1,2,"[91.08, 5.51, 3.28]","[0.953116182727, 0.885259598086, 0.876000883738]"
4,302,B,3,2,7.50R,9.0,91.08,5.54,4.46,1,3,"[91.08, 5.54, 4.46]","[0.957009838466, 0.88484730405, 0.867232548455]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,305,I,37,8,2.50RP,2.0,20.54,34.44,-14.69,8,37,"[20.54, 34.44, -14.69]","[0.339537502389, 0.09445508905, 0.280804819344]"
326,267,I,38,8,5.00RP,2.0,20.54,35.44,-10.40,8,38,"[20.54, 35.44, -10.4]","[0.352685884568, 0.0857374677727, 0.256621324571]"
327,243,I,39,8,7.50RP,2.0,20.54,35.97,-6.33,8,39,"[20.54, 35.97, -6.33]","[0.361995221826, 0.0797425462191, 0.233832318831]"
328,182,I,40,8,10.00RP,2.0,20.54,36.42,-2.08,8,40,"[20.54, 36.42, -2.08]","[0.370197505289, 0.0741016799915, 0.210207170885]"


In [327]:
# (row - 1, col - 1) -> (L*, a*, b*) for every row of cielab.
# NOTE(review): these keys are shifted by one in both coordinates while the
# rgb_dict keys below are not - confirm the two lookups are meant to use
# different coordinate origins.
cielab_dict = {
    (record['row'] - 1, record['col'] - 1): (record['L*'], record['a*'], record['b*'])
    for _, record in cielab.iterrows()
}

# (row, col) -> RGB value for every row of cielab.
rgb_dict = {
    (record['row'], record['col']): record['RGB']
    for _, record in cielab.iterrows()
}

In [328]:
# Persist both lookup tables, each to its own pickle file.
with open('data/cielab_dict.pkl', 'wb') as lab_file:
    pickle.dump(cielab_dict, lab_file)

with open('data/rgb_dict.pkl', 'wb') as rgb_file:
    pickle.dump(rgb_dict, rgb_file)