In [106]:
import pandas as pd
import string
import pickle

In [107]:
# The first ten uppercase ASCII letters, in order; a letter's position here is its row index.
ROWS = list(string.ascii_uppercase[:10])

In [108]:
# Load the tab-separated chip table; the file has no header row, so the
# columns are named here: id, row, col, conc.
chips = pd.read_csv(
    'data/raw/chip.txt',
    sep='\t',
    header=None,
    names=['id', 'row', 'col', 'conc'],
)
# Swap every row label for its position in ROWS.
# list.index raises ValueError for a label that is not in ROWS.
chips['row'] = chips['row'].map(ROWS.index)

In [109]:
# Lookup table: chip id -> (row index, column) taken from the chips frame.
# If an id occurred twice, the later line would win.
chip_dict = {
    chip_id: (row_idx, col_idx)
    for chip_id, row_idx, col_idx in zip(chips['id'], chips['row'], chips['col'])
}

In [110]:
# Collect the ids of chips whose *column* value is 0.
# chip_dict values are (row, col) tuples, so element 1 is the column.
# NOTE(review): the variable is called row_0, but the filter is on column 0;
# the later `col - 1` re-indexing suggests column 0 is the intended target -- confirm.
row_0 = [chip_id for chip_id, (_, col_idx) in chip_dict.items() if col_idx == 0]

In [111]:
# Load the tab-separated naming data; the file has no header row, so the
# columns are named here: lg, speaker, chip, term.
# NOTE(review): read_csv's default missing-value parsing turns literal strings
# such as "NA" into NaN -- confirm that no term label collides with those markers.
terms = pd.read_csv('data/raw/term.txt', sep='\t',
                    header=None, names=['lg', 'speaker', 'chip', 'term'])
# Drop every response for a chip listed in `row_0` (the chips whose column
# value is 0, per the filter that builds that list).
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding columns to `terms` later cannot trigger
# SettingWithCopyWarning.
terms = terms[~terms['chip'].isin(row_0)].copy()

In [112]:
# Attach grid coordinates to each response by looking its chip id up in chip_dict.
# A chip id missing from chip_dict raises KeyError.
terms['row'] = terms['chip'].map(lambda chip_id: chip_dict[chip_id][0])
terms['col'] = terms['chip'].map(lambda chip_id: chip_dict[chip_id][1])

In [113]:
# Persist the chip id -> (row, col) lookup to disk as a pickle.
with open('data/chip_dict.pkl', 'wb') as out_file:
    pickle.dump(chip_dict, out_file)

In [127]:
def get_color_matrix(lg_id, terms=terms):
    """Build the grid of most frequent terms for one language.

    Parameters
    ----------
    lg_id
        Value matched against the ``lg`` column to select one language.
    terms
        DataFrame with at least the columns ``lg``, ``row``, ``col`` and
        ``term``. The default is the module-level ``terms`` frame as it
        existed when this function was defined.

    Returns
    -------
    numpy.ndarray
        2-D integer array laid out by pivoting on ``row`` (index) and ``col``
        (columns). Each cell holds the integer code of the term with the
        highest response count for that (row, col); cells with no data hold
        -1. Codes are assigned per call, in order of first appearance in the
        per-cell winner table, so they are not comparable across languages.
        Only rows and columns present in the selected data appear in the
        array.
    """
    responses = terms.loc[terms['lg'] == lg_id]

    # Count how often each term was used for each (row, col) cell, then order
    # the counts so the largest one comes first within every cell.
    counts = (
        responses
        .groupby(['row', 'col', 'term'])
        .size()
        .reset_index(name='count')
        .sort_values(['row', 'col', 'count'], ascending=[True, True, False])
    )

    # Keep the leading (highest-count) entry of every cell.
    winners = counts.groupby(['row', 'col']).first().reset_index()

    # Number the distinct winning terms in order of first appearance.
    term_codes = {label: code for code, label in enumerate(winners['term'].unique())}
    winners['term'] = winners['term'].apply(lambda label: term_codes[label])

    grid = winners.pivot(index='row', columns='col', values='term')
    return grid.fillna(-1).astype(int).values

In [128]:
# Trial run: build the matrix for language id 2.
lg_ = get_color_matrix(lg_id=2, terms=terms)

In [75]:
# Build one term matrix per language id found in terms['lg'], keyed by that id.
lg_color = {
    lang: get_color_matrix(lang, terms)
    for lang in terms['lg'].unique()
}

In [78]:
# Inspect the term matrix stored for language id 4.
lg_color[4]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 4, 4, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        4, 4, 4, 3, 3, 3, 4, 3, 0, 5, 5, 1, 2, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 3, 3, 3, 3, 6, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 6, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 7, 7, 7, 7, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 6, 3, 3, 3, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2],
       [2, 2, 2, 8, 8, 7, 8, 8, 8, 8, 8, 

In [79]:
# Persist the language id -> term matrix mapping to disk as a pickle.
with open('data/lg_color.pkl', 'wb') as out_file:
    pickle.dump(lg_color, out_file)

In [86]:
# Load the tab-separated CIELAB table; its first line supplies the column names.
cielab = pd.read_csv(
    'data/raw/cielab.txt',
    sep='\t',
    header=0,
)

In [87]:
# Display the CIELAB table as loaded.
cielab

Unnamed: 0,#cnum,V,H,C,MunH,MunV,L*,a*,b*
0,141,A,0,0,10.00RP,9.5,96.00,-0.06,0.06
1,274,B,0,0,10.00RP,9.0,91.08,-0.05,0.06
2,129,B,1,2,2.50R,9.0,91.08,5.53,2.22
3,230,B,2,2,5.00R,9.0,91.08,5.51,3.28
4,302,B,3,2,7.50R,9.0,91.08,5.54,4.46
...,...,...,...,...,...,...,...,...,...
325,305,I,37,8,2.50RP,2.0,20.54,34.44,-14.69
326,267,I,38,8,5.00RP,2.0,20.54,35.44,-10.40
327,243,I,39,8,7.50RP,2.0,20.54,35.97,-6.33
328,182,I,40,8,10.00RP,2.0,20.54,36.42,-2.08


In [89]:
# Attach grid coordinates to each CIELAB record by looking its '#cnum' value up in chip_dict.
# A '#cnum' value missing from chip_dict raises KeyError.
cielab['row'] = cielab['#cnum'].map(lambda chip_id: chip_dict[chip_id][0])
cielab['col'] = cielab['#cnum'].map(lambda chip_id: chip_dict[chip_id][1])

In [92]:
# Display the CIELAB table again, now including the row and col columns.
cielab

Unnamed: 0,#cnum,V,H,C,MunH,MunV,L*,a*,b*,row,col
0,141,A,0,0,10.00RP,9.5,96.00,-0.06,0.06,0,0
1,274,B,0,0,10.00RP,9.0,91.08,-0.05,0.06,1,0
2,129,B,1,2,2.50R,9.0,91.08,5.53,2.22,1,1
3,230,B,2,2,5.00R,9.0,91.08,5.51,3.28,1,2
4,302,B,3,2,7.50R,9.0,91.08,5.54,4.46,1,3
...,...,...,...,...,...,...,...,...,...,...,...
325,305,I,37,8,2.50RP,2.0,20.54,34.44,-14.69,8,37
326,267,I,38,8,5.00RP,2.0,20.54,35.44,-10.40,8,38
327,243,I,39,8,7.50RP,2.0,20.54,35.97,-6.33,8,39
328,182,I,40,8,10.00RP,2.0,20.54,36.42,-2.08,8,40


In [104]:
# Lookup table: (row - 1, col - 1) -> (L*, a*, b*) for every record in cielab.
# NOTE(review): records whose row or col is 0 are kept and end up under keys
# containing -1 (e.g. (-1, -1)); confirm whether those entries are wanted.
cielab_dict = {
    (row_idx - 1, col_idx - 1): (l_star, a_star, b_star)
    for row_idx, col_idx, l_star, a_star, b_star in zip(
        cielab['row'], cielab['col'], cielab['L*'], cielab['a*'], cielab['b*']
    )
}

In [105]:
# Persist the coordinate -> (L*, a*, b*) lookup to disk as a pickle.
with open('data/cielab_dict.pkl', 'wb') as out_file:
    pickle.dump(cielab_dict, out_file)