In [66]:
import pandas as pd
import numpy as np

In [67]:
# Load the tab-separated scene feature table.
data = pd.read_csv('features.txt', sep='\t')

### Read in and munge the scene representations

In [68]:
# Discard the categorical shape/colour indicator columns (plus 'id' and
# 'position'); none of them are used in the rest of this notebook.
data = data.drop(
    ['f', 'i', 'l', 'n', 'p', 't', 'u', '_v', 'w', 'x', 'y', 'z',
     'blue', 'brown', 'gray', 'green', 'orange', 'pink', 'purple', 'red',
     'yellow', 'id', 'position'],
    axis=1,
)

In [69]:
# strip the file extension from imageid (keep only the text before the first '.')
data['imageid'] = data.imageid.map(lambda x: x.split('.')[0])

# make h_skew and v_skew into 1-hot columns
# NOTE(review): pandas >= 2.0 emits boolean indicator columns here rather than
# 0/1 integers -- confirm downstream code copes with either.
discrete_cols = ['h_skew', 'v_skew']
data = pd.get_dummies(data = data, columns = discrete_cols)

# add a feature that is distance from "center" (i.e., a stationary point)
# cdist computes every row's Euclidean distance in a single vectorized call,
# instead of invoking a Python function once per row through DataFrame.apply.
from scipy.spatial import distance
center = (0,0)
data['c_diff'] = distance.cdist(data[['pos_x', 'pos_y']].values, [center]).ravel()

In [70]:
# Display the column labels of the scene table after the feature munging above.
data.columns

Index(['pieceid', 'imageid', 'episodeid', 'r', 'g', 'b', 'h', 's', 'v',
       'orientation', 'num_edges', 'pos_x', 'pos_y', 'h_skew_left-skewed',
       'h_skew_right-skewed', 'h_skew_symmetric', 'v_skew_bottom-skewed',
       'v_skew_symmetric', 'v_skew_top-skewed', 'c_diff'],
      dtype='object')

In [71]:
# Preview the first five rows of the scene table.
data.head(5)

Unnamed: 0,pieceid,imageid,episodeid,r,g,b,h,s,v,orientation,num_edges,pos_x,pos_y,h_skew_left-skewed,h_skew_right-skewed,h_skew_symmetric,v_skew_bottom-skewed,v_skew_symmetric,v_skew_top-skewed,c_diff
0,0,1,Set0/1,86.480225,57.164215,46.304261,8.293657,127.795376,86.661635,5.742743,8,199,164,0,1,0,1,0,0,257.870122
1,0,2,Set0/1,79.55544,74.452909,59.535351,22.51474,74.233586,79.337073,41.51936,10,222,159,1,0,0,0,0,1,273.065926
2,0,3,Set0/1,130.428545,111.25028,86.211567,17.137593,94.26875,131.00056,-7.716261,12,203,161,0,0,1,1,0,0,259.094577
3,0,4,Set0/1,69.591751,55.848775,83.48426,135.273859,92.572226,83.479976,-21.40881,8,222,151,0,0,1,0,0,1,268.486499
4,0,5,Set0/1,36.108723,79.887808,112.033928,102.723919,177.755478,112.230646,42.677817,6,220,169,1,0,0,0,0,1,277.418456


### Read in and munge the referring expressions

In [72]:
# Load the tab-separated table of segmented, labeled referring expressions.
res = pd.read_csv('segmented-labeled.txt', sep='\t')

In [73]:
# Split each image URL on '/' once: segments 2-3 joined by '/' form the
# episode id, and segment 4 (up to its first '.') is the image id.
url_parts = res.imageurl.map(lambda url: url.split('/'))
res['episodeid'] = url_parts.map(lambda parts: '/'.join(parts[2:4]))
res['imageid'] = url_parts.map(lambda parts: parts[4].split('.')[0])

In [74]:
# Break every utterance into its ';'-separated referring expressions and give
# each expression a row of its own (the row keeps the utterance's index label).
res['originaltext'] = res.originaltext.map(lambda text: text.lower().split(';'))
refexps = (
    res.originaltext
    .apply(lambda pieces: pd.Series(pieces))
    .stack()
    .reset_index(level=1, drop=True)
)
refexps.name = 'refexp'
res = res.drop('originaltext', axis=1).join(refexps)
# Keep only non-empty, annotated expressions; an annotation is recognised by
# the colon that separates the target from the referring expression.
res = res[(res.refexp != '') & res.refexp.str.contains(':')]

In [75]:
# The URL columns are no longer needed from here on.
res = res.drop(
    ['masteraudiourl', 'audiourl', 'nextaudiourl', 'imageurl'],
    axis=1,
)

In [76]:
# Display the column labels that remain in the referring-expression table.
res.columns

Index(['id', 'episodeid', 'imageid', 'refexp'], dtype='object')

In [77]:
# Each annotated expression has the form "<target>: <referring expression>".
# Split on the FIRST colon only, so an expression that itself contains a colon
# is kept whole instead of being cut off at its second colon.
res['target'] = res.refexp.map(lambda x: x.split(':', 1)[0].strip())
res['refexp'] = res.refexp.map(lambda x: x.split(':', 1)[1].strip())

In [78]:
# Throw away rows whose target label contains 'ot' (OTs don't refer to
# anything) or 'unk' (unknown targets are useless too).
res = res[~(res.target.str.contains('ot') | res.target.str.contains('unk'))]

In [79]:
# Convert targets to numbers; anything that is not a number becomes NaN.
res = res.assign(target=pd.to_numeric(res.target, errors='coerce'))
# Remove only the rows whose *target* failed to parse. Restricting dropna to
# that column keeps rows that merely have a missing value somewhere else.
res = res.dropna(subset=['target'])
res = res.assign(
    target=res.target.astype(np.int32),                  # whole-column cast instead of a per-element lambda
    refexp=res.refexp.map(lambda x: x.lower().split()),  # tokenize on whitespace
)

# spot-check a single annotation
res[res.id == 4]

Unnamed: 0,id,episodeid,imageid,refexp,target
3,4,Set0/1,8,"[like, off, to, the, left, like, a, reverse, l]",0


In [80]:
# melt the words array in refexp into individual rows
#
# The earlier per-expression split leaves every expression that came from the
# same utterance with the same index label. Joining the stacked words on that
# label would be many-to-many: each row would receive the words of *all* rows
# sharing its label. The join is therefore done on a unique positional index,
# and the original labels are restored afterwards.
row_labels = res.index
res = res.reset_index(drop=True)
s = res.refexp.apply(lambda x: pd.Series(x)).stack().reset_index(level=1,drop=True)
s.name = 'word'
res = res.drop('refexp', axis=1).join(s)
# after the join the index holds row positions (repeated once per word);
# translate them back into the labels the rows carried before
res.index = row_labels.take(res.index.values)

In [81]:
# Spot-check the melted rows that belong to annotation id 4.
res.loc[res.id == 4]

Unnamed: 0,id,episodeid,imageid,target,word
3,4,Set0/1,8,0,like
3,4,Set0/1,8,0,off
3,4,Set0/1,8,0,to
3,4,Set0/1,8,0,the
3,4,Set0/1,8,0,left
3,4,Set0/1,8,0,like
3,4,Set0/1,8,0,a
3,4,Set0/1,8,0,reverse
3,4,Set0/1,8,0,l


In [96]:
# Persist both processed tables as pickles: scene features first, then the
# per-word referring-expression rows.
for _frame, _path in ((data, 'scenedata.pkl'), (res, 'refexpdata.pkl')):
    _frame.to_pickle(_path)