In [70]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle

In [71]:
# Paths of the tab-separated train/dev splits and the output file name.
filename, devfilename = 'data/tangrams-train.tsv', 'data/tangrams-dev.tsv'
opfilename = 'tangrams_5u.txt'

# Column layout: a row id, then (state, action) pairs for steps 1-5,
# followed by the final state of the sequence.
headers = (
    ['rowid']
    + [f'{kind}{step}' for step in range(1, 6) for kind in ('state', 'action')]
    + ['state6']
)

In [72]:
# Set to True to truncate every example to its first three utterances
# (columns up to and including state4). Leave False for the 5-utterance setup.
USE_3_UTTERANCES = False

# Read both tab-separated splits, naming the columns from `headers`, and drop
# every row with a missing value (the index is renumbered from 0 afterwards).
raw_data_inp = pd.read_csv(filename, sep='\t', names=headers).dropna(ignore_index=True)
raw_data_dev = pd.read_csv(devfilename, sep='\t', names=headers).dropna(ignore_index=True)

if USE_3_UTTERANCES:
    # Keep only the columns of the 3-utterance variant and make `headers`
    # reflect the reduced layout for the cells that follow.
    headers_3utt = ['rowid', 'state1', 'action1','state2', 'action2', 'state3', 'action3', 'state4']
    dropheaders = [h for h in headers if h not in headers_3utt]
    raw_data_inp = raw_data_inp.drop(columns=dropheaders)
    raw_data_dev = raw_data_dev.drop(columns=dropheaders)
    headers = headers_3utt

In [73]:
# Preview the first rows of the cleaned training frame.
raw_data_inp.head()

Unnamed: 0,rowid,state1,action1,state2,action2,state3,action3,state4,action4,state5,action5,state6
0,train-437,1:2 2:1 3:4 4:0 5:3,delete the second object from the left,1:2 2:4 3:0 4:3,delete the leftmost object,1:4 2:0 3:3,swap the leftmost and the rightmost objects,1:3 2:0 3:4,swap them again,1:4 2:0 3:3,add back the object we removed on step 1,1:1 2:4 3:0 4:3
1,train-438,1:0 2:2 3:4 4:3 5:1,swap the second and third figures,1:0 2:4 3:2 4:3 5:1,remove the second figure,1:0 2:2 3:3 4:1,swap the second and third figures,1:0 2:3 3:2 4:1,remove the third figure,1:0 2:3 3:1,"add back the figure removed in step 2, and pla...",1:0 2:3 3:4 4:1
2,train-440,1:2 2:1 3:3 4:0 5:4,delete the rightmost figure,1:2 2:1 3:3 4:0,undo step 1,1:2 2:1 3:3 4:0 5:4,delete the 1st figure,1:1 2:3 3:0 4:4,swap the 1st and 3rd figure,1:0 2:3 3:1 4:4,undo step 4,1:1 2:3 3:0 4:4
3,train-441,1:4 2:0 3:1 4:2 5:3,remove center one,1:4 2:0 3:2 4:3,remove second one,1:4 2:2 3:3,remove the first one,1:2 2:3,swap places,1:3 2:2,"add the third one back, in first place",1:0 2:3 3:2
4,train-442,1:0 2:4 3:2 4:3 5:1,remove 4th figure,1:0 2:4 3:2 4:1,remove it again,1:0 2:4 3:2,add last removed figure to 2nd position,1:0 2:1 3:4 4:2,add first removed figure back,1:0 2:1 3:4 4:3 5:2,remove 5th figure,1:0 2:1 3:4 4:3


In [74]:
def binarize_state(single_instance):
    """Encode a state string as a flat list of '0'/'1' characters.

    The string is split on single spaces into tokens and each token on ':'
    into integer fields. Every field is written in binary, left-padded with
    zeros to a width of at least three, and the resulting digits are
    appended one character at a time.

    Example: ``'1:2'`` -> ``['0', '0', '1', '0', '1', '0']``.

    Raises ``ValueError`` if any field is not an integer literal.
    """
    return [
        bit
        for token in single_instance.split(' ')
        for field in token.split(':')
        for bit in bin(int(field))[2:].zfill(3)
    ]


def vectorize_column(series, vectorizer):
    """Transform one text column into a list of per-row count vectors.

    Missing entries are replaced by the empty string before the already
    fitted ``vectorizer`` is applied. The transformed matrix is made dense
    and returned as a Python list with one 1-D array per row.
    """
    filled = series.fillna('')
    dense_counts = vectorizer.transform(filled).toarray()
    return [row for row in dense_counts]


In [75]:
def binarized_train(df, headers):
    """Fit the action vectorizer on ``df`` and add the derived feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame. It is modified in place: a ``binarized_<state>``
        column is added for every state column except the last one, and a
        ``vectorized_<action>`` column for every action column.
    headers : list of str
        Column names; those containing ``'state'`` / ``'action'`` select the
        state and action columns respectively.

    Returns
    -------
    tuple
        ``(vectorizer, df)`` - the fitted ``CountVectorizer`` and the same
        frame object that was passed in.
    """
    # The last state column is left out of the encoded inputs.
    state_headers = [h for h in headers if 'state' in h][:-1]
    text_headers = [h for h in headers if 'action' in h]

    vectorizer_X = CountVectorizer(
        max_features=50,
        stop_words=['and', 'the'],
    )

    for col in state_headers:
        df[f'binarized_{col}'] = df[col].apply(binarize_state)

    # Fit one shared vocabulary on the concatenation of all action texts per row.
    all_text = df[text_headers].fillna('').agg(' '.join, axis=1)
    vectorizer_X.fit(all_text)

    # Encode every action column with that shared vocabulary.
    for col in text_headers:
        df[f'vectorized_{col}'] = vectorize_column(df[col], vectorizer_X)

    # Right-pad every encoded state with '1' entries up to the longest
    # encoding found in any binarized_ column of this frame.
    bin_state_headers = [h for h in df.columns.tolist() if 'binarized_' in h]
    max_length = max(
        (len(arr) for col in bin_state_headers for arr in df[col]),
        default=0,
    )
    for col in bin_state_headers:
        df[col] = df[col].apply(lambda arr: arr + ['1'] * (max_length - len(arr)))

    return (vectorizer_X, df)

In [76]:
# binarized_train returns the pair (fitted vectorizer, augmented frame).
binarized_data_train = binarized_train(raw_data_inp, headers)
vect, binarized_data_inp = binarized_data_train


In [77]:
def binarized_test(df, headers, vectorizer):
    """Add the derived feature columns to ``df`` using a fitted vectorizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place: a ``binarized_<state>``
        column is added for every state column except the last one, and a
        ``vectorized_<action>`` column for every action column.
    headers : list of str
        Column names; those containing ``'state'`` / ``'action'`` select the
        state and action columns respectively.
    vectorizer
        Already fitted vectorizer; only its ``transform`` is used here.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # The last state column is left out of the encoded inputs.
    state_headers = [h for h in headers if 'state' in h][:-1]
    text_headers = [h for h in headers if 'action' in h]

    for col in state_headers:
        df[f'binarized_{col}'] = df[col].apply(binarize_state)

    # Encode every action column with the supplied vocabulary.
    for col in text_headers:
        df[f'vectorized_{col}'] = vectorize_column(df[col], vectorizer)

    # Right-pad every encoded state with '1' entries up to the longest
    # encoding found in this frame. An empty frame yields a width of 0
    # instead of raising on max() of an empty sequence.
    # NOTE(review): the width is derived from this frame alone, so it only
    # matches the training width if both frames share the same longest state.
    bin_state_headers = [h for h in df.columns.tolist() if 'binarized_' in h]
    max_length = max(
        (len(arr) for col in bin_state_headers for arr in df[col]),
        default=0,
    )
    for col in bin_state_headers:
        df[col] = df[col].apply(lambda arr: arr + ['1'] * (max_length - len(arr)))

    return df

In [78]:
# Encode the dev split with the vectorizer fitted on the training split.
binarized_data_dev = binarized_test(raw_data_dev, headers, vect)

In [79]:
# Show the fitted vocabulary together with its size.
vect.get_feature_names_out(), len(vect.get_feature_names_out())

(array(['1st', '2nd', '3rd', '4th', '5th', 'add', 'again', 'back',
        'delete', 'deleted', 'do', 'fifth', 'figure', 'figures', 'first',
        'for', 'fourth', 'from', 'in', 'into', 'it', 'item', 'items',
        'last', 'left', 'middle', 'object', 'objects', 'of', 'one',
        'place', 'position', 'put', 'remove', 'removed', 'repeat',
        'replace', 'rightmost', 'same', 'second', 'spot', 'step', 'swap',
        'switch', 'them', 'third', 'to', 'two', 'undo', 'with'],
       dtype=object),
 50)

In [80]:
# Inspect the dev row with index label 1, including the derived columns.
binarized_data_dev.loc[1]

rowid                                                           dev-238
state1                                              1:1 2:2 3:3 4:0 5:4
action1                                     swap the 1st and 4th figure
state2                                              1:0 2:2 3:3 4:1 5:4
action2                                           delete the 2nd figure
state3                                                  1:0 2:3 3:1 4:4
action3                                     swap the 1st and 2nd figure
state4                                                  1:3 2:0 3:1 4:4
action4               add the thing we removed in between the 1st an...
state5                                              1:3 2:2 3:0 4:1 5:4
action5                                     swap the 1st and 4th figure
state6                                              1:1 2:2 3:0 4:3 5:4
binarized_state1      [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, ...
binarized_state2      [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,

In [81]:
# Length of one padded state encoding (row label 1, first state column).
len(binarized_data_dev.loc[1, 'binarized_state1'])

30

In [82]:
def df_to_featureset(df, vectorized_names, y_num):
    """Assemble the feature matrix, labels and feature names from an encoded frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``binarized_*`` columns (one equal-length sequence per
        row), ``vectorized_*`` columns (one equal-length count vector per
        row) and the raw ``state*`` columns.
    vectorized_names : sequence of str
        Vocabulary terms used to name the entries of every ``vectorized_*``
        column.
    y_num : int
        Position whose value in the last raw state column becomes the label.

    Returns
    -------
    tuple
        ``(X, Y, feature_names)``: ``X`` stacks all binarized state columns
        followed by all vectorized action columns, ``Y`` is a list with
        exactly one integer label per row, and ``feature_names`` names the
        columns of ``X`` in order.

    Notes
    -----
    The label is 6 when the position is absent from the target state, when
    its value is ``'_'``, or when the target cell cannot be parsed. Emitting
    one label per row keeps ``Y`` aligned with the rows of ``X``.
    """
    missing_label = 6

    columns = df.columns.tolist()
    bin_state_headers = [h for h in columns if 'binarized_' in h]
    vect_action_headers = [h for h in columns if 'vectorized_' in h]

    featured_state = np.hstack([np.stack(df[col]) for col in bin_state_headers])
    print(featured_state.shape)
    featured_actions = np.hstack([np.stack(df[col]) for col in vect_action_headers])
    print(featured_actions.shape)
    X = np.hstack([featured_state, featured_actions])

    # Read the width positionally from the first row so the lookup does not
    # depend on a particular index label being present.
    state_length = len(df[bin_state_headers[0]].iloc[0])
    feature_names = []
    for col in bin_state_headers:
        feature_names.extend([f"{col}_{i+1}" for i in range(state_length)])
    print('now state features:',len(feature_names))
    for col in vect_action_headers:
        feature_names.extend([f"{col}_{vcn}" for vcn in vectorized_names])

    print('added action features',len(feature_names))

    # The target is the last raw (non-binarized) state column.
    state_headers = [h for h in columns if 'state' in h and 'binarized_' not in h]
    Y_header = state_headers[-1]
    print(Y_header)

    prefix = str(y_num) + ':'
    Y = []
    for raw_state in df[Y_header]:
        label = missing_label
        try:
            for token in raw_state.split(' '):
                # Prefix match so that e.g. '1:' cannot match inside '11:3'.
                if token.startswith(prefix):
                    value = token[len(prefix):]
                    if value != '_':
                        label = int(value)
                    break
        except (AttributeError, TypeError, ValueError) as e:
            # Non-string cell or non-integer value: report it and fall back
            # to the sentinel label.
            print(e)
            label = missing_label
        Y.append(label)

    print ('X:',X.shape, 'features:',len(feature_names),'Y:', len(Y))
    return (X,Y, feature_names)

    

In [83]:
# Build the train and dev feature sets; the label is the value at position 1
# of the final state. `features` ends up holding the names from the dev call.
vocabulary = list(vect.get_feature_names_out())
X_train, Y_train, features = df_to_featureset(binarized_data_inp, vocabulary, 1)
X_dev, Y_dev, features = df_to_featureset(binarized_data_dev, vocabulary, 1)

(4159, 150)
(4159, 250)
now state features: 150
added action features 400
state6
X: (4159, 400) features: 400 Y: 4159
(198, 150)
(198, 250)
now state features: 150
added action features 400
state6
X: (198, 400) features: 400 Y: 198


In [84]:
# Convert the label lists to uint32 NumPy arrays.
Y_dev = np.array(Y_dev).astype(np.uint32)
Y_train = np.array(Y_train).astype(np.uint32)

In [85]:
# Cast both feature matrices to uint32.
X_dev =X_dev.astype(np.uint32)
X_train = X_train.astype(np.uint32)

In [86]:
# Bundle the arrays and feature names for persistence. The dev split is
# stored under the X_test / Y_test keys.
pickled = dict(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_dev,
    Y_test=Y_dev,
    features=features,
)
with open('action_coref_binarized_5utt.pkl', mode='wb') as f:
    pickle.dump(pickled, f)

In [56]:
# Display the dev feature matrix.
X_dev

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=uint32)

In [57]:
# Display the dev label array.
Y_dev

array([1, 3, 0, 0, 4, 0, 1, 4, 0, 3, 3, 4, 0, 3, 1, 2, 4, 2, 1, 0, 2, 4,
       3, 4, 1, 0, 1, 0, 3, 0, 3, 1, 3, 2, 1, 3, 2, 1, 0, 0, 2, 3, 2, 3,
       2, 0, 1, 4, 4, 3, 2, 4, 3, 1, 3, 0, 3, 3, 3, 1, 2, 0, 0, 2, 0, 3,
       4, 1, 1, 2, 1, 2, 4, 4, 2, 0, 4, 0, 0, 0, 2, 2, 2, 0, 4, 4, 3, 4,
       3, 1, 0, 0, 1, 4, 2, 1, 4, 1, 4, 1, 0, 4, 0, 3, 2, 3, 2, 3, 1, 0,
       1, 2, 1, 0, 3, 2, 2, 1, 2, 2, 1, 0, 2, 3, 1, 3, 2, 3, 0, 2, 3, 4,
       3, 2, 1, 1, 2, 4, 2, 4, 4, 4, 1, 0, 0, 4, 3, 0, 4, 4, 3, 4, 4, 4,
       1, 4, 4, 3, 0, 0, 0, 2, 0, 4, 3, 4, 4, 0, 1, 0, 3, 4, 4, 0, 0, 2,
       0, 4, 0, 2, 2, 3, 4, 1, 2, 2, 3, 3, 1, 0, 3, 0, 2, 3, 4, 1, 0, 1],
      dtype=uint32)

In [87]:
# Read back the pickle file with the same name that the dump cell writes.
# Note: pickle.load executes arbitrary code from the file, so it should only
# be used on files produced locally like this one.
with open('action_coref_binarized_5utt.pkl', 'rb') as f:
    p = pickle.load(f)

In [69]:
# Display the loaded dictionary.
p

{'X_train': array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]], dtype=uint32),
 'Y_train': array([3, 0, 1, ..., 4, 0, 4], dtype=uint32),
 'X_test': array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]], dtype=uint32),
 'Y_test': array([1, 3, 0, 0, 4, 0, 1, 4, 0, 3, 3, 4, 0, 3, 1, 2, 4, 2, 1, 0, 2, 4,
        3, 4, 1, 0, 1, 0, 3, 0, 3, 1, 3, 2, 1, 3, 2, 1, 0, 0, 2, 3, 2, 3,
        2, 0, 1, 4, 4, 3, 2, 4, 3, 1, 3, 0, 3, 3, 3, 1, 2, 0, 0, 2, 0, 3,
        4, 1, 1, 2, 1, 2, 4, 4, 2, 0, 4, 0, 0, 0, 2, 2, 2, 0, 4, 4, 3, 4,
        3, 1, 0, 0, 1, 4, 2, 1, 4, 1, 4, 1, 0, 4, 0, 3, 2, 3, 2, 3, 1, 0,
        1, 2, 1, 0, 3, 2, 2, 1, 2, 2, 1, 0, 2, 3, 1, 3, 2, 3, 0, 2, 3, 4,
        3, 2,