In [40]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd
import json
import os

In [42]:
# Load the notebook settings from the JSON config file next to the notebook.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("./config.json", "r", encoding="utf-8") as fp:
    config = json.load(fp)

In [43]:
# Root directory for the datasets, taken from the loaded config.
DATA_DIR_PATH = config["data_dir_path"]
# The SNLI files are read from the "snli_1.0" sub-directory of the data root.
# NOTE: this previously referenced `DATA_DIR`, a name that is not defined in
# this notebook, which raises NameError on a fresh kernel.
SNLI_DIR_PATH = os.path.join(DATA_DIR_PATH, "snli_1.0")

# Read SNLI datasets

In [44]:
def _load_snli_split(file_name):
    """Read one tab-separated SNLI file from SNLI_DIR_PATH into a DataFrame."""
    split_path = os.path.join(SNLI_DIR_PATH, file_name)
    return pd.read_csv(split_path, sep="\t")


# Load the train / dev / test splits, one DataFrame per file.
df_snli_train = _load_snli_split("snli_1.0_train.txt")
df_snli_dev = _load_snli_split("snli_1.0_dev.txt")
df_snli_test = _load_snli_split("snli_1.0_test.txt")

In [45]:
def _keep_labeled_pairs(df_split):
    """Keep only the label and the two binary-parse columns of usable rows.

    Rows whose ``gold_label`` is '-' are dropped, as are rows with a missing
    value in any of the three kept columns.
    (Presumably '-' marks examples without an agreed label -- confirm against
    the SNLI documentation.)
    """
    selected = df_split[["gold_label", "sentence1_binary_parse", "sentence2_binary_parse"]]
    labeled = selected.query("gold_label != '-'")
    return labeled.dropna(how="any")


# Apply the same filtering to every split.
df_snli_train = _keep_labeled_pairs(df_snli_train)
df_snli_dev = _keep_labeled_pairs(df_snli_dev)
df_snli_test = _keep_labeled_pairs(df_snli_test)

In [46]:
# Preview the first rows of the filtered training split as a sanity check.
df_snli_train.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ..."
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) )


In [47]:
# Preview the first rows of the filtered dev split as a sanity check.
df_snli_dev.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse
0,neutral,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( The sisters ) ( ( are ( ( hugging goodbye ...
1,entailment,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( Two woman ) ( ( are ( holding packages ) )...
2,contradiction,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( The men ) ( ( are ( fighting ( outside ( a...
3,entailment,( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...,( ( ( Two kids ) ( in ( numbered jerseys ) ) )...
4,neutral,( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...,( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...


In [48]:
# Preview the first rows of the filtered test split as a sanity check.
df_snli_test.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse
0,neutral,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( The church ) ( ( has ( cracks ( in ( the c...
1,entailment,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( The church ) ( ( is ( filled ( with song )...
2,contradiction,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( ( A choir ) ( singing ( at ( a ( baseball ...
3,neutral,( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...,( ( The woman ) ( ( is young ) . ) )
4,entailment,( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...,( ( The woman ) ( ( is ( very happy ) ) . ) )


# Generate token strings from the binary parses and add them to each DataFrame as new columns

In [49]:
import re

def convert_binary_parse_to_tokens(binary_parse):
    """Flatten a parenthesised binary-parse string into a space-separated token string.

    Steps, in order:
      1. remove every "(" and ")" character,
      2. collapse each run of whitespace into a single space,
      3. replace the two-character quote markers `` and '' with a plain
         double quote ("),
      4. strip leading and trailing whitespace.

    Args:
        binary_parse: The parse string, e.g. "( ( The woman ) ( ( is young ) . ) )".

    Returns:
        The cleaned token string, e.g. "The woman is young .".

    Raises:
        TypeError: If ``binary_parse`` is not a string (raised by ``re.sub``).
    """
    # Raw-string patterns: "\(" / "\s" in a normal string literal are invalid
    # escape sequences and trigger a warning on recent Python versions.
    tokens = re.sub(r"[()]", "", binary_parse)  # drop both kinds of parentheses
    tokens = re.sub(r"\s+", " ", tokens)  # collapse whitespace runs
    tokens = re.sub(r"``|''", '"', tokens)  # normalise `` and '' to "
    return tokens.strip()

def concat_tokens_to_df(df):
    """Add ``tokens1`` and ``tokens2`` columns to ``df`` in place.

    ``tokens1`` / ``tokens2`` hold the result of
    ``convert_binary_parse_to_tokens`` applied to the
    ``sentence1_binary_parse`` / ``sentence2_binary_parse`` columns.

    Args:
        df: DataFrame containing the two ``*_binary_parse`` columns. It is
            modified in place.

    Returns:
        None.
    """
    # Convert each column directly instead of materialising every row of the
    # frame as a namedtuple (list(df.itertuples())) just to read two fields.
    df["tokens1"] = df["sentence1_binary_parse"].map(convert_binary_parse_to_tokens)
    df["tokens2"] = df["sentence2_binary_parse"].map(convert_binary_parse_to_tokens)

In [50]:
# Add the tokens1/tokens2 columns to the training split (the helper assigns
# the new columns on the frame it is given), then preview the result.
concat_tokens_to_df(df_snli_train)
df_snli_train.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,tokens1,tokens2
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,A person on a horse jumps over a broken down a...,A person is training his horse for a competiti...
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,A person on a horse jumps over a broken down a...,"A person is at a diner , ordering an omelette ."
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",A person on a horse jumps over a broken down a...,"A person is outdoors , on a horse ."
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...,Children smiling and waving at camera,They are smiling at their parents
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) ),Children smiling and waving at camera,There are children present


In [51]:
# Add the tokens1/tokens2 columns to the dev split (the helper assigns the
# new columns on the frame it is given), then preview the result.
concat_tokens_to_df(df_snli_dev)
df_snli_dev.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,tokens1,tokens2
0,neutral,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( The sisters ) ( ( are ( ( hugging goodbye ...,Two women are embracing while holding to go pa...,The sisters are hugging goodbye while holding ...
1,entailment,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( Two woman ) ( ( are ( holding packages ) )...,Two women are embracing while holding to go pa...,Two woman are holding packages .
2,contradiction,( ( Two women ) ( ( are ( embracing ( while ( ...,( ( The men ) ( ( are ( fighting ( outside ( a...,Two women are embracing while holding to go pa...,The men are fighting outside a deli .
3,entailment,( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...,( ( ( Two kids ) ( in ( numbered jerseys ) ) )...,"Two young children in blue jerseys , one with ...",Two kids in numbered jerseys wash their hands .
4,neutral,( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...,( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...,"Two young children in blue jerseys , one with ...",Two kids at a ballgame wash their hands .


In [52]:
# Add the tokens1/tokens2 columns to the test split (the helper assigns the
# new columns on the frame it is given), then preview the result.
concat_tokens_to_df(df_snli_test)
df_snli_test.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,tokens1,tokens2
0,neutral,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( The church ) ( ( has ( cracks ( in ( the c...,This church choir sings to the masses as they ...,The church has cracks in the ceiling .
1,entailment,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( The church ) ( ( is ( filled ( with song )...,This church choir sings to the masses as they ...,The church is filled with song .
2,contradiction,( ( This ( church choir ) ) ( ( ( sings ( to (...,( ( ( A choir ) ( singing ( at ( a ( baseball ...,This church choir sings to the masses as they ...,A choir singing at a baseball game .
3,neutral,( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...,( ( The woman ) ( ( is young ) . ) ),"A woman with a green headscarf , blue shirt an...",The woman is young .
4,entailment,( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...,( ( The woman ) ( ( is ( very happy ) ) . ) ),"A woman with a green headscarf , blue shirt an...",The woman is very happy .


In [53]:
# Write each processed split to a tab-separated file in the data directory.
for output_name, df_split in (
    ("snli_train.tsv", df_snli_train),
    ("snli_dev.tsv", df_snli_dev),
    ("snli_test.tsv", df_snli_test),
):
    output_path = os.path.join(DATA_DIR_PATH, output_name)
    df_split.to_csv(output_path, sep="\t")