In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally before it is read.
nltk.download('stopwords')
# English stopword list; preprocess() below filters tokens against it.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/athiyadeviyani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Show which files are present in the local 'snli' directory before loading.
os.listdir('snli')

['snli_1.0_train.txt',
 '.DS_Store',
 'Icon\r',
 'snli_1.0_test.jsonl',
 'README.txt',
 'snli_1.0_dev.txt',
 'snli_1.0_dev.jsonl',
 'snli_1.0_test.txt',
 'snli_1.0_train.jsonl']

In [4]:
# Read the three tab-separated split files in one pass; the order of the
# paths matches the order of the names being unpacked.
trainDF, testDF, valDF = (
    pd.read_table(split_path, sep='\t')
    for split_path in (
        'snli/snli_1.0_train.txt',
        'snli/snli_1.0_test.txt',
        'snli/snli_1.0_dev.txt',
    )
)

In [5]:
# Preview the first rows of the training split to see which columns exist.
trainDF.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,captionID,pairID,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,3416050480.jpg#4,3416050480.jpg#4r1n,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",3416050480.jpg#4,3416050480.jpg#4r1c,contradiction,,,,
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",3416050480.jpg#4,3416050480.jpg#4r1e,entailment,,,,
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...,Children smiling and waving at camera,They are smiling at their parents,2267923837.jpg#2,2267923837.jpg#2r1n,neutral,,,,
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) ),(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...,Children smiling and waving at camera,There are children present,2267923837.jpg#2,2267923837.jpg#2r1e,entailment,,,,


In [7]:
# Keep only the two sentence columns and give them task-oriented names.
train = (
    trainDF[['sentence1', 'sentence2']]
    .rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis'})
)
train.head()

Unnamed: 0,premise,hypothesis
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,Children smiling and waving at camera,They are smiling at their parents
4,Children smiling and waving at camera,There are children present


## Preprocess

For preprocessing, lowercase and tokenize the data, then remove stopwords. Note that there are duplicate premises and hypotheses in the data; remove these and look only at unique utterances.

Tokenize

In [8]:
# Raw string so the `\w` escape reaches the regex engine as written instead of
# relying on Python passing an unrecognised string escape through (which emits
# a warning on recent Python versions). The pattern is a lowercase ASCII letter
# followed by one or more word characters, so every token is at least two
# characters long and always starts with a letter.
tokenizer = RegexpTokenizer(r'[a-z]\w+')

# Frozen set view of the stopword list: membership tests become constant-time
# instead of a linear scan per token. `stop_words` itself is left untouched.
_stop_word_set = frozenset(stop_words)


def preprocess(sentence):
    """Lowercase, tokenize and stopword-filter a single sentence.

    Parameters
    ----------
    sentence : str, float or None
        Raw sentence text. A float or None is treated as a missing value
        (presumably NaN coming from an empty cell in the loaded table).

    Returns
    -------
    list of str
        Tokens matched by ``tokenizer`` on the lowercased text, with English
        stopwords removed. An empty list for a missing value.

    Raises
    ------
    AttributeError
        If ``sentence`` is some other non-string object (for example an
        already-tokenized list). This is deliberate: it makes an accidental
        second pass over processed data fail loudly rather than silently
        wiping it.
    """
    # Guard clause for missing values; everything else is expected to be text.
    if sentence is None or isinstance(sentence, float):
        return []
    tokens = tokenizer.tokenize(sentence.lower())
    return [token for token in tokens if token not in _stop_word_set]


preprocess('A person on a horse jumps over a broken down airplane.')

['person', 'horse', 'jumps', 'broken', 'airplane']

Remove duplicates

In [9]:
# Drop rows whose (premise, hypothesis) pair is an exact repeat of an earlier row.
train = train.drop_duplicates()

In [10]:
# Per-column summary (count / unique / top / freq) of the de-duplicated pairs.
train.describe()

Unnamed: 0,premise,hypothesis
count,549526,549524
unique,150736,480040
top,A dog in a field.,A man is sleeping.
freq,32,335


Preprocess everything

In [11]:
# Replace both text columns with their token lists in a single step.
train = train.assign(
    premise=train["premise"].map(preprocess),
    hypothesis=train["hypothesis"].map(preprocess),
)

In [12]:
# Spot-check that both columns now hold token lists.
train.head()

Unnamed: 0,premise,hypothesis
0,"[person, horse, jumps, broken, airplane]","[person, training, horse, competition]"
1,"[person, horse, jumps, broken, airplane]","[person, diner, ordering, omelette]"
2,"[person, horse, jumps, broken, airplane]","[person, outdoors, horse]"
3,"[children, smiling, waving, camera]","[smiling, parents]"
4,"[children, smiling, waving, camera]","[children, present]"


In [13]:
# Write the processed pairs as 'train_parsed.csv' inside the archive 'out.zip'.
# NOTE(review): the list-valued columns are written as their string form, so a
# consumer has to parse them back into lists when reading the file.
compression_opts = {'method': 'zip', 'archive_name': 'train_parsed.csv'}
train.to_csv(
    'out.zip',
    index=False,
    compression=compression_opts,
)