In [1]:
!pip install torch==1.7.1 torchtext==0.8.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch

SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


In [3]:
torch.__version__


'1.7.1'

In [4]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
len(tokenizer)


30522

In [6]:
tokens = tokenizer.tokenize("Heyy There!! See some boys are playing in rain")
print(tokens)


['hey', '##y', 'there', '!', '!', 'see', 'some', 'boys', 'are', 'playing', 'in', 'rain']


In [7]:
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)


[4931, 2100, 2045, 999, 999, 2156, 2070, 3337, 2024, 2652, 1999, 4542]


In [8]:
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(cls_token, sep_token, pad_token, unk_token)


[CLS] [SEP] [PAD] [UNK]


In [9]:
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)


101 102 0 100


In [10]:
max_input_length = 256


def tokenize_bert(sentence):
    tokens = tokenizer.tokenize(sentence)
    return tokens


def split_and_cut(sentence):
    tokens = sentence.strip().split(" ")
    tokens = tokens[:max_input_length]
    return tokens


def trim_sentence(sent):
    try:
        sent = sent.split()
        sent = sent[:128]
        return " ".join(sent)
    except:
        return sent


In [11]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

from zipfile import ZipFile
file_name = "snli_1.0.zip"
with ZipFile(file_name, 'r') as zip:
    zip.printdir()
    zip.extractall()

--2023-04-06 06:09:53--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip.1’


2023-04-06 06:09:56 (25.4 MB/s) - ‘snli_1.0.zip.1’ saved [94550081/94550081]

File Name                                             Modified             Size
snli_1.0/                                      2015-08-29 08:57:10            0
snli_1.0/.DS_Store                             2015-08-29 08:57:16         6148
__MACOSX/                                      2015-08-29 09:00:04            0
__MACOSX/snli_1.0/                             2015-08-29 09:00:04            0
__MACOSX/snli_1.0/._.DS_Store                  2015-08-29 08:57:16          120
                                 2015-05-21 16:21:08            0
                      2015-05-21 16

In [12]:
# Get list of 0s
def get_sent1_token_type(sent):
    try:
        return [0] * len(sent)
    except:
        return []


# Get list of 1s
def get_sent2_token_type(sent):
    try:
        return [1] * len(sent)
    except:
        return []


# combine from lists
def combine_seq(seq):
    return " ".join(seq)


# combines from lists of int
def combine_mask(mask):
    mask = [str(m) for m in mask]
    return " ".join(mask)


In [48]:
import pandas as pd

# load dataset
df_train = pd.read_csv("snli_1.0/snli_1.0_train.txt", sep="\t", on_bad_lines="skip")
df_dev = pd.read_csv("snli_1.0/snli_1.0_dev.txt", sep="\t", on_bad_lines="skip")
df_test = pd.read_csv("snli_1.0/snli_1.0_test.txt", sep="\t", on_bad_lines="skip")


In [49]:
df_train.head()


Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,captionID,pairID,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,3416050480.jpg#4,3416050480.jpg#4r1n,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",3416050480.jpg#4,3416050480.jpg#4r1c,contradiction,,,,
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",3416050480.jpg#4,3416050480.jpg#4r1e,entailment,,,,
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...,Children smiling and waving at camera,They are smiling at their parents,2267923837.jpg#2,2267923837.jpg#2r1n,neutral,,,,
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) ),(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...,Children smiling and waving at camera,There are children present,2267923837.jpg#2,2267923837.jpg#2r1e,entailment,,,,


In [50]:
# Get neccesary columns
df_train = df_train[["gold_label", "sentence1", "sentence2"]]
df_dev = df_dev[["gold_label", "sentence1", "sentence2"]]
df_test = df_test[["gold_label", "sentence1", "sentence2"]]


In [51]:
# Take small dataset
df_train = df_train[:80000]
df_dev = df_train[:8000]
df_test = df_train[:8000]


In [52]:
# Trim each sentence upto maximum length
df_train["sentence1"] = df_train["sentence1"].apply(trim_sentence)
df_train["sentence2"] = df_train["sentence2"].apply(trim_sentence)
df_dev["sentence1"] = df_dev["sentence1"].apply(trim_sentence)
df_dev["sentence2"] = df_dev["sentence2"].apply(trim_sentence)
df_test["sentence1"] = df_test["sentence1"].apply(trim_sentence)
df_test["sentence2"] = df_test["sentence2"].apply(trim_sentence)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sentence1'] = df_dev['sentence1'].apply(trim_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sentence2'] = df_dev['sentence2'].apply(trim_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sentence1'] = df_test['sentence1'].apply(trim_sentence)
A value is try

In [53]:
# Add [CLS] and [SEP] tokens
df_train["sent1"] = "[CLS] " + df_train["sentence1"] + " [SEP] "
df_train["sent2"] = df_train["sentence2"] + " [SEP]"
df_dev["sent1"] = "[CLS] " + df_dev["sentence1"] + " [SEP] "
df_dev["sent2"] = df_dev["sentence2"] + " [SEP]"
df_test["sent1"] = "[CLS] " + df_test["sentence1"] + " [SEP] "
df_test["sent2"] = df_test["sentence2"] + " [SEP]"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent1'] = '[CLS] ' + df_dev['sentence1'] + ' [SEP] '
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent2'] = df_dev['sentence2'] + ' [SEP]'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sent1'] = '[CLS] ' + df_test['sentence1'] + ' [SEP] '
A value is trying to be set on a

In [54]:
# Apply Bert Tokenizer for tokeinizing
df_train["sent1_t"] = df_train["sent1"].apply(tokenize_bert)
df_train["sent2_t"] = df_train["sent2"].apply(tokenize_bert)
df_dev["sent1_t"] = df_dev["sent1"].apply(tokenize_bert)
df_dev["sent2_t"] = df_dev["sent2"].apply(tokenize_bert)
df_test["sent1_t"] = df_test["sent1"].apply(tokenize_bert)
df_test["sent2_t"] = df_test["sent2"].apply(tokenize_bert)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent1_t'] = df_dev['sent1'].apply(tokenize_bert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent2_t'] = df_dev['sent2'].apply(tokenize_bert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sent1_t'] = df_test['sent1'].apply(tokenize_bert)
A value is trying to be set on a

In [55]:
df_train.head()


Unnamed: 0,gold_label,sentence1,sentence2,sent1,sent2,sent1_t,sent2_t
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,[CLS] A person on a horse jumps over a broken ...,A person is training his horse for a competiti...,"[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, training, his, horse, for, a, ..."
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",[CLS] A person on a horse jumps over a broken ...,"A person is at a diner, ordering an omelette. ...","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, at, a, diner, ,, ordering, an,..."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",[CLS] A person on a horse jumps over a broken ...,"A person is outdoors, on a horse. [SEP]","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, outdoors, ,, on, a, horse, ., ..."
3,neutral,Children smiling and waving at camera,They are smiling at their parents,[CLS] Children smiling and waving at camera [S...,They are smiling at their parents [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[they, are, smiling, at, their, parents, [SEP]]"
4,entailment,Children smiling and waving at camera,There are children present,[CLS] Children smiling and waving at camera [S...,There are children present [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[there, are, children, present, [SEP]]"


In [56]:
# Get Topen type ids for both sentence
df_train["sent1_token_type"] = df_train["sent1_t"].apply(get_sent1_token_type)
df_train["sent2_token_type"] = df_train["sent2_t"].apply(get_sent2_token_type)
df_dev["sent1_token_type"] = df_dev["sent1_t"].apply(get_sent1_token_type)
df_dev["sent2_token_type"] = df_dev["sent2_t"].apply(get_sent2_token_type)
df_test["sent1_token_type"] = df_test["sent1_t"].apply(get_sent1_token_type)
df_test["sent2_token_type"] = df_test["sent2_t"].apply(get_sent2_token_type)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent1_token_type'] = df_dev['sent1_t'].apply(get_sent1_token_type)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sent2_token_type'] = df_dev['sent2_t'].apply(get_sent2_token_type)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sent1_token_type'] = df_test['sent1_t'].apply(

In [57]:
df_train.head()


Unnamed: 0,gold_label,sentence1,sentence2,sent1,sent2,sent1_t,sent2_t,sent1_token_type,sent2_token_type
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,[CLS] A person on a horse jumps over a broken ...,A person is training his horse for a competiti...,"[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, training, his, horse, for, a, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",[CLS] A person on a horse jumps over a broken ...,"A person is at a diner, ordering an omelette. ...","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, at, a, diner, ,, ordering, an,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",[CLS] A person on a horse jumps over a broken ...,"A person is outdoors, on a horse. [SEP]","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, outdoors, ,, on, a, horse, ., ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,neutral,Children smiling and waving at camera,They are smiling at their parents,[CLS] Children smiling and waving at camera [S...,They are smiling at their parents [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[they, are, smiling, at, their, parents, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
4,entailment,Children smiling and waving at camera,There are children present,[CLS] Children smiling and waving at camera [S...,There are children present [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[there, are, children, present, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"


In [58]:
# Combine both sequences
df_train["sequence"] = df_train["sent1_t"] + df_train["sent2_t"]
df_dev["sequence"] = df_dev["sent1_t"] + df_dev["sent2_t"]
df_test["sequence"] = df_test["sent1_t"] + df_test["sent2_t"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sequence'] = df_dev['sent1_t'] + df_dev['sent2_t']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sequence'] = df_test['sent1_t'] + df_test['sent2_t']


In [59]:
df_train.head()


Unnamed: 0,gold_label,sentence1,sentence2,sent1,sent2,sent1_t,sent2_t,sent1_token_type,sent2_token_type,sequence
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,[CLS] A person on a horse jumps over a broken ...,A person is training his horse for a competiti...,"[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, training, his, horse, for, a, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ..."
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",[CLS] A person on a horse jumps over a broken ...,"A person is at a diner, ordering an omelette. ...","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, at, a, diner, ,, ordering, an,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ..."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",[CLS] A person on a horse jumps over a broken ...,"A person is outdoors, on a horse. [SEP]","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, outdoors, ,, on, a, horse, ., ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ..."
3,neutral,Children smiling and waving at camera,They are smiling at their parents,[CLS] Children smiling and waving at camera [S...,They are smiling at their parents [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[they, are, smiling, at, their, parents, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca..."
4,entailment,Children smiling and waving at camera,There are children present,[CLS] Children smiling and waving at camera [S...,There are children present [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[there, are, children, present, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca..."


In [60]:
# Get attention mask
df_train["attention_mask"] = df_train["sequence"].apply(get_sent2_token_type)
df_dev["attention_mask"] = df_dev["sequence"].apply(get_sent2_token_type)
df_test["attention_mask"] = df_test["sequence"].apply(get_sent2_token_type)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['attention_mask'] = df_dev['sequence'].apply(get_sent2_token_type)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['attention_mask'] = df_test['sequence'].apply(get_sent2_token_type)


In [61]:
df_train.head()


Unnamed: 0,gold_label,sentence1,sentence2,sent1,sent2,sent1_t,sent2_t,sent1_token_type,sent2_token_type,sequence,attention_mask
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,[CLS] A person on a horse jumps over a broken ...,A person is training his horse for a competiti...,"[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, training, his, horse, for, a, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",[CLS] A person on a horse jumps over a broken ...,"A person is at a diner, ordering an omelette. ...","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, at, a, diner, ,, ordering, an,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",[CLS] A person on a horse jumps over a broken ...,"A person is outdoors, on a horse. [SEP]","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, outdoors, ,, on, a, horse, ., ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,neutral,Children smiling and waving at camera,They are smiling at their parents,[CLS] Children smiling and waving at camera [S...,They are smiling at their parents [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[they, are, smiling, at, their, parents, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,entailment,Children smiling and waving at camera,There are children present,[CLS] Children smiling and waving at camera [S...,There are children present [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[there, are, children, present, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [62]:
# Get combined token type ids for input
df_train["token_type"] = df_train["sent1_token_type"] + df_train["sent2_token_type"]
df_dev["token_type"] = df_dev["sent1_token_type"] + df_dev["sent2_token_type"]
df_test["token_type"] = df_test["sent1_token_type"] + df_test["sent2_token_type"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['token_type'] = df_dev['sent1_token_type'] + df_dev['sent2_token_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['token_type'] = df_test['sent1_token_type'] + df_test['sent2_token_type']


In [63]:
df_train.head()


Unnamed: 0,gold_label,sentence1,sentence2,sent1,sent2,sent1_t,sent2_t,sent1_token_type,sent2_token_type,sequence,attention_mask,token_type
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,[CLS] A person on a horse jumps over a broken ...,A person is training his horse for a competiti...,"[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, training, his, horse, for, a, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",[CLS] A person on a horse jumps over a broken ...,"A person is at a diner, ordering an omelette. ...","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, at, a, diner, ,, ordering, an,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",[CLS] A person on a horse jumps over a broken ...,"A person is outdoors, on a horse. [SEP]","[[CLS], a, person, on, a, horse, jumps, over, ...","[a, person, is, outdoors, ,, on, a, horse, ., ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[[CLS], a, person, on, a, horse, jumps, over, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,neutral,Children smiling and waving at camera,They are smiling at their parents,[CLS] Children smiling and waving at camera [S...,They are smiling at their parents [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[they, are, smiling, at, their, parents, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]"
4,entailment,Children smiling and waving at camera,There are children present,[CLS] Children smiling and waving at camera [S...,There are children present [SEP],"[[CLS], children, smiling, and, waving, at, ca...","[there, are, children, present, [SEP]]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[[CLS], children, smiling, and, waving, at, ca...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]"


In [64]:
# Now make all these inputs as sequential data to be easily fed into torchtext Field.
df_train["sequence"] = df_train["sequence"].apply(combine_seq)
df_dev["sequence"] = df_dev["sequence"].apply(combine_seq)
df_test["sequence"] = df_test["sequence"].apply(combine_seq)
df_train["attention_mask"] = df_train["attention_mask"].apply(combine_mask)
df_dev["attention_mask"] = df_dev["attention_mask"].apply(combine_mask)
df_test["attention_mask"] = df_test["attention_mask"].apply(combine_mask)
df_train["token_type"] = df_train["token_type"].apply(combine_mask)
df_dev["token_type"] = df_dev["token_type"].apply(combine_mask)
df_test["token_type"] = df_test["token_type"].apply(combine_mask)
df_train = df_train[["gold_label", "sequence", "attention_mask", "token_type"]]
df_dev = df_dev[["gold_label", "sequence", "attention_mask", "token_type"]]
df_test = df_test[["gold_label", "sequence", "attention_mask", "token_type"]]
df_train = df_train.loc[df_train["gold_label"].isin(["entailment", "contradiction", "neutral"])]
df_dev = df_dev.loc[df_dev["gold_label"].isin(["entailment", "contradiction", "neutral"])]
df_test = df_test.loc[df_test["gold_label"].isin(["entailment", "contradiction", "neutral"])]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['sequence'] = df_dev['sequence'].apply(combine_seq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['sequence'] = df_test['sequence'].apply(combine_seq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['attention_mask'] = df_dev['attention_mask'].apply(combine_mask)
A value is tr

In [65]:
df_train.head()


Unnamed: 0,gold_label,sequence,attention_mask,token_type
0,neutral,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1,contradiction,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 ...
2,entailment,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
3,neutral,[CLS] children smiling and waving at camera [S...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
4,entailment,[CLS] children smiling and waving at camera [S...,1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 1 1 1 1 1


In [66]:
# Save prepared data as csv file
df_train.to_csv("snli_1.0/snli_1.0_train.csv", index=False)
df_dev.to_csv("snli_1.0/snli_1.0_dev.csv", index=False)
df_test.to_csv("snli_1.0/snli_1.0_test.csv", index=False)


In [67]:
# To convert back attention mask and token type ids to integer.
def convert_to_int(tok_ids):
    tok_ids = [int(x) for x in tok_ids]
    return tok_ids


In [68]:
import torchtext

print(torchtext.__version__)


0.8.1


In [69]:
# For latest version use torchtext.legacy
from torchtext import data

# For sequence
TEXT = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=split_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)
# For label
LABEL = data.LabelField()
# For Attention mask
ATTENTION = data.Field(
    batch_first=True, use_vocab=False, tokenize=split_and_cut, preprocessing=convert_to_int, pad_token=pad_token_idx
)
# For token type ids
TTYPE = data.Field(batch_first=True, use_vocab=False, tokenize=split_and_cut, preprocessing=convert_to_int, pad_token=1)




In [70]:
LABEL


<torchtext.data.field.LabelField at 0x7f7a23bcfeb0>

In [71]:
fields = [("label", LABEL), ("sequence", TEXT), ("attention_mask", ATTENTION), ("token_type", TTYPE)]


In [72]:
def label_default(value):
    if value == "":
        return "neutral"
    else:
        return value


In [73]:
LABEL.preprocessing = label_default


In [74]:
train_data, valid_data, test_data = data.TabularDataset.splits(
    path="snli_1.0",
    train="snli_1.0_train.csv",
    validation="snli_1.0_dev.csv",
    test="snli_1.0_test.csv",
    format="csv",
    fields=fields,
    skip_header=True,
)




In [75]:
print(f"Train data {len(train_data)}")
print(f"Validation data {len(valid_data)}")
print(f"Test data {len(test_data)}")


Train data 79915
Validation data 7990
Test data 7990


In [76]:
LABEL.build_vocab(train_data)


In [77]:
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.sequence),
    sort_within_batch=False,
    device=device,
)




In [80]:
from transformers import BertModel

bert_model = BertModel.from_pretrained("bert-base-uncased")


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [81]:
import torch.nn as nn


class BERTNLIModel(nn.Module):
    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()
        self.bert = bert_model

        embedding_dim = bert_model.config.to_dict()["hidden_size"]
        self.out = nn.Linear(embedding_dim, output_dim)

    def forward(self, sequence, attn_mask, token_type):
        embedded = self.bert(input_ids=sequence, attention_mask=attn_mask, token_type_ids=token_type)[1]
        output = self.out(embedded)
        return output


In [82]:
# defining model
HIDDEN_DIM = 512
OUTPUT_DIM = len(LABEL.vocab)
model = BERTNLIModel(bert_model, HIDDEN_DIM, OUTPUT_DIM).to(device)


In [83]:
sum(p.numel() for p in model.parameters() if p.requires_grad)


109484547

In [84]:
!git clone https://github.com/NVIDIA/apex
!pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Cloning into 'apex'...
remote: Enumerating objects: 10936, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 10936 (delta 20), reused 26 (delta 4), pack-reused 10874[K
Receiving objects: 100% (10936/10936), 15.29 MiB | 9.60 MiB/s, done.
Resolving deltas: 100% (7562/7562), done.
[0mUsing pip 22.0.4 from /usr/local/lib/python3.9/dist-packages/pip (python 3.9)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing ./apex
  Running command python setup.py egg_info


  torch.__version__  = 1.7.1


  running egg_info
  creating /tmp/pip-pip-egg-info-5zige502/apex.egg-info
  writing /tmp/pip-pip-egg-info-5zige502/apex.egg-info/PKG-INFO
  writing dependency_links to /tmp/pip-pip-egg-info-5zige502/apex.egg-info/dependency_links.txt
  writing requirements to /tmp/pip-pip-egg-info-5zige502/apex.egg-info/requires.txt
  writing top-level names to /tmp/pip-pip-egg-info-5

In [91]:
from transformers import AdamW, get_constant_schedule_with_warmup
import torch.optim as optim

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, correct_bias=False)


def get_scheduler(optimizer, warmup_steps):
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    return scheduler


criterion = nn.CrossEntropyLoss().to(device)


In [92]:
def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim=1, keepdim=True)

    correct = (max_preds.squeeze(1) == y).float()

    return correct.sum() / len(y)


In [93]:
max_grad_norm = 1


def train(model, iterator, optimizer, criterion, scheduler):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients first
        torch.cuda.empty_cache()  # releases all unoccupied cached memory
        sequence = batch.sequence
        attn_mask = batch.attention_mask
        token_type = batch.token_type
        label = batch.label
        predictions = model(sequence, attn_mask, token_type)
        loss = criterion(predictions, label)
        acc = categorical_accuracy(predictions, label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [94]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            sequence = batch.sequence
            attn_mask = batch.attention_mask
            token_type = batch.token_type
            labels = batch.label
            predictions = model(sequence, attn_mask, token_type)
            loss = criterion(predictions, labels)
            acc = categorical_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [95]:
import time


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [98]:
import math

N_EPOCHS = 6
warmup_percent = 0.2
total_steps = math.ceil(N_EPOCHS * len(train_iterator) * 1.0 / BATCH_SIZE)
warmup_steps = int(total_steps * warmup_percent)
scheduler = get_scheduler(optimizer, warmup_steps)
best_valid_loss = float("inf")
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "bert-nli.pt")
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%")




Epoch: 01 | Epoch Time: 13m 35s
tTrain Loss: 0.493 | Train Acc: 80.61%
t Val. Loss: 0.265 | Val. Acc: 90.82%
Epoch: 02 | Epoch Time: 13m 32s
tTrain Loss: 0.302 | Train Acc: 89.18%
t Val. Loss: 0.172 | Val. Acc: 94.73%
Epoch: 03 | Epoch Time: 13m 33s
tTrain Loss: 0.205 | Train Acc: 92.91%
t Val. Loss: 0.098 | Val. Acc: 97.28%
Epoch: 04 | Epoch Time: 13m 28s
tTrain Loss: 0.145 | Train Acc: 95.17%
t Val. Loss: 0.074 | Val. Acc: 97.78%
Epoch: 05 | Epoch Time: 13m 27s
tTrain Loss: 0.110 | Train Acc: 96.29%
t Val. Loss: 0.056 | Val. Acc: 98.32%
Epoch: 06 | Epoch Time: 13m 26s
tTrain Loss: 0.089 | Train Acc: 97.06%
t Val. Loss: 0.034 | Val. Acc: 98.96%


In [99]:
torch.__version__


'1.7.1'

In [100]:
model.load_state_dict(torch.load("bert-nli.pt"))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%")




Test Loss: 0.034 |  Test Acc: 98.96%


In [101]:
import pickle

with open("bert-nli-tokens.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [None]:
def predict_inference(premise, hypothesis, model, device):
    model.eval()
    premise = "[CLS] " + premise + " [SEP]"
    hypothesis = hypothesis + " [SEP]"
    prem_t = tokenize_bert(premise)
    hypo_t = tokenize_bert(hypothesis)
    prem_type = get_sent1_token_type(prem_t)
    hypo_type = get_sent2_token_type(hypo_t)
    indexes = prem_t + hypo_t
    indexes = tokenizer.convert_tokens_to_ids(indexes)
    indexes_type = prem_type + hypo_type
    attn_mask = get_sent2_token_type(indexes)
    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)
    indexes_type = torch.LongTensor(indexes_type).unsqueeze(0).to(device)
    attn_mask = torch.LongTensor(attn_mask).unsqueeze(0).to(device)
    prediction = model(indexes, attn_mask, indexes_type)
    prediction = prediction.argmax(dim=-1).item()
    return LABEL.vocab.itos[prediction]
