# Preparing data

In [None]:
import pandas as pd
import json

In [None]:
# Load the training file; `lines=True` tells pandas the file holds one JSON
# object per line (JSON Lines) rather than a single JSON document.
data_path = "../data/nli/multinli_1.0_train.jsonl"
data = pd.read_json(data_path, lines=True)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Tally how many rows carry each value of `gold_label`.
to_count = {}
for gold in data['gold_label']:
    to_count[gold] = to_count.get(gold, 0) + 1
to_count

In [None]:
# Number of rows whose gold label is the placeholder '-'.
_c = sum(1 for gold in data['gold_label'] if gold == '-')
_c

In [None]:
# One (premise, hypothesis, gold label) tuple per row, in row order.
pair_and_label = list(
    zip(data['sentence1'], data['sentence2'], data['gold_label'])
)

In [None]:
# Show how many (premise, hypothesis, label) tuples were built.
len(pair_and_label)

In [None]:
# Store the tuples as a column so every feature function can be run with `.apply`.
data['pair_label'] = pair_and_label

In [None]:
# Preview the frame with the new `pair_label` column.
data.head()

# Feature extraction

## Lexical Overlap

In [None]:
def lexical_overlap(pair_label):
    """Rate a pair by how much of the hypothesis vocabulary is in the premise.

    Args:
        pair_label: Sequence ``(premise, hypothesis, label)`` of strings.

    Returns:
        ``'easy'`` when at least 80% of the hypothesis tokens also occur in
        the premise and the label is ``'entailment'``; ``'hard'`` when the
        overlap holds but the label is ``'neutral'`` or ``'contradiction'``;
        ``'-'`` when the overlap threshold is not met, the hypothesis has no
        tokens, or the label is anything else.
    """
    premise, hypothesis, label = pair_label[0], pair_label[1], pair_label[2]

    # Stand-alone sentence-final marks are not counted as words.
    sentence_marks = (".", "?", "!")
    prem_vocab = {w.lower() for w in premise.split() if w not in sentence_marks}
    hyp_words = [w.lower() for w in hypothesis.split() if w not in sentence_marks]

    # Without hypothesis tokens the 80% test would be vacuously true.
    if not hyp_words:
        return '-'

    shared = sum(1 for w in hyp_words if w in prem_vocab)
    if shared < len(hyp_words) * 80 / 100:
        return '-'

    # The verdict depends on the gold label (the original compared the
    # premise text against the label names, so it could never match).
    if label == 'entailment':
        return 'easy'
    if label in ('neutral', 'contradiction'):
        return 'hard'
    return '-'

In [None]:
# Spot-check the feature on the first row.
lexical_overlap(data['pair_label'][0])

In [None]:
# Compute the lexical-overlap feature for every row.
data['lexical_overlap'] = data['pair_label'].apply(lexical_overlap)

In [None]:
# Preview the frame with the `lexical_overlap` column.
data.head()

## Word Swapping

In [None]:
def all_count(pair_label):
    """Return the pair wrapped in a list if premise and hypothesis share a word.

    Args:
        pair_label: Sequence whose first two items are the premise and the
            hypothesis strings.

    Returns:
        ``[(premise, hypothesis)]`` (original, un-normalised strings) when at
        least one lower-cased hypothesis token also occurs in the premise,
        otherwise an empty list.
    """
    premise = pair_label[0]
    hypothesis = pair_label[1]

    # Stand-alone sentence-final marks are not counted as words.
    sentence_marks = (".", "?", "!")
    prem_vocab = {w.lower() for w in premise.split() if w not in sentence_marks}
    hyp_words = (w.lower() for w in hypothesis.split() if w not in sentence_marks)

    if any(word in prem_vocab for word in hyp_words):
        return [(premise, hypothesis)]
    return []

In [None]:
# `Levenshtein.distance` is used when the package is installed; otherwise a
# pure-Python edit distance keeps the feature usable.
try:
    from Levenshtein import distance as _edit_distance
except ImportError:
    def _edit_distance(left, right):
        """Return the Levenshtein edit distance between two strings."""
        if len(left) < len(right):
            left, right = right, left
        previous = list(range(len(right) + 1))
        for row, left_char in enumerate(left, start=1):
            current = [row]
            for col, right_char in enumerate(right, start=1):
                current.append(min(
                    previous[col] + 1,                             # deletion
                    current[col - 1] + 1,                          # insertion
                    previous[col - 1] + (left_char != right_char), # substitution
                ))
            previous = current
        return previous[-1]


def detect_swapping(prem_and_hypo):
    """Detect whether the words shared by premise and hypothesis are reordered.

    Both sentences are lower-cased, split on whitespace and reduced to the
    tokens they have in common. The two reduced strings are then compared.

    Args:
        prem_and_hypo: Sequence whose first two items are the premise and
            the hypothesis strings.

    Returns:
        ``"-"`` when the sentences share no token, or each reduced sentence
        is a single token; ``"Not Swap"`` when the edit distance between the
        reduced strings equals their length difference; ``"Swap"`` otherwise.
    """
    tokens_prem = prem_and_hypo[0].lower().split()
    tokens_hypo = prem_and_hypo[1].lower().split()

    shared = set(tokens_prem).intersection(tokens_hypo)
    kept_prem = [tok for tok in tokens_prem if tok in shared]
    kept_hypo = [tok for tok in tokens_hypo if tok in shared]

    if len(kept_prem) == 1 and len(kept_hypo) == 1:
        return "-"
    if not kept_prem and not kept_hypo:
        return "-"

    new_prem = " ".join(kept_prem)
    new_hypo = " ".join(kept_hypo)

    # If the distance is fully explained by the length difference, the
    # shared words were only added/removed, not rearranged.
    dif = abs(len(new_prem) - len(new_hypo))
    distance = _edit_distance(new_prem, new_hypo)
    return "Not Swap" if dif == distance else "Swap"

In [None]:
# Spot-check: the hypothesis keeps the shared words in the premise's order.
detect_swapping(("Carl Newton and I wrote a letter", "Carl wrote a letter"))

In [None]:
# Count how many pairs fall into each detect_swapping outcome; anything that
# is neither "Swap" nor "Not Swap" is counted in `dash`.
tally = {"Swap": 0, "Not Swap": 0}
dash = 0
for pair in data['pair_label']:
    verdict = detect_swapping(pair)
    if verdict in tally:
        tally[verdict] += 1
    else:
        dash += 1
swap, not_swap = tally["Swap"], tally["Not Swap"]

In [None]:
# Compute the word-swapping feature for every row.
data['word_swapping'] = data['pair_label'].apply(detect_swapping)

In [None]:
# Preview the frame with the `word_swapping` column.
data.head()

## Hypothesis Length

In [None]:
from sklearn import preprocessing
import numpy as np

In [None]:
def hypo_len(pair_label):
    """Return the number of whitespace-separated tokens in the hypothesis.

    The hypothesis is the second item of ``pair_label``.
    """
    hypothesis = pair_label[1]
    # str.split() with no separator already ignores surrounding whitespace.
    return sum(1 for _ in hypothesis.split())

In [None]:
# Raw token count of each hypothesis (temporary column, normalised below).
data['hypothesis length'] = data['pair_label'].apply(hypo_len)

In [None]:
# Wrapping the Series in a list yields a 2-D array with a single row.
length = np.array([data['hypothesis length']])
length

In [None]:
# Reshape the lengths into a single column and min-max scale them.
length = np.asarray(data['hypothesis length']).reshape(-1, 1)
scaler = preprocessing.MinMaxScaler()
scale = scaler.fit_transform(length)
scale

In [None]:
# Flatten the (n, 1) scaled array into a plain list, one value per row.
scale_list = [row[0] for row in scale]

In [None]:
# Store the scaled hypothesis length as the final feature column.
data['hypo_len'] = scale_list

In [None]:
# The raw count is no longer needed once the scaled column exists.
data = data.drop("hypothesis length", axis=1)

In [None]:
# Preview the frame with the `hypo_len` column.
data.head()

## Negation

In [None]:
def have_negation(pair_label):
    """Rate a pair by whether either sentence contains a negation cue.

    Cues are matched as case-sensitive substrings of the raw sentences;
    some cues carry surrounding spaces so they only match whole words
    inside a sentence.

    Args:
        pair_label: Sequence ``(premise, hypothesis, label)`` of strings.

    Returns:
        ``'-'`` when no cue occurs in either sentence; otherwise ``'easy'``
        if the label is ``'contradiction'`` and ``'hard'`` for any other
        label.
    """
    premise = pair_label[0]
    hypothesis = pair_label[1]
    label = pair_label[2]

    keywords = (" not ", " no ", "n't", "none", "nobody", "nothing",
                "neither", "nowhere", "never", "cannot", " nor ")

    # NOTE(review): sentence-initial or capitalised cues ("No ", "Never")
    # are not matched by these patterns — confirm whether that is intended.
    has_cue = any(key in premise or key in hypothesis for key in keywords)
    if not has_cue:
        return '-'
    return 'easy' if label.strip() == 'contradiction' else 'hard'

In [None]:
# Spot-check: a negated hypothesis with a non-contradiction label.
have_negation(("I do miss you", "I do not miss you", "neutral"))

In [None]:
# Compute the negation feature for every row.
data['negation'] = data['pair_label'].apply(have_negation)

In [None]:
# Preview the frame with the `negation` column.
data.head()

## Subsequence

In [None]:
def subseq(pair_label):
    """Rate a pair by whether the hypothesis text is contained in the premise.

    Both sentences are lower-cased, stand-alone ".", "?" and "!" tokens are
    dropped, and the remaining tokens are re-joined with single spaces. The
    containment test is a plain substring check on those strings.

    Args:
        pair_label: Sequence ``(premise, hypothesis, label)`` of strings.

    Returns:
        ``'-'`` when the hypothesis string is not a substring of the premise
        string; otherwise ``'easy'`` for label ``'entailment'`` and
        ``'hard'`` for any other label.
    """
    premise, hypothesis, label = pair_label[0], pair_label[1], pair_label[2]

    def _normalise(sentence):
        kept = (tok.lower() for tok in sentence.split()
                if tok not in (".", "?", "!"))
        return " ".join(kept)

    if _normalise(hypothesis) not in _normalise(premise):
        return '-'
    return 'easy' if label == 'entailment' else 'hard'

In [None]:
# Compute the subsequence feature for every row.
data['subsequence'] = data['pair_label'].apply(subseq)

In [None]:
# Preview the frame with the `subsequence` column.
data.head()

## Constituent

In [None]:
# Open the tab-separated text dump used for the constituent feature. The
# encoding is set explicitly so decoding does not depend on the platform
# default (pandas reads the JSONL variant as UTF-8 by default).
fi = open("../data/nli/multinli_1.0_train.txt", "r", encoding="utf-8")

In [None]:
def parse_phrase_list(parse, phrases):
    """Collect the text of every bracketed phrase in a space-tokenised parse.

    The parse is a string whose tokens are words, ``(`` and ``)`` separated
    by whitespace. Each pass records every innermost ``( ... )`` group as a
    phrase, removes that pair of brackets, and repeats on the result until
    nothing is left.

    Args:
        parse: The bracketed parse string (may be empty).
        phrases: List that receives the phrases; it is extended in place.

    Returns:
        The same ``phrases`` list, with one entry per bracket pair found
        (innermost phrases first; duplicates are kept).
    """
    phrase_list = phrases

    while parse != "":
        words = parse.split()
        this_phrase = []
        next_level_parse = []
        for word in words:
            if word == "(":
                # A new group starts: whatever was pending moves up a level.
                next_level_parse += this_phrase
                this_phrase = ["("]
            elif word == ")" and len(this_phrase) > 0 and this_phrase[0] == "(":
                # Innermost group closed: record it and drop its brackets.
                phrase_list.append(" ".join(this_phrase[1:]))
                next_level_parse += this_phrase[1:]
                this_phrase = []
            elif word == ")":
                # Closes an outer group; keep the bracket for a later pass.
                next_level_parse += this_phrase
                next_level_parse.append(")")
                this_phrase = []
            else:
                this_phrase.append(word)

        # An unmatched ")" is never consumed; stop instead of looping forever.
        if next_level_parse == words:
            break
        parse = " ".join(next_level_parse)

    return phrase_list

In [None]:
# Collect every (premise, hypothesis, label) whose normalised hypothesis is
# exactly one of the bracketed phrases of the parse stored in column 1.
constituent = []
# Never incremented in this cell; kept so the names stay defined.
count_entailment = 0
count_neutral = 0
count_contradiction = 0
counter = 0  # number of lines read, header included

_sentence_marks = (".", "?", "!")
_strip_marks = str.maketrans("", "", ".?!")
_known_labels = ("entailment", "neutral", "contradiction")

# `with fi:` closes the handle opened earlier once the file has been consumed
# (re-running this cell therefore requires re-opening the file first).
with fi:
    for counter, line in enumerate(fi, start=1):
        if counter == 1:
            continue  # header row

        parts = line.strip().split("\t")
        if len(parts) < 7:
            continue  # malformed/short line: columns 5 and 6 are required

        premise = parts[5]
        hypothesis = parts[6]
        label = parts[0]
        parse = parts[1]

        parse_new = [word.lower() for word in parse.split()
                     if word not in _sentence_marks]
        all_phrases = parse_phrase_list(" ".join(parse_new), [])

        # Drop stand-alone marks, then strip marks glued to a word.
        hyp_words = [word.lower().translate(_strip_marks)
                     for word in hypothesis.split()
                     if word not in _sentence_marks]
        hyp_filtered = " ".join(hyp_words)

        if hyp_filtered in all_phrases and label in _known_labels:
            constituent.append((premise, hypothesis, label))

In [None]:
def cons(pair_label):
    """Rate a pair by whether it was collected in the `constituent` list.

    Args:
        pair_label: Tuple ``(premise, hypothesis, label)``.

    Returns:
        ``'-'`` when the tuple is not in ``constituent``; otherwise
        ``'easy'`` for label ``'entailment'`` and ``'hard'`` for any other.
    """
    # NOTE(review): `constituent` is a list, so this membership test scans it
    # on every call.
    if pair_label not in constituent:
        return '-'
    return 'easy' if pair_label[2] == 'entailment' else 'hard'

In [None]:
# Compute the constituent feature for every row.
data['constituent'] = data['pair_label'].apply(cons)

In [None]:
# Preview the frame with the `constituent` column.
data.head()

## Antonyms

In [None]:
from urllib.request import urlopen as uReq
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Shared NLTK normalisers: a WordNet lemmatizer and a Porter stemmer.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
# English stop-word list from NLTK, used to skip words when building the antonym vocabulary.
stop_words = stopwords.words('english')

In [None]:
# Outcome counters for the thesaurus requests. They live at module level so
# they accumulate across calls and can be reported once the crawl is done.
successful = 0
error404 = 0
etc = 0


def find_antonyms(string):
    """Look up the antonyms of a word or phrase on thesaurus.plus.

    The page is expected to contain ``<ul class="list paper">`` lists; the
    second such list is read as the antonym list.

    Args:
        string: Word or phrase; surrounding whitespace is removed and inner
            spaces become underscores to form the URL.

    Returns:
        List of antonym strings, empty when the request fails or the page
        has fewer than two such lists.

    Side effects:
        Increments the module-level ``successful`` / ``error404`` / ``etc``
        counters according to the HTTP status, and prints request errors.
    """
    global successful, error404, etc

    fixed_string = string.strip().replace(" ", "_")
    my_url = f'https://thesaurus.plus/thesaurus/{fixed_string}'

    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        res = requests.get(my_url, timeout=10)
    except requests.RequestException as e:
        print(e)
        return []
    res.encoding = "utf-8"

    if res.status_code == 200:
        successful += 1
    elif res.status_code == 404:
        error404 += 1
    else:
        etc += 1

    soup = BeautifulSoup(res.text, 'html.parser')
    nyms = soup.find_all("ul", {"class": "list paper"})
    if len(nyms) < 2:
        return []

    return [item.text.strip() for item in nyms[1].find_all("div", "list_item")]

In [None]:
# Build the vocabulary of premise words to look up: one key per lemma, each
# mapped to an empty list.
all_antonyms = {}
# Any token containing one of these characters is skipped.
_blocked_chars = set(" \"?()/'$;:[]")
for i in range(len(word_overlap)):
    for raw in word_overlap['sentence1'][i].split():
        token = raw.replace(".", "").replace(",", "").replace("-", " ").replace("'s", "")
        token = token.lower().strip()
        if token in all_antonyms or token in stop_words:
            continue
        if token.isdigit() or _blocked_chars.intersection(token):
            continue
        all_antonyms[lemmatizer.lemmatize(token)] = []

In [None]:
# Ordered list of the vocabulary keys, and its size.
antonymy = list(all_antonyms)
len(antonymy)

In [None]:
# Query the thesaurus for every vocabulary word; `count` tracks how many
# words have been processed.
count = 0
every_antonym = {}
for word in tqdm(antonymy):
    every_antonym[word] = []
    result = find_antonyms(word.strip())
    if result:
        every_antonym[word].extend(result)
    count += 1

In [None]:
# Report the outcome of the thesaurus requests.
# NOTE(review): `successful`, `error404` and `etc` must exist as module-level
# names when this cell runs — confirm where they are defined.
print(f'Successful: {successful}')
print(f'Error 404 page not found: {error404}')
print(f'Not both 200 and 404: {etc}')

In [None]:
# Inspect the word -> antonyms mapping.
every_antonym

In [None]:
def antonym(pair_label):
    """Rate a pair by whether the hypothesis contains an antonym of a premise word.

    Antonyms come from the module-level ``every_antonym`` mapping (word ->
    list of antonyms). ``all_antonyms`` only ever maps words to empty lists,
    so reading it would make this feature always ``'-'``.

    Args:
        pair_label: Sequence ``(premise, hypothesis, label)`` of strings.

    Returns:
        ``'-'`` when no premise token has an antonym among the hypothesis
        tokens; otherwise ``'easy'`` for label ``'contradiction'`` and
        ``'hard'`` for any other label.
    """
    premise = pair_label[0]
    hypothesis = pair_label[1]
    label = pair_label[2]

    # The mapping keys are lower-cased, so the tokens are lower-cased too.
    hypo_tokens = set(hypothesis.lower().split())
    for token in premise.lower().split():
        for ant in every_antonym.get(token, ()):
            if ant.lower() in hypo_tokens:
                return 'easy' if label == 'contradiction' else 'hard'
    return '-'

In [None]:
# Compute the antonym feature for every row.
data['antonym'] = data['pair_label'].apply(antonym)

In [None]:
# Preview the frame with the `antonym` column.
data.head()

## Overlapping Score

### PMI

In [None]:
import re

In [None]:
# Strip ".", ",", "!" and "?" from both sentences. The pattern is a regex
# character class, so it must be applied with regex=True; with regex=False
# pandas searches for the literal text "[.,!?]" and changes nothing.
pat = r"[.,!?]"
word_overlap['sentence1'] = word_overlap['sentence1'].str.replace(pat, "", regex=True)
word_overlap['sentence2'] = word_overlap['sentence2'].str.replace(pat, "", regex=True)

In [None]:
# Per-class token counts: pmi[label][token] = occurrences of the lower-cased
# token in premises and hypotheses of rows with that gold label.
pmi = {"entailment": {}, "neutral": {}, "contradiction": {}}
for prem_text, hypo_text, gold in zip(
        data['sentence1'], data['sentence2'], data['gold_label']):
    prem = prem_text.lower().split()
    hypo = hypo_text.lower().split()

    # NOTE(review): every label other than "entailment"/"neutral" is counted
    # under "contradiction" — confirm that is intended for unlabeled rows.
    bucket = pmi[gold] if gold in ("entailment", "neutral") else pmi['contradiction']
    for token in prem + hypo:
        bucket[token] = bucket.get(token, 0) + 1

In [None]:
# Corpus-wide token counts: total[token] = occurrences of the lower-cased
# token across all premises and hypotheses, regardless of label.
# (The per-row progress print was removed; it emitted one line per row.)
total = {}
for prem_text, hypo_text in zip(data['sentence1'], data['sentence2']):
    prem = prem_text.lower().split()
    hypo = hypo_text.lower().split()
    for token in prem + hypo:
        total[token] = total.get(token, 0) + 1

In [None]:
# Total number of token occurrences counted for each class.
entail_word = sum(pmi['entailment'].values())
con_word = sum(pmi['contradiction'].values())
neu_word = sum(pmi['neutral'].values())

In [None]:
# Report the per-class token totals.
print(f'entailment: {entail_word}')
print(f'contradiction: {con_word}')
print(f'neutral: {neu_word}')

In [None]:
def _positive_pmi(word, label, class_total):
    """Return max(PMI(word, label), 0) from the module-level count tables.

    Probabilities are relative frequencies over the number of counted token
    occurrences. The original code divided by ``float(all)``, which resolves
    to the builtin ``all`` and raises TypeError.
    """
    # NOTE(review): the denominator is presumably the total number of counted
    # tokens, i.e. the sum of the three class totals — confirm.
    corpus_total = float(entail_word + con_word + neu_word)
    prob_word_class = pmi[label][word] / corpus_total
    prob_word = total[word] / corpus_total
    prob_class = class_total / corpus_total
    return max(np.log(prob_word_class / (prob_word * prob_class)), 0.0)


def pmi_entail(word):
    """Positive PMI between ``word`` and the entailment class."""
    return _positive_pmi(word, 'entailment', entail_word)


def pmi_con(word):
    """Positive PMI between ``word`` and the contradiction class."""
    return _positive_pmi(word, 'contradiction', con_word)


def pmi_neu(word):
    """Positive PMI between ``word`` and the neutral class."""
    return _positive_pmi(word, 'neutral', neu_word)

In [None]:
# PMI with the entailment class for every token seen more than 500 times in
# that class, then the 40 highest-scoring tokens.
entailment_pmi = {
    word: pmi_entail(word)
    for word, freq in pmi["entailment"].items()
    if freq > 500
}

sort_pmi_entail = sorted(entailment_pmi.items(), key=lambda x: x[1], reverse=True)
sort_pmi_entail[:40]

In [None]:
# PMI with the contradiction class for every token seen more than 500 times
# in that class, then the 40 highest-scoring tokens.
contradiction_pmi = {}
for word in pmi["contradiction"]:
    if pmi['contradiction'][word] > 500:
        contradiction_pmi[word] = pmi_con(word)

sort_pmi_cons = sorted(contradiction_pmi.items(), key=lambda x: x[1], reverse=True)
sort_pmi_cons[:40]

In [None]:
# PMI with the neutral class for every token seen more than 500 times in
# that class, then the 40 highest-scoring tokens.
neutral_pmi = {
    word: pmi_neu(word)
    for word, freq in pmi["neutral"].items()
    if freq > 500
}

sort_pmi_neutral = sorted(neutral_pmi.items(), key=lambda x: x[1], reverse=True)
sort_pmi_neutral[:40]

In [None]:
# Take up to 40 of the top-50 neutral tokens, skipping tokens that start
# with a digit; `count_neu` records how many candidates were examined.
pattern = re.compile(r"\d+")
pmi_neutral_dict = {"neutral": []}
count_neu = 0
for candidate in sort_pmi_neutral[:50]:
    count_neu += 1
    if pattern.match(candidate[0]) is None:
        pmi_neutral_dict['neutral'].append(candidate[0])
    if len(pmi_neutral_dict['neutral']) == 40:
        break

In [None]:
# Inspect the selected neutral tokens.
pmi_neutral_dict['neutral']

In [None]:
# Tokens of the 40 highest-PMI entailment entries.
pmi_ent_dict = {"entailment": [entry[0] for entry in sort_pmi_entail[:40]]}

In [None]:
# Tokens of the 40 highest-PMI contradiction entries.
pmi_cons_dict = {"contradiction": [entry[0] for entry in sort_pmi_cons[:40]]}

In [None]:
# Put the three token lists side by side, one column per class.
# NOTE(review): the column assignments presumably need all three lists to
# have the same length; the neutral list can be shorter than 40 — confirm.
pmi_df = pd.DataFrame(pmi_neutral_dict)
pmi_df["entailment"] = pmi_ent_dict["entailment"]
pmi_df["contradiction"] = pmi_cons_dict["contradiction"]
pmi_df.head()

### Make bag of words

In [None]:
# `bow` holds one token list per class (neutral, entailment, contradiction);
# `bow1d` is the same tokens flattened in that order.
bow = [
    pmi_df[column].values.tolist()
    for column in ("neutral", "entailment", "contradiction")
]

bow1d = []
for nest in bow:
    bow1d.extend(nest)

In [None]:
# Add one 0/1 column per bag-of-words token: 1 when the token occurs in the
# lower-cased premise or hypothesis of the row, else 0.
for b in bow1d:
    def make_bow(pair_label, word=b):
        """Return 1 if ``word`` is a token of premise + hypothesis, else 0."""
        # `word=b` binds the current token when the function is defined.
        combine = pair_label[0] + " " + pair_label[1]
        tokens = combine.lower().split()
        return 1 if word in tokens else 0

    data[b] = data['pair_label'].apply(make_bow)

In [None]:
def get_overlap_score(pair_label):
    """Return the fraction of hypothesis tokens that also occur in the premise.

    Tokens are lower-cased whitespace-separated words; stand-alone ".", "?"
    and "!" tokens are ignored. Repeated hypothesis tokens are each counted.

    Args:
        pair_label: Sequence whose first two items are the premise and the
            hypothesis strings.

    Returns:
        A float in [0, 1]; ``0.0`` when the hypothesis has no tokens (the
        original raised ZeroDivisionError in that case).
    """
    premise = pair_label[0].strip()
    hypothesis = pair_label[1].strip()

    sentence_marks = (".", "?", "!")
    prem_vocab = {w.lower() for w in premise.split() if w not in sentence_marks}
    hyp_words = [w.lower() for w in hypothesis.split() if w not in sentence_marks]

    if not hyp_words:
        return 0.0

    shared = sum(1 for word in hyp_words if word in prem_vocab)
    return shared / len(hyp_words)

In [None]:
# Compute the overlap score for every row.
data['overlapping score'] = data['pair_label'].apply(get_overlap_score)

In [None]:
# Preview the final feature frame.
data.head()

In [None]:
# Persist the frame with all feature columns; with the default arguments the
# row index is written as the first CSV column.
data.to_csv("../data/nli/multinli_1.0_train_features_path.csv")