In [3]:
from pathlib import Path

while Path.cwd().name != 'ambient':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambient


In [4]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from utils.utils import predict_nli
from collections import Counter
from tqdm import tqdm
import os
from utils.constants import label2id, NLI_LABELS

## imppres

In [59]:
# Read every .jsonl file under data/imppres/raw/implicature into its own frame.
data_dir = Path('data/imppres')
implicature_dir = data_dir / 'raw/implicature'
dfs = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated output reproducible across machines and filesystems.
for filename in sorted(os.listdir(implicature_dir)):
    if filename.endswith('.jsonl'):
        dfs.append(pd.read_json(implicature_dir / filename, lines=True))

In [60]:
# Stack the per-file frames, keep only the rows where the two gold labels
# ('gold_label_log' vs 'gold_label_prag') differ, then switch to the
# premise/hypothesis column names and drop the unused metadata columns.
implicature = (
    pd.concat(dfs)
    .loc[lambda frame: frame['gold_label_log'] != frame['gold_label_prag']]
    .rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis'})
    .drop(['item_type', 'lexemes', 'spec_relation'], axis=1)
)

In [62]:
# Persist the filtered examples as JSON Lines (one record per line).
implicature.to_json(
    data_dir / 'examples.jsonl',
    orient='records',
    lines=True,
)

## chaos NLI

In [113]:
# Re-point data_dir at the chaosNLI folder and load the raw JSON Lines file.
data_dir = Path('data/chaosNLI')
chaos_nli = pd.read_json(
    data_dir / 'raw/chaosNLI_mnli_m.jsonl',
    lines=True,
)

In [119]:
# Flatten each record: lift premise/hypothesis out of the nested 'example'
# field and carry over the label counts and entropy unchanged.
examples = [
    {
        'premise': row['example']['premise'],
        'hypothesis': row['example']['hypothesis'],
        'label_counter': row['label_counter'],
        'entropy': row['entropy'],
    }
    for _, row in chaos_nli.iterrows()
]

In [126]:
# Write the flattened records as JSON Lines next to the raw data.
pd.DataFrame(examples).to_json(
    data_dir / 'mnli_m.jsonl',
    orient='records',
    lines=True,
)

## uncertain NLI

In [9]:
# Integer codes in the raw 'nli' column -> label names.
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
splits = ['train', 'dev', 'test']
# Convert each raw CSV split to JSON Lines with the renamed columns.
for split in splits:
    df = (
        pd.read_csv(f'data/unli/raw/{split}.csv')
        .replace({'nli': label_map})
        .rename(columns={'pre': 'premise', 'hyp': 'hypothesis', 'nli': 'nli_label', 'unli': 'gold'})
    )
    df.to_json(f'data/unli/{split}.jsonl', orient='records', lines=True)

In [22]:
# Surrogate function T → [0, 1] that maps SNLI categorical labels {ENT, NEU, CON}
# to the average score of all u-SNLI training annotations labeled with t in SNLI:
#
#   s: {ENT → 0.9272; NEU → 0.4250; CON → 0.0209}
surrogate_function = {'entailment': 0.9272, 'neutral': 0.4250, 'contradiction': 0.0209}

snli_train = pd.read_json('data/snli/train.jsonl', lines=True)
# Drop the genre column and swap each categorical gold label for its score;
# rows whose gold is '-' are not in the mapping and pass through untouched.
snli_train = snli_train.drop(columns='genre').replace({'gold': surrogate_function})
print(f"Dropping {len(snli_train[snli_train['gold'] == '-'])} rows")
snli_train = snli_train[snli_train['gold'] != '-']
snli_train.to_json(
    'data/unli/snli_regression.jsonl',
    orient='records',
    lines=True,
)

Dropping 785 rows


## AmbiNLI

In [77]:
# create SMNLI dataset for pretraining

def _one_hot(label):
    """Return a list of len(NLI_LABELS) entries: 1.0 at label2id[label], 0 elsewhere."""
    dist_label = [0] * len(NLI_LABELS)
    dist_label[label2id[label]] = 1.0
    return dist_label


dfs = []
for dataset in ['mnli', 'snli']:
    df = pd.read_json(f'data/{dataset}/train.jsonl', lines=True)
    # Rows with gold == '-' have no usable label and are discarded.
    df = df[df['gold'] != '-']
    # Build every distribution in one pass and attach the column in one go.
    # This avoids row-by-row `iterrows` + `df.at` writes, which are slow on the
    # full training sets, and avoids assigning into a filtered slice, which
    # can trigger pandas' SettingWithCopyWarning.
    dist_gold = [_one_hot(label) for label in df['gold']]
    # Replace the categorical 'gold' column with the distribution; as before,
    # the new 'gold' column ends up last.
    df = df.drop('gold', axis=1).assign(gold=dist_gold)
    dfs.append(df)

smnli = pd.concat(dfs)
smnli.to_json('data/ambinli/smnli.jsonl', orient='records', lines=True)

In [65]:
# create AmbiNLI

dfs = []
for dataset in ['snli', 'mnli', 'unli']:
    df = pd.read_json(f'data/ambinli/raw/ambi-{dataset}.jsonl', lines=True)
    # The unli file uses different column names from the snli/mnli files.
    if dataset == 'unli':
        df = df[['pre', 'hyp', 'id', 'label']]
    else:
        df = df[['sentence1', 'sentence2', 'index', 'label']]
    # One rename map covers both schemas; keys absent from a frame are ignored.
    df = df.rename(
        columns={
            'sentence1': 'premise',
            'sentence2': 'hypothesis',
            'pre': 'premise',
            'hyp': 'hypothesis',
            'label': 'gold',
            'index': 'id',
        }
    )
    # label order in original data: [entailment, neutral, contradiction];
    # reorder each distribution to [contradiction, entailment, neutral].
    df['gold'] = [[dist[2], dist[0], dist[1]] for dist in df['gold']]
    df.to_json(f'data/ambinli/ambi-{dataset}.jsonl', orient='records', lines=True)
    dfs.append(df)

In [66]:
# Combine the per-dataset frames collected in `dfs` and write them as a single file.
ambinli = pd.concat(dfs)
ambinli.to_json(
    'data/ambinli/ambinli.jsonl',
    orient='records',
    lines=True,
)

## distilled SMNLI

In [16]:
# Load the relabeled examples (JSON Lines).
df = pd.read_json(
    'data/distilled-smnli/relabel_s0.jsonl',
    lines=True,
)

In [17]:
# Use the row index as the example id and switch to premise/hypothesis names.
df = (
    df.assign(id=df.index)
    .rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis'})
)
# Permute every 'label_dist' entry from positions (0, 1, 2) to (2, 0, 1).
df['gold'] = [[probs[2], probs[0], probs[1]] for probs in df['label_dist']]

In [19]:
# Keep only the four output columns and write them as JSON Lines.
df[['premise', 'hypothesis', 'gold', 'id']].to_json(
    'data/distilled-smnli/relabeled0.jsonl',
    orient='records',
    lines=True,
)

## multi-label data from Jiang & de Marneffe

In [37]:
# Which split of the taxonomy data to convert; also used for the output path.
split = 'dev'
df = pd.read_json(
    f'data/taxonomy/raw/{split}.jsonl',
    lines=True,
)

In [38]:
# First letter of each NLI label -> full label name.
letter_to_label = {name[0]: name for name in NLI_LABELS}
# Expand every distinct gold_label value, letter by letter, into a
# comma-separated list of full label names.
label_map = {}
for code in df.gold_label.value_counts().keys():
    label_map[code] = ', '.join(letter_to_label[letter] for letter in code)

In [39]:
# Expand the gold labels via label_map, rename to the id/gold column names,
# and drop the two extra identifier columns.
df = (
    df.replace({'gold_label': label_map})
    .rename(columns={'index': 'id', 'gold_label': 'gold'})
    .drop(columns=['u_index', 'uid'])
)

In [40]:
# Write the converted split as JSON Lines.
df.to_json(
    f'data/taxonomy/{split}.jsonl',
    orient='records',
    lines=True,
)