# SNLI

Creating `text_data.csv` and `annotations.csv` for SNLI dataset.

In [11]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../../')

import os
import json
import math
import sys
from collections import defaultdict

from typing import List, Optional, Union, Dict

import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase

from multitask_nlp.settings import DATASETS_DIR

dataset_path = DATASETS_DIR / 'snli'


color_pallette = 'muted'
sns.set_theme(style='whitegrid', font_scale=2, palette=color_pallette, 
              rc={'font.family': 'serif', 'font.serif': 'Times New Roman', 'mathtext.fontset': 'dejavuserif'})

DEFAULT_RANDOM = 42

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df_list = []

for split in ['train', 'dev', 'test']:
    df = pd.read_json(dataset_path / f'snli_1.0_{split}.jsonl', lines=True)
    df = df.drop(columns=['annotator_labels', 'captionID', 'pairID', 'sentence1_binary_parse', 'sentence1_parse',
                           'sentence2_binary_parse', 'sentence2_parse'])
    df['text_id'] = f'{split}_' + df.index.astype(str) 
    df = df.rename(columns={'gold_label': 'label'})
    df = df[~(df['label'] == '-')]
    df['split'] = split
    df_list.append(df)

In [3]:
df = pd.concat(df_list, ignore_index=False)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569033 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   label      569033 non-null  object
 1   sentence1  569033 non-null  object
 2   sentence2  569033 non-null  object
 3   text_id    569033 non-null  object
 4   split      569033 non-null  object
dtypes: object(5)
memory usage: 26.0+ MB


In [28]:
df = df[~(df.sentence2.isin(['N/A','n/a']))]

In [29]:
texts_df = df[['text_id', 'sentence1', 'sentence2', 'split']]
annotations_df = df[['text_id', 'label']]
annotations_df['annotator_id'] = 0

texts_df.to_csv(dataset_path / f'text_data.csv', index=False)
annotations_df.to_csv(dataset_path / f'annotations.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_df['annotator_id'] = 0
