## Le kernel de ce projet est disponible [ici](https://www.kaggle.com/code/alihigo/nli-xlm-roberta).

## Le repo github est disponible [ici](https://github.com/aliheadou/NLI_KaggleCompetition).

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
#
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
        
# from tqdm.notebook import tqdm
from tqdm import tqdm
tqdm.pandas()

In [None]:
%ls /kaggle

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test_data  = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
# Notebook display: (rows, columns) of each frame.
train_data.shape, test_data.shape

In [None]:
train_data.head()

In [None]:
# Distinct values of the `language` column and how often each one occurs.
labels, frequencies = np.unique(train_data['language'].values, return_counts=True)

# Pie chart of those counts, each slice annotated with its percentage.
plt.figure(figsize=(10, 10))
plt.pie(x=frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
# Number of training rows per value of the `label` column.
labels = train_data['label'].value_counts()

plt.figure(figsize=(10, 5))
labels.plot(kind='bar')
plt.title("Number of items in each category")
plt.ylabel("Number (item)")
plt.show()

In [None]:
# Compare the length (len() of each entry) of hypotheses and premises in the
# training set by overlaying their histograms.
hypo_len = train_data['hypothesis'].map(len)
prem_len = train_data['premise'].map(len)

plt.figure(figsize=(15, 6))
plt.hist(hypo_len, bins=50, alpha=0.5, label="hypothesis length in train set")
plt.hist(prem_len, bins=50, alpha=0.5, label="premise length in train set")
plt.title("Length Comparison")
plt.legend(loc='best')
plt.show()

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
# TensorFlow
import tensorflow as tf

# SKLearn Library
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Truncate premise and hypothesis sequences at max_len.
# Used as the default word cap in troncate_sentence; generate_tokens and
# build_model default to 2*max_len for the joined sequence length.
max_len=64

# Settings for loading the data into TensorFlow datasets:
# AUTO is passed to prefetch(), batch_size to batch().
AUTO = tf.data.experimental.AUTOTUNE
batch_size = 16

In [None]:
def troncate_sentence(sent, seq_len=max_len):
    """Return ``sent`` cut down to its first ``seq_len`` whitespace-separated words.

    Splitting and re-joining also collapses every run of whitespace into a
    single space.

    Args:
        sent: Sentence to shorten. A value without a ``split`` method (for
            example a float standing in for a missing cell) is returned
            unchanged.
        seq_len: Maximum number of words to keep.

    Returns:
        The kept words joined by single spaces, or ``sent`` itself when it
        cannot be split.
    """
    try:
        words = sent.split()
    except AttributeError:
        # Best-effort pass-through for non-string values. Only this specific
        # failure is caught so that unrelated errors (and KeyboardInterrupt)
        # are no longer hidden behind a bare ``except``.
        return sent
    return " ".join(words[:seq_len])

def generate_tokens(df, tokenizer, seq_len=2*max_len):
    """Encode every entry of ``df`` and collect token ids and attention masks.

    Each entry is passed to ``tokenizer.encode_plus`` with truncation and
    ``'max_length'`` padding requested for ``seq_len``, and with special
    tokens added. A tqdm progress bar tracks the loop.

    Args:
        df: Object exposing ``.values.tolist()`` and ``len()``; the caller in
            this notebook passes a Series of texts.
        tokenizer: Tokenizer providing ``encode_plus``.
        seq_len: Value forwarded as ``max_length``.

    Returns:
        A tuple ``(input_ids, input_masks)`` of two lists holding, per entry,
        the ``'input_ids'`` and ``'attention_mask'`` fields of the encoding.
    """
    all_ids, all_masks = [], []
    texts = df.values.tolist()

    for text in tqdm(texts, total=len(df)):
        encoded = tokenizer.encode_plus(
            text,
            max_length=seq_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        all_ids.append(encoded['input_ids'])
        all_masks.append(encoded['attention_mask'])

    return all_ids, all_masks

def get_body_encoding(hypothesis, premises, tokenizer):
    """Encode hypothesis/premise pairs as single separator-joined sequences.

    Both Series are shortened element-wise with ``troncate_sentence``, joined
    as ``hypothesis + separator + premise`` and handed to ``generate_tokens``.

    Args:
        hypothesis: pandas Series of hypothesis texts.
        premises: pandas Series of premise texts, aligned with ``hypothesis``.
        tokenizer: Tokenizer forwarded to ``generate_tokens``; its
            ``sep_token`` is used as the separator when it is set.

    Returns:
        Whatever ``generate_tokens`` returns: ``(input_ids, input_masks)``.
    """
    # A literal "[SEP]" is BERT's separator. XLM-RoBERTa's separator is "</s>",
    # so "[SEP]" would be split into ordinary sub-word pieces instead of
    # marking the boundary. Ask the tokenizer for its own separator and keep
    # the old literal only as a fallback when none is defined.
    # NOTE(review): XLM-R's native pair layout is "<s> A </s></s> B </s>";
    # passing the two texts as a pair to the tokenizer would reproduce it
    # exactly, but that requires changing generate_tokens as well.
    separator = getattr(tokenizer, "sep_token", None) or "[SEP]"
    body = hypothesis.apply(troncate_sentence) + separator + premises.apply(troncate_sentence)
    return generate_tokens(body, tokenizer)

## On applique une des variantes du modèle pré-entraîné multilingue `XLMRoberta` 

In [None]:
# Transformer Model
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel

# Checkpoint identifier, used for both the tokenizer and the encoder
# (the trailing comment records an alternative checkpoint name).
transformer_model = 'joeddav/xlm-roberta-large-xnli' #'jplu/tf-xlm-roberta-large'

# Tokenizer loaded from that checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained(transformer_model)

In [None]:
# print(generate_tokens(pd.Series(["The rules developed in the interim"]), tokenizer=tokenizer))

In [None]:
# Pretrained transformer encoder, loaded from the same checkpoint name as the tokenizer
transformer_encoder = TFXLMRobertaModel.from_pretrained(transformer_model)

In [None]:
# Build model function
def build_model(transformer, final_len=2*max_len, learnin_rate=2e-6):
    """Build and compile a 3-class Keras classifier on top of ``transformer``.

    The model takes two int32 inputs named ``'input_ids'`` and
    ``'attention_mask'``, calls ``transformer`` on them, keeps position 0 of
    the first returned output, and applies Dropout(0.3), a 64-unit ReLU
    layer and a 3-unit softmax layer.

    Args:
        transformer: Callable accepting ``(input_ids, attention_mask=...)``
            whose first returned element is indexable as
            ``[:, position, :]``.
        final_len: Length of both input sequences.
        learnin_rate: Learning rate given to the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` model (sparse categorical cross-entropy
        loss on probabilities, sparse categorical accuracy metric).
    """
    layers = tf.keras.layers

    # Model inputs
    ids_in = layers.Input(shape=(final_len,), name='input_ids', dtype='int32')
    mask_in = layers.Input(shape=(final_len,), name='attention_mask', dtype='int32')

    # First transformer output, restricted to sequence position 0
    hidden_states = transformer(ids_in, attention_mask=mask_in)[0]
    features = hidden_states[:, 0, :]

    # Classification head
    features = layers.Dropout(0.3)(features)
    features = layers.Dense(64, activation='relu')(features)
    probabilities = layers.Dense(3, activation='softmax')(features)

    model = tf.keras.models.Model(inputs=[ids_in, mask_in], outputs=probabilities)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learnin_rate)
    # The head already ends in a softmax, hence from_logits=False.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])
    return model

In [None]:
# Hold out 10% of the labelled pairs for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    train_data[['premise', 'hypothesis']],
    train_data['label'].values,
    test_size=0.1,
    random_state=42,
)

# Training set: encode the pairs, then build a repeating, shuffled,
# batched and prefetched dataset.
train_input_ids, train_attn_masks = get_body_encoding(
    hypothesis=x_train['hypothesis'],
    premises=x_train['premise'],
    tokenizer=tokenizer,
)
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_input_ids, train_attn_masks, y_train))
    .repeat()
    .shuffle(2048)
    .batch(batch_size)
    .prefetch(AUTO)
)

# Validation set: same encoding, batched in its original order.
valid_input_ids, valid_attn_masks = get_body_encoding(
    hypothesis=x_val['hypothesis'],
    premises=x_val['premise'],
    tokenizer=tokenizer,
)
val_ds = (
    tf.data.Dataset.from_tensor_slices((valid_input_ids, valid_attn_masks, y_val))
    .batch(batch_size)
    .prefetch(AUTO)
)

In [None]:
def map_func(input_ids, masks, labels):
    """Regroup an ``(input_ids, masks, labels)`` triple as ``(features, labels)``.

    ``features`` is a dict with the keys ``'input_ids'`` and
    ``'attention_mask'``; ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# Apply map_func through Dataset.map so every element of both datasets
# becomes a ({'input_ids', 'attention_mask'} dict, labels) pair.
train_ds = train_ds.map(map_func)
val_ds = val_ds.map(map_func)

In [None]:
# Build the classifier around the pretrained encoder, then print its layer summary.
xlm_model = build_model(transformer_encoder)
xlm_model.summary()

In [None]:
tf.keras.utils.plot_model(xlm_model, show_shapes=True)

In [None]:
# Stop once val_loss has failed to improve for one epoch (patience=1) and
# restore the weights of the best epoch.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True
)

# train_ds repeats indefinitely, so fit() needs an explicit epoch length.
# It is sized from the training split (x_train) that actually feeds train_ds,
# not from the full train_data frame, which also contains the validation rows
# and made every "epoch" about 11% longer than one pass over the training data.
n_steps = len(x_train) // batch_size

# Train the model
hist_xlm = xlm_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    steps_per_epoch=n_steps,
    callbacks=[stop_early],
)

In [None]:
# Learning curves: accuracy first, then loss, each next to its validation counterpart.
pd.DataFrame(hist_xlm.history)[["accuracy", "val_accuracy"]].plot()
pd.DataFrame(hist_xlm.history)[["loss", "val_loss"]].plot()
plt.show()

In [None]:
# Evaluate on Validation Set
xlm_model.evaluate(val_ds)

In [None]:
# Predicted class = index of the largest model output for each validation example.
y_pred = xlm_model.predict(val_ds).argmax(axis=1)

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_val, y_pred)

# Annotated heatmap with horizontal tick labels
plt.figure(figsize=(10, 6))
c = sns.heatmap(cm, annot=True, fmt='g', cbar=False)
c.set(xlabel='Predicted', ylabel='Truth')
plt.xticks(rotation=0)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Classification report as a table, transposed so each report entry becomes a row.
pd.DataFrame(
    classification_report(y_val, y_pred, output_dict=True, zero_division=0)
).transpose()

# Submission

In [None]:
# del test_ds
def map_func_test(input_ids, masks):
    """Pack an unlabelled ``(input_ids, masks)`` pair into a feature dict.

    Returns a dict with the keys ``'input_ids'`` and ``'attention_mask'``.
    """
    return dict(input_ids=input_ids, attention_mask=masks)

In [None]:
# Encode the test pairs the same way as the training pairs.
test_input_ids, test_attn_masks = get_body_encoding(
    hypothesis=test_data['hypothesis'],
    premises=test_data['premise'],
    tokenizer=tokenizer,
)
# Unlabelled dataset: batch, prefetch, then regroup each element as a feature dict.
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_input_ids, test_attn_masks))
    .batch(batch_size)
    .prefetch(AUTO)
    .map(map_func_test)
)
#test_ds = (tf.data.Dataset.from_tensor_slices((test_encode)).batch(batch_size).prefetch(AUTO))

In [None]:
# Model outputs for the test set, then the index of the largest output per row.
submissions = xlm_model.predict(test_ds, verbose=0)
y_subm = submissions.argmax(axis=1)

    0 == entailment                        
    1 == neutral                                  
    2 == contradiction

In [None]:
# Load the sample submission and set its `prediction` column to the predicted classes.
sample_subm = pd.read_csv(
    '/kaggle/input/contradictory-my-dear-watson/sample_submission.csv'
).assign(prediction=y_subm)
sample_subm

In [None]:
sample_subm.to_csv("submission.csv", index=False)

# Le notebook ci-dessus est inspiré des kernels suivants :
    https://www.kaggle.com/code/antoinegoubert/xlmroberta-curriculum-learning-xnli-data-aug/notebook?scriptVersionId=106747526
    https://www.kaggle.com/code/francescoliveras/contradictory-nn-tpu-en-es/notebook?scriptVersionId=107715163