In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install hugginface
!pip install -U sentence-transformers
!pip install seaborn
!pip install -q -U keras-tuner

In [None]:
import tensorflow as tf

# Pick the distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU could be resolved (signalled here by ValueError).
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on every path, not only on the fallback path.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)


In [None]:
# Keep only the sentence pair, its language code and the target label.
kept_columns = ['premise', 'hypothesis', 'lang_abv', 'label']
train = train[kept_columns]
#test = test[['premise', 'hypothesis', 'lang_abv']]
train

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sample counts per language, broken down by label within each language.
fig, ax = plt.subplots(figsize=(12, 5))
graph1 = sns.countplot(data=train, x="lang_abv", hue="label", ax=ax)
graph1.set_title('Distribution of Languages and Labels')

fig.tight_layout()
plt.show()

In [None]:
#train = train[train["lang_abv"]=="en"]

In [None]:
# Disabled code kept for reference only: the whole cell is a single string
# literal, so none of the statements inside it are executed. According to its
# own header comments, it is the label-balancing script used for Experiments 1
# and 2 (it drops rows of labels 0 and 2 until every label has as many rows as
# label 1).
'''
#Utilizzato per Experiment 1 e 2
#Script per bilanciare il dataset
print("------------------------------------")
print("Distribuzione delle label prima del bilanciamento")
print(train.value_counts(train['label']))
print("------------------------------------")
minLab= train.value_counts(train['label'])[1]
normal = train.value_counts(train['label'])[0]- minLab
contradictory = train.value_counts(train['label'])[2] - minLab
print("Minima quantita di lable: ", minLab, normal, contradictory)
#Normalizzazione delle label al valore minimo 2166
for index, row in train.iterrows():
    if (row['label']==0 and normal > 0):
        train = train.drop(index, axis='index')
        normal -= 1
    elif(row['label']==2 and contradictory>0):
        train = train.drop(index, axis='index')
        contradictory -= 1
        
print("------------------------------------")  
print("Distribuzione delle label dopo il bilanciamento")
print(train.value_counts(train['label']))
print("------------------------------------")
'''

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled data as a local test set; validation data is
# carved out of the remaining training portion later, at fit time.
# NOTE(review): the original note here said training is restricted to English
# because the other languages have few samples, but the English-only filter in
# the visible notebook is commented out, so all languages are currently used.
X = train[['premise', 'hypothesis', 'lang_abv']]
Y = train[['label']]

# A fixed seed makes the split (and every metric computed on it) reproducible
# across "Restart & Run All"; stratifying keeps the label proportions the same
# in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42, stratify=Y['label']
)

In [None]:
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. Any token that was
# ever committed in this cell must be treated as leaked and revoked in the
# Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable (e.g. populated from
# a notebook secret); when it is unset, login() is called without a token and
# falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoints tried in each experiment:
#   Experiment 1: facebook/bart-large
#   Experiment 2: "facebook/bart-large-mnli"
#   Experiment 3: "joeddav/xlm-roberta-large-xnli"  (currently active)
checkpoint = "joeddav/xlm-roberta-large-xnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#model = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")

In [None]:
#tokenizer.tokenize("test tokenizer test test ")

In [None]:
# Fixed token length that every premise/hypothesis pair is padded or truncated to.
SEQ_LEN = 236  #max(train.astype('str').applymap(lambda x: len(x)).max())


def bert_encode(df, tokenizer, max_length=None):
    """Tokenize the premise/hypothesis pairs of ``df`` into model inputs.

    Args:
        df: Table-like object whose ``'premise'`` and ``'hypothesis'`` columns
            expose ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(premises, hypotheses, **options)``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length each pair is padded/truncated to. Defaults to the
            module-level ``SEQ_LEN`` (looked up at call time). The model built
            in this notebook sizes its inputs with ``SEQ_LEN``, so only
            override this for inputs that are not fed to that model.

    Returns:
        dict with exactly two entries, ``'input_ids'`` and ``'attention_mask'``,
        taken from the tokenizer output (TensorFlow tensors, as requested via
        ``return_tensors='tf'``).
    """
    if max_length is None:
        max_length = SEQ_LEN

    premises = df['premise'].tolist()
    hypotheses = df['hypothesis'].tolist()

    tokens = tokenizer(
        premises,
        hypotheses,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    # Forward only the two tensors the model consumes; drop anything else the
    # tokenizer may have returned.
    return {
        'input_ids': tokens['input_ids'],
        'attention_mask': tokens['attention_mask'],
    }

In [None]:
# Encode the training split first, then the held-out split, with the same tokenizer.
train_input, test_input = (
    bert_encode(split, tokenizer) for split in (x_train, x_test)
)
train_input

In [None]:
# from tensorflow.keras import regularizers
from transformers import TFAutoModel
import tensorflow as tf
#"facebook/bart-large"
#"facebook/bart-large-mnli"
#"joeddav/xlm-roberta-large-xnli"
def build_model(model_name="joeddav/xlm-roberta-large-xnli", learning_rate=1e-5, num_labels=3):
    """Build and compile a classifier head on top of a frozen transformer.

    The transformer's first output (per-token hidden states) is mean-pooled
    over the sequence axis and passed through two ReLU dense layers
    (512 and 256 units) and a softmax output layer.

    Args:
        model_name: Checkpoint passed to ``TFAutoModel.from_pretrained``.
        learning_rate: Learning rate of the Adam optimizer.
        num_labels: Number of output classes of the softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``input_ids`` and
        ``attention_mask`` (each of length ``SEQ_LEN``) and trained with sparse
        categorical cross-entropy, reporting accuracy.
    """
    transformer = TFAutoModel.from_pretrained(model_name)
    # Freeze the pretrained weights: only the dense head below is trained.
    transformer.trainable = False

    input_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="attention_mask")
    inputs = [input_ids, attention_mask]

    embedding = transformer(inputs)
    token_states = embedding[0]

    # Inputs are padded to SEQ_LEN, so an unmasked average would mix padding
    # positions into the sentence vector. Passing the attention mask makes the
    # pooling layer average over real tokens only.
    pad_mask = tf.cast(attention_mask, tf.bool)
    x = tf.keras.layers.GlobalAveragePooling1D()(token_states, mask=pad_mask)

    x = tf.keras.layers.Dense(units=512, activation=tf.nn.relu)(x)
    x = tf.keras.layers.Dense(units=256, activation=tf.nn.relu)(x)
    output = tf.keras.layers.Dense(num_labels, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
    return model

In [None]:
# Create (and compile) the model inside the distribution strategy's scope,
# then print its layer summary.
with strategy.scope():
    model = build_model()

model.summary()

In [None]:
# Train for 20 epochs in batches of 64, holding out 20% of the training
# samples for validation; the returned history feeds the learning curves.
fit_settings = {"epochs": 20, "batch_size": 64, "validation_split": 0.2}
history = model.fit(train_input, y_train, **fit_settings)

In [None]:
# Score the trained model on the held-out split produced by train_test_split.
eval_result = model.evaluate(x=test_input, y=y_test)
print("[test loss, test accuracy]:", eval_result)

In [None]:
# Persist the trained model under the Kaggle working directory.
save_path = "/kaggle/working/" + 'contradictory_classifier_roBERTa'
model.save(save_path)

In [None]:
# Per-epoch curves recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Take the x-axis from the recorded history instead of hard-coding the epoch
# count: plotting fails with a length mismatch if training ran for a different
# number of epochs (changed `epochs`, early stopping, interrupted run).
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))

# Left panel: accuracy.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')


plt.show()

In [None]:
def get_prediction(y_pred):
    """Convert per-class scores into predicted class indices.

    Args:
        y_pred: 2-D array-like of shape (n_samples, n_classes) holding one
            score per class (e.g. softmax probabilities). Anything
            ``np.asarray`` accepts is allowed.

    Returns:
        pandas DataFrame with a single ``'label'`` column containing, for each
        row, the index of its highest score (the first such index on ties).
        Values are stored as floats, as before, so existing callers see the
        same dtype.
    """
    scores = np.asarray(y_pred)
    # Vectorized argmax replaces the per-row Python loop; like the loop, it
    # returns the first index of the maximum.
    labels = np.argmax(scores, axis=1).astype(float)
    return pd.DataFrame({'label': labels})


In [None]:
from sklearn.metrics import confusion_matrix

# Class scores for the held-out split, then the predicted class per sample.
y_prob = model.predict(test_input)
# Cast to int so predictions share the integer label space passed as `labels`.
y_pred = get_prediction(y_prob).astype(int)

# Fix the label set explicitly so the matrix is always 3x3, even if a class is
# missing from the held-out labels or from the predictions; otherwise building
# the 3x3 DataFrame below would fail.
cm = confusion_matrix(y_test['label'], y_pred['label'], labels=[0, 1, 2])
cm_df = pd.DataFrame(cm,
                     index = ['0','1','2'],
                     columns = ['0','1','2'])

#Plotting the confusion matrix
plt.figure(figsize=(5,4))
# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()