<a href="https://colab.research.google.com/github/a-t-em/nlp_logical_contradiction_classification/blob/main/nlp_logical_contradiction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import layers
import sklearn

In [3]:
# set up TPU
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU')
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
    print('Number of replicas:', strategy.num_replicas_in_sync)

Running on TPU


In [4]:
# Read the zipped CSV file into a DataFrame
df = pd.read_csv('train.csv.zip', compression='zip')

# Print the first 5 rows of the DataFrame
df.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [5]:
# Initialize the tokenizer and model
with strategy.scope():
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
X = df[['premise', 'hypothesis']].applymap(str)
X

Unnamed: 0,premise,hypothesis
0,and these comments were considered in formulat...,The rules developed in the interim were put to...
1,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...
2,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.
3,you know they can't really defend themselves l...,They can't defend themselves because of their ...
4,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร
...,...,...
12115,The results of even the most well designed epi...,All studies have the same amount of uncertaint...
12116,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...
12117,The important thing is to realize that it's wa...,"It cannot be moved, now or ever."
12118,At the west end is a detailed model of the who...,The model temple complex is at the east end.


In [7]:
y = df.label
y

0        0
1        2
2        0
3        0
4        1
        ..
12115    2
12116    0
12117    2
12118    2
12119    0
Name: label, Length: 12120, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the training and testing datasets
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (9696, 2) (9696,)
Testing dataset shape: (2424, 2) (2424,)


In [9]:
def get_encodings(df): 
    premises = df['premise'].to_list()
    hypotheses = df['hypothesis'].to_list()
    print(type(premises), type(hypotheses))
    encodings = tokenizer(premises, hypotheses, padding='max_length', truncation=False, max_length=259, return_tensors='tf')
    inputs = {key: tf.constant(val) for key, val in encodings.items()}
    return inputs

train_inputs = get_encodings(X_train)
val_inputs = get_encodings(X_test)

<class 'list'> <class 'list'>
<class 'list'> <class 'list'>


In [12]:
train_inputs, val_inputs

({'input_ids': <tf.Tensor: shape=(9696, 259), dtype=int32, numpy=
  array([[  101, 76295,   763, ...,     0,     0,     0],
         [  101,   183, 10237, ...,     0,     0,     0],
         [  101, 10798, 11084, ...,     0,     0,     0],
         ...,
         [  101,   140,   112, ...,     0,     0,     0],
         [  101, 10117, 96731, ...,     0,     0,     0],
         [  101, 74400, 10336, ...,     0,     0,     0]], dtype=int32)>,
  'token_type_ids': <tf.Tensor: shape=(9696, 259), dtype=int32, numpy=
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
  'attention_mask': <tf.Tensor: shape=(9696, 259), dtype=int32, numpy=
  array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 

In [11]:
# Prepare the input data
train_labels = tf.constant(y_train)
val_labels = tf.constant(y_test)
train_labels, val_labels

(<tf.Tensor: shape=(9696,), dtype=int64, numpy=array([2, 2, 1, ..., 0, 2, 2])>,
 <tf.Tensor: shape=(2424,), dtype=int64, numpy=array([1, 0, 2, ..., 1, 0, 0])>)

In [14]:
# Compile the model
with strategy.scope():
  optimizer = keras.optimizers.Adam(learning_rate=2e-5)
  loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  metric = keras.metrics.SparseCategoricalAccuracy('accuracy')
  model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
with strategy.scope():
    history = model.fit(train_inputs, train_labels, epochs=3, batch_size=32, validation_data=(val_inputs, val_labels))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
# val_loss, val_acc = model.evaluate(val_inputs, val_labels)
# print(f'Validation loss: {val_loss}, Validation accuracy: {val_acc}')

Validation loss: 0.9635542631149292, Validation accuracy: 0.6485148668289185


In [35]:
# Make test prediction
test = get_encodings(pd.DataFrame({'premise': 'This is a test', 'hypothesis': 'Testing'}, index=[1]))
with strategy.scope():
  test_predictions = model.predict(test)

test_predictions[0].argmax(axis=-1)

<class 'list'> <class 'list'>


array([0])

In [19]:
df_test = pd.read_csv('test.csv.zip', compression='zip')
df_test.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی...",ur,Urdu
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...,ar,Arabic
2,e98005252c,et cela est en grande partie dû au fait que le...,Les mères se droguent.,fr,French
3,58518c10ba,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。,zh,Chinese
4,c32b0d16df,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась.",ru,Russian


In [20]:
test_inputs = get_encodings(df_test)
test_inputs

<class 'list'> <class 'list'>


{'input_ids': <tf.Tensor: shape=(5195, 259), dtype=int32, numpy=
 array([[  101,   764, 28744, ...,     0,     0,     0],
        [  101, 13498, 11917, ...,     0,     0,     0],
        [  101, 10131, 24552, ...,     0,     0,     0],
        ...,
        [  101,  3239,  5755, ...,     0,     0,     0],
        [  101, 98370,   112, ...,     0,     0,     0],
        [  101, 10167, 15078, ...,     0,     0,     0]], dtype=int32)>,
 'token_type_ids': <tf.Tensor: shape=(5195, 259), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
 'attention_mask': <tf.Tensor: shape=(5195, 259), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, .

In [21]:
with strategy.scope():
  y_preds = model.predict(test_inputs)
  y_preds



In [23]:
y_preds[0].argmax(axis=-1)

array([2, 1, 0, ..., 1, 0, 2])

In [26]:
len(y_preds[0]), len(df_test)

(5195, 5195)

In [27]:
df_sub = pd.read_csv('sample_submission.csv')
df_sub.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,1
3,58518c10ba,1
4,c32b0d16df,1


In [28]:
df_sub['prediction'] = y_preds[0].argmax(axis=-1)

In [29]:
df_sub.describe()

Unnamed: 0,prediction
count,5195.0
mean,1.082964
std,0.812546
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0


In [30]:
df_sub.to_csv('submission.csv', index=False)