In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install deep_translator
!pip install transformers

import numpy as np
import pandas as pd

import seaborn as sns
from deep_translator import GoogleTranslator
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split

In [None]:
# Set the WANDB_API_KEY environment variable to the placeholder string "0".
# NOTE(review): presumably this keeps Weights & Biases from prompting for a
# login during training — confirm.
os.environ["WANDB_API_KEY"] = "0"

In [None]:
# Choose a distribution strategy: use the TPU when one can be resolved and
# initialised, otherwise fall back to TensorFlow's default strategy
# (CPU / single GPU).
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError during TPU setup is treated as "no TPU available".
    tpu = None
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU

# Later cells open `tpu_strategy.scope()`. Point that name at whichever
# strategy was selected so the notebook also runs without a TPU, and so the
# TPU system is not initialised a second time.
tpu_strategy = strategy
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Load the competition's training and test CSV files from the input directory.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

In [None]:
# Display (rows, columns) of the training frame.
train.shape

In [None]:
# Materialise the row position as an 'index' column (trans_to_eng reads
# row['index']). The guard keeps the cell idempotent: calling reset_index()
# again on a frame that already has an 'index' column would add a 'level_0'
# column, and a further call would raise.
if 'index' not in train.columns:
    train = train.reset_index()
if 'index' not in test.columns:
    test = test.reset_index()
train.columns


In [None]:
# Number of training rows per language code.
train['lang_abv'].value_counts()

# Translate To English using Google Translator

In [None]:
def trans_to_eng(row, log_every=500):
    """Return the English (premise, hypothesis) pair for one data row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'premise', 'hypothesis' and 'lang_abv'. An 'index'
        entry is optional and is only used for progress output.
    log_every : int or None, default 500
        Print the row's 'index' whenever it is a multiple of this value.
        Pass 1 to print every row, or None / 0 to print nothing.

    Returns
    -------
    tuple
        ``(premise_en, hypothesis_en)``. Rows whose 'lang_abv' is already
        'en' are returned unchanged without calling the translator.
    """
    premise = row['premise']
    hypothesis = row['hypothesis']
    source = row['lang_abv']
    target = 'en'

    if source != target:
        # deep_translator lists Chinese as 'zh-CN' / 'zh-TW'; map the bare
        # 'zh' code used in the data to simplified Chinese.
        source = {'zh': 'zh-CN'}.get(source, source)
        # One translator per row is enough: both texts share the language pair.
        translator = GoogleTranslator(source=source, target=target)
        premise = translator.translate(premise)
        hypothesis = translator.translate(hypothesis)

    # Sparse progress output instead of one printed line per row.
    row_number = row.get('index')
    if log_every and row_number is not None and row_number % log_every == 0:
        print(row_number)

    return premise, hypothesis



In [None]:
# Translate every training row, then unpack the per-row
# (premise, hypothesis) tuples into two new English columns.
translated_train = train.apply(trans_to_eng, axis=1)
train['premise_en'], train['hypothesis_en'] = zip(*translated_train)

In [None]:
# Translate every test row, then unpack the per-row
# (premise, hypothesis) tuples into two new English columns.
translated_test = test.apply(trans_to_eng, axis=1)
test['premise_en'], test['hypothesis_en'] = zip(*translated_test)

# Sample Data

In [None]:
# Show the translated premise/hypothesis and the label of the row labelled 4
# as a quick sanity check.
print(f"premise: {train.loc[4, 'premise_en']}")
print(f"hypothesis: {train.loc[4, 'hypothesis_en']}")
print(f"label: {train.loc[4, 'label']}")

# Configuration for the model

In [None]:
# Hyperparameters shared by the tokenization, dataset and training cells.
max_length = 100  # Maximum token length per premise/hypothesis pair; also the model's input width.
batch_size = 16  # Examples per batch when batching the tf.data datasets.
epochs = 10  # Number of passes over the training dataset in model.fit.

# Tokenization of the input in the shape that BERT requires

In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Tokenize the translated premise/hypothesis pairs of the training set.
# padding='max_length' (rather than padding=True, which pads only to the
# longest sequence present) guarantees every row is exactly `max_length`
# tokens wide, matching the fixed (max_length,) Keras inputs of the model.
encoding_train = tokenizer(text=list(train.premise_en.values),
                    text_pair=list(train.hypothesis_en.values),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_token_type_ids=True,
                    return_tensors='tf'
                    )

In [None]:

# Hold out the first 33% of the rows for validation; train on the rest.
valid_size = int(len(train)*0.33)

encoded_train_data = tf.data.Dataset.from_tensor_slices((encoding_train.data, train.label.values))
validation_dataset = encoded_train_data.take(valid_size).batch(batch_size)

# Shuffle the training portion (after the split, so no validation row can
# leak into it) so that batches are not presented in the same order every
# epoch. The buffer spans the whole training portion for a full shuffle.
train_size = max(1, len(train) - valid_size)
train_dataset = (
    encoded_train_data
    .skip(valid_size)
    .shuffle(buffer_size=train_size, seed=42)
    .batch(batch_size)
)

# Initialise the Model

In [None]:
# Name of the pretrained checkpoint to load.
MODEL_NAME = 'bert-base-uncased'
# Load the pretrained TF transformer inside the distribution strategy's scope;
# `transformer` is used by base_bert_model().
with tpu_strategy.scope():
    transformer = transformers.TFAutoModel.from_pretrained(MODEL_NAME)

In [None]:
def base_bert_model():
    """Build the 3-class classifier on top of the module-level `transformer`.

    Three int32 inputs of width `max_length` ('input_ids', 'attention_mask',
    'token_type_ids') are fed to `transformer`. The vector at position 0 of
    its first output is passed through Dense(300, relu) -> Dense(100, relu)
    -> Dense(3, softmax).

    Returns
    -------
    tf.keras.Model
        Uncompiled model whose output is a softmax over 3 classes.
    """

    def _token_input(name):
        # Every model input has the same shape and dtype; only the name differs.
        return tf.keras.Input(shape=(max_length,), name=name, dtype='int32')

    model_inputs = (
        _token_input('input_ids'),
        _token_input('attention_mask'),
        _token_input('token_type_ids'),
    )

    # First element of the transformer output.
    # NOTE(review): presumably the per-token hidden states — confirm.
    sequence_output = transformer(model_inputs)[0]
    # Keep only the vector at the first token position.
    first_token_vector = sequence_output[:, 0, :]

    hidden_300 = tf.keras.layers.Dense(300, activation='relu')(first_token_vector)
    hidden_100 = tf.keras.layers.Dense(100, activation='relu')(hidden_300)
    class_probs = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(hidden_100)

    return tf.keras.Model(inputs=model_inputs, outputs=class_probs)

In [None]:
# Build and compile the classifier inside the distribution strategy's scope.
with tpu_strategy.scope():
    model = base_bert_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # The model's output layer already applies softmax, so the loss must treat
    # predictions as probabilities (from_logits=False). This object is now the
    # one actually passed to compile(), instead of an unused from_logits=True
    # instance sitting next to a string loss.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    model.summary()

# Run Model 

In [None]:
# Multiply the learning rate by 0.3 (down to a floor of 5e-7) whenever the
# validation loss has not improved for one epoch.
loss_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience= 1,
    min_lr=5e-7
)

# `train_dataset` is an already-batched tf.data.Dataset, so `batch_size` must
# not be passed to fit(); the batch size is fixed where the dataset is built.
history = model.fit(
    train_dataset,
    epochs = epochs,
    verbose = 2,
    callbacks=[loss_reduction],
    validation_data=validation_dataset
)

# Prepare Submission

In [None]:
# Tokenize the translated premise/hypothesis pairs of the test set.
# padding='max_length' (rather than padding=True, which pads only to the
# longest sequence present) guarantees every row is exactly `max_length`
# tokens wide, matching the fixed (max_length,) Keras inputs of the model.
encoding_test = tokenizer(text=list(test.premise_en.values),
                    text_pair=list(test.hypothesis_en.values),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_token_type_ids=True,
                    return_tensors='tf'
                    )

In [None]:
# Run the trained model over the tokenized test pairs in batches of 128.
# The model's final layer is Dense(3, softmax), so each prediction row holds
# three class scores.
pred_submission = model.predict(encoding_test.data, batch_size=128, verbose=1)

In [None]:
# Pick, for each test row, the position of the highest class score.
test_pred_labels = pred_submission.argmax(axis=1)

In [None]:
# Build the submission frame. Each prediction is paired with the test row's
# `id` so rows can be matched by the scorer.
# NOTE(review): assumes the expected format is `id,prediction` — confirm
# against the competition's sample_submission.csv.
submission_columns = {}
if 'id' in test.columns:
    submission_columns['id'] = test['id'].values
submission_columns['prediction'] = test_pred_labels
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv("submission.csv", index = False)