In [1]:
pip install tensorflow

Collecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.3.3
    Uninstalling keras-3.3.3:
      Successfully uninstalled keras-3.3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.[0m[31m
[0mSuccessfully installed keras-2.15.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
print(tf.__version__)

2024-06-23 07:50:08.567955: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 07:50:08.568049: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 07:50:08.673140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2.15.0


In [3]:
pip install num2words

  pid, fd = os.forkpty()


Collecting num2words
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Downloading num2words-0.5.13-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: num2words
Successfully installed num2words-0.5.13
Note: you may need to restart the kernel to use updated packages.


In [16]:
# basics
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# text_processing
from num2words import num2words
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# data preparation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# TensorFlow and Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dropout, BatchNormalization, Attention, concatenate, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.regularizers import l2

# other
from tqdm import tqdm
import gc

# warnings
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load the dataset
TRAIN_DATA_PATH = '/kaggle/input/quoradata/train.csv'
df = pd.read_csv(TRAIN_DATA_PATH)


In [7]:
# Fill missing values
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')

In [8]:
# Preprocessing functions
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

# Clean text
df['question1'] = df['question1'].apply(clean_text)
df['question2'] = df['question2'].apply(clean_text)

In [10]:
# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([df['question1'], df['question2']]))

def tokenize_and_pad(text_series, max_len):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=max_len, padding='post')

In [11]:
max_len = 50

X_q1 = tokenize_and_pad(df['question1'], max_len)
X_q2 = tokenize_and_pad(df['question2'], max_len)

In [12]:
# GloVe embeddings
embedding_index = {}
GLOVE_PATH = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
with open(GLOVE_PATH, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [13]:
word_index = tokenizer.word_index
embedding_dim = 200
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
# Model definition
def create_lstm_model(embedding_matrix, max_len):
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                trainable=True)

    question1_input = Input(shape=(max_len,), name='question1_input')
    question2_input = Input(shape=(max_len,), name='question2_input')

    embedded_question1 = embedding_layer(question1_input)
    embedded_question2 = embedding_layer(question2_input)

    lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))
    lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))

    lstm_question1 = lstm_layer1(embedded_question1)
    lstm_question2 = lstm_layer1(embedded_question2)

    dropout1 = Dropout(0.2)(lstm_question1)
    dropout2 = Dropout(0.2)(lstm_question2)

    bn1 = BatchNormalization()(dropout1)
    bn2 = BatchNormalization()(dropout2)

    lstm_question1 = lstm_layer2(bn1)
    lstm_question2 = lstm_layer2(bn2)

    attention1 = Attention()([lstm_question1, lstm_question2])
    attention2 = Attention()([lstm_question2, lstm_question1])

    merged_output = concatenate([attention1, attention2])
    flatten_output = Flatten()(merged_output)
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(flatten_output)

    model = Model(inputs=[question1_input, question2_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [18]:
model = create_lstm_model(embedding_matrix, max_len)

# Split data
X = np.concatenate([X_q1, X_q2], axis=1)
y = df['is_duplicate']

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape inputs for the model
X_train_q1 = X_train[:, :max_len]
X_train_q2 = X_train[:, max_len:]
X_val_q1 = X_val[:, :max_len]
X_val_q2 = X_val[:, max_len:]


In [29]:
# Train the model
model.fit([X_train_q1, X_train_q2], y_train, epochs=30, batch_size=512, validation_data=([X_val_q1, X_val_q2], y_val))


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x78c8f8e612d0>

In [30]:
# Predict on validation set
y_pred_val = (model.predict([X_val_q1, X_val_q2]) > 0.5).astype("int32")



In [31]:
# Print performance metrics
accuracy = accuracy_score(y_val, y_pred_val)
report = classification_report(y_val, y_pred_val)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.824828712063123
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85     50803
           1       0.73      0.83      0.78     30055

    accuracy                           0.82     80858
   macro avg       0.81      0.83      0.82     80858
weighted avg       0.83      0.82      0.83     80858

