In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for input_path in (os.path.join(folder, name) for name in names):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import warnings
warnings.filterwarnings('ignore')

# Fix the PyTorch and NumPy RNG seeds so repeated runs behave the same
torch.manual_seed(42)
np.random.seed(42)

print("Assignment 5: Quora Duplicate Question Detection (Optimized)")
print("=" * 60)

# Step 1: Data Loading and Exploration
print("\n1. Loading and Exploring Data")
print("-" * 30)

# Unpack both competition archives into the writable working directory
for archive_path in (
    '/kaggle/input/quora-question-pairs/train.csv.zip',
    '/kaggle/input/quora-question-pairs/test.csv.zip',
):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall('/kaggle/working/')

# Read the extracted CSV files into DataFrames
train_df = pd.read_csv('/kaggle/working/train.csv')
test_df = pd.read_csv('/kaggle/working/test.csv')

# Basic shape / schema overview
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print(f"\nTrain columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

# Label balance: counts per class and the share of duplicate pairs
duplicate_flags = train_df['is_duplicate']
print("\nClass distribution in training data:")
print(duplicate_flags.value_counts())
print(f"Duplicate percentage: {duplicate_flags.mean():.2%}")

# Per-column count of missing values
print("\nMissing values in train:")
print(train_df.isna().sum())

# Peek at the first rows
print("\nSample data:")
print(train_df.head())

# Step 2: Data Preprocessing
print("\n2. Data Preprocessing")
print("-" * 30)

def preprocess_text(text):
    """Normalise a single question string.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Everything else is converted with ``str()``, lower-cased, and has
    leading/trailing whitespace removed and internal whitespace runs collapsed
    to a single space.
    """
    # split() without arguments splits on any whitespace run and drops empties
    return "" if pd.isna(text) else " ".join(str(text).lower().split())

# Add a normalised "<column>_clean" copy of each question column
for raw_column in ('question1', 'question2'):
    train_df[f'{raw_column}_clean'] = train_df[raw_column].apply(preprocess_text)

# Keep only pairs where both cleaned questions are non-empty
both_present = train_df['question1_clean'].ne("") & train_df['question2_clean'].ne("")
train_df = train_df[both_present]

print(f"Dataset shape after cleaning: {train_df.shape}")

# Work on a fixed-size random sample for speed: at most 20,000 pairs.
# (Against the ~404k training rows reported in the run output that is about 5%
# of the data; the separate 10% figure below applies to the fine-tuning subset.)
sample_size = min(20000, len(train_df))
train_sample = train_df.sample(n=sample_size, random_state=42)

print(f"Using sample size: {sample_size} (for faster training)")

# Stratified 60/20/20 split: first carve off 40%, then halve it into val/test
train_data, temp_data = train_test_split(
    train_sample,
    test_size=0.4,
    random_state=42,
    stratify=train_sample['is_duplicate'],
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['is_duplicate'],
)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)} (constant for all experiments)")

# Fine-tuning experiments train on a random 10% slice of the training split
TRAINING_SUBSET_RATIO = 0.1
training_subset_size = int(len(train_data) * TRAINING_SUBSET_RATIO)
train_subset = train_data.sample(n=training_subset_size, random_state=42)

print("\nFor fine-tuning experiments:")
print(f"Using {TRAINING_SUBSET_RATIO*100}% of training data: {len(train_subset)} samples")

# Step 3: Helper Functions
print("\n3. Setting up Helper Functions")
print("-" * 30)

def calculate_metrics(y_true, y_pred):
    """Score binary predictions against the ground truth.

    Returns a dict with the keys 'f1_score', 'accuracy', 'precision' and
    'recall' (in that order), each holding the value of the corresponding
    scikit-learn metric called as ``metric(y_true, y_pred)``.
    """
    scorers = {
        'f1_score': f1_score,
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers.items()}

def predict_from_similarity(similarities, threshold=0.5):
    """Convert similarity scores to binary predictions.

    Args:
        similarities: Array-like of similarity scores (NumPy array, list,
            tuple or a single scalar).
        threshold: Scores greater than or equal to this value are labelled 1,
            all others 0.

    Returns:
        Integer NumPy array (or NumPy integer scalar for scalar input) of 0/1
        labels with the same shape as ``similarities``.
    """
    # Coerce first: a plain list or Python float has no `.astype`, and comparing
    # a list with `>=` does not broadcast element-wise.
    return (np.asarray(similarities) >= threshold).astype(int)

def find_best_threshold(similarities, y_true):
    """Grid-search the similarity cut-off that maximises F1.

    Candidates come from ``np.arange(0.1, 0.9, 0.05)``. Each candidate is
    turned into 0/1 predictions with ``predict_from_similarity`` and scored
    with ``f1_score`` against ``y_true``. The first candidate reaching the
    highest F1 wins; if no candidate scores above 0 the defaults
    ``(0.5, 0)`` are returned.

    Returns:
        Tuple ``(best_threshold, best_f1)``.
    """
    best_threshold, best_f1 = 0.5, 0

    for candidate in np.arange(0.1, 0.9, 0.05):
        candidate_f1 = f1_score(y_true, predict_from_similarity(similarities, candidate))
        # Strict comparison keeps the earliest candidate on ties
        if candidate_f1 > best_f1:
            best_threshold, best_f1 = candidate, candidate_f1

    return best_threshold, best_f1

# Metrics of every experiment, keyed by a display name
results = {}

# Step 4: Experiment 1 - Benchmark with Default Weights
print("\n4. Experiment 1: Benchmark with Default Weights")
print("-" * 50)

# Pre-trained sentence encoder, used here without any fine-tuning
model_name = 'all-MiniLM-L6-v2'
benchmark_model = SentenceTransformer(model_name)

print(f"Using model: {model_name}")

# Embed both sides of every validation pair
print("Encoding validation questions...")
val_q1_embeddings = benchmark_model.encode(val_data['question1_clean'].tolist(), show_progress_bar=True)
val_q2_embeddings = benchmark_model.encode(val_data['question2_clean'].tolist(), show_progress_bar=True)

# Cosine similarity of each question with its own partner (one score per pair)
from sklearn.metrics.pairwise import cosine_similarity
val_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(val_q1_embeddings, val_q2_embeddings)
])

# Pick the decision threshold on the validation split only
best_threshold, best_val_f1 = find_best_threshold(val_similarities, val_data['is_duplicate'].values)
print(f"Best threshold on validation: {best_threshold:.3f} (F1: {best_val_f1:.3f})")

# Apply that threshold to the held-out test split
print("Encoding test questions...")
test_q1_embeddings = benchmark_model.encode(test_data['question1_clean'].tolist(), show_progress_bar=True)
test_q2_embeddings = benchmark_model.encode(test_data['question2_clean'].tolist(), show_progress_bar=True)

test_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(test_q1_embeddings, test_q2_embeddings)
])
test_predictions = predict_from_similarity(test_similarities, best_threshold)

# Score and record the benchmark
benchmark_metrics = calculate_metrics(test_data['is_duplicate'].values, test_predictions)
results['Benchmark (Default)'] = benchmark_metrics

print("Benchmark Results:")
for metric, value in benchmark_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Step 5: Experiment 2 - Bi-encoder with Cosine Similarity Loss (10% data)
print("\n5. Experiment 2: Bi-encoder with Cosine Similarity Loss (10% training data)")
print("-" * 70)

def prepare_training_data(df):
    """Build one InputExample per row of ``df``.

    Each example holds the two cleaned questions as ``texts`` and the
    ``is_duplicate`` flag cast to float as ``label`` (the target similarity
    for the cosine-similarity loss).

    Args:
        df: DataFrame with 'question1_clean', 'question2_clean' and
            'is_duplicate' columns.

    Returns:
        List of InputExample objects in row order.
    """
    # Zipping the columns avoids building a Series per row as iterrows() does
    return [
        InputExample(texts=[q1, q2], label=float(flag))
        for q1, q2, flag in zip(df['question1_clean'], df['question2_clean'], df['is_duplicate'])
    ]

train_examples = prepare_training_data(train_subset)  # 10% fine-tuning subset
print(f"Created {len(train_examples)} training examples (10% of original)")

# Fresh copy of the base model so the benchmark weights stay untouched
cosine_model = SentenceTransformer(model_name)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(cosine_model)

# Warm up for ~10% of the optimizer steps instead of a fixed 50. The recorded
# run has 1200 examples, i.e. 38 batches of 32 for the single epoch, so a
# 50-step linear warmup would still be ramping the learning rate up when
# training ends and the configured peak rate would never be reached.
cosine_warmup_steps = max(1, int(0.1 * len(train_dataloader)))

print("Fine-tuning with Cosine Similarity Loss...")
cosine_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=cosine_warmup_steps,
    show_progress_bar=True
)

# Tune the decision threshold on the validation split
print("Evaluating Cosine Similarity model...")
val_q1_embeddings = cosine_model.encode(val_data['question1_clean'].tolist())
val_q2_embeddings = cosine_model.encode(val_data['question2_clean'].tolist())

val_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(val_q1_embeddings, val_q2_embeddings)
])
best_threshold, _ = find_best_threshold(val_similarities, val_data['is_duplicate'].values)

# Score the held-out test split with that threshold
test_q1_embeddings = cosine_model.encode(test_data['question1_clean'].tolist())
test_q2_embeddings = cosine_model.encode(test_data['question2_clean'].tolist())

test_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(test_q1_embeddings, test_q2_embeddings)
])
test_predictions = predict_from_similarity(test_similarities, best_threshold)

cosine_metrics = calculate_metrics(test_data['is_duplicate'].values, test_predictions)
results['Bi-encoder (Cosine Loss)'] = cosine_metrics

print("Cosine Similarity Loss Results:")
for metric, value in cosine_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Step 6: Experiment 3 - Bi-encoder with Contrastive Loss (10% data)
print("\n6. Experiment 3: Bi-encoder with Contrastive Loss (10% training data)")
print("-" * 65)

# Fresh copy of the base model for this loss
contrastive_model = SentenceTransformer(model_name)

# One example per pair in the 10% subset; the integer label marks the pair as
# duplicate (1) or not (0)
contrastive_examples = [
    InputExample(texts=[q1, q2], label=int(flag))
    for q1, q2, flag in zip(
        train_subset['question1_clean'],
        train_subset['question2_clean'],
        train_subset['is_duplicate'],
    )
]

contrastive_dataloader = DataLoader(contrastive_examples, shuffle=True, batch_size=32)
contrastive_loss = losses.ContrastiveLoss(contrastive_model)

# Warm up for ~10% of the optimizer steps instead of a fixed 50. With the
# 1200-example subset of the recorded run there are only 38 batches of 32 in
# the single epoch, so a 50-step linear warmup would never finish and the
# configured peak learning rate would never be reached.
contrastive_warmup_steps = max(1, int(0.1 * len(contrastive_dataloader)))

print("Fine-tuning with Contrastive Loss...")
contrastive_model.fit(
    train_objectives=[(contrastive_dataloader, contrastive_loss)],
    epochs=1,
    warmup_steps=contrastive_warmup_steps,
    show_progress_bar=True
)

# Tune the decision threshold on the validation split
print("Evaluating Contrastive Loss model...")
val_q1_embeddings = contrastive_model.encode(val_data['question1_clean'].tolist())
val_q2_embeddings = contrastive_model.encode(val_data['question2_clean'].tolist())

val_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(val_q1_embeddings, val_q2_embeddings)
])
best_threshold, _ = find_best_threshold(val_similarities, val_data['is_duplicate'].values)

# Score the held-out test split with that threshold
test_q1_embeddings = contrastive_model.encode(test_data['question1_clean'].tolist())
test_q2_embeddings = contrastive_model.encode(test_data['question2_clean'].tolist())

test_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(test_q1_embeddings, test_q2_embeddings)
])
test_predictions = predict_from_similarity(test_similarities, best_threshold)

contrastive_metrics = calculate_metrics(test_data['is_duplicate'].values, test_predictions)
results['Bi-encoder (Contrastive Loss)'] = contrastive_metrics

print("Contrastive Loss Results:")
for metric, value in contrastive_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Step 7: Experiment 4 - Multiple Negative Ranking Loss (10% data)
print("\n7. Experiment 4: Bi-encoder with Multiple Negative Ranking Loss (10% training data)")
print("-" * 75)

# Fresh copy of the base model for this loss
mnr_model = SentenceTransformer(model_name)

# This loss is fed only the duplicate (positive) pairs of the 10% subset;
# the examples carry no label
positive_pairs = train_subset[train_subset['is_duplicate'] == 1]
mnr_examples = [
    InputExample(texts=[q1, q2])
    for q1, q2 in zip(positive_pairs['question1_clean'], positive_pairs['question2_clean'])
]

print(f"Using {len(mnr_examples)} positive pairs for MNR loss (from 10% subset)")

mnr_dataloader = DataLoader(mnr_examples, shuffle=True, batch_size=32)
mnr_loss = losses.MultipleNegativesRankingLoss(mnr_model)

# Warm up for ~10% of the optimizer steps instead of a fixed 50. Only the
# positive pairs are used here, so the epoch has even fewer batches than the
# other experiments and a 50-step linear warmup would end training with the
# learning rate far below its configured peak.
mnr_warmup_steps = max(1, int(0.1 * len(mnr_dataloader)))

print("Fine-tuning with Multiple Negative Ranking Loss...")
mnr_model.fit(
    train_objectives=[(mnr_dataloader, mnr_loss)],
    epochs=1,
    warmup_steps=mnr_warmup_steps,
    show_progress_bar=True
)

# Tune the decision threshold on the validation split
print("Evaluating MNR Loss model...")
val_q1_embeddings = mnr_model.encode(val_data['question1_clean'].tolist())
val_q2_embeddings = mnr_model.encode(val_data['question2_clean'].tolist())

val_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(val_q1_embeddings, val_q2_embeddings)
])
best_threshold, _ = find_best_threshold(val_similarities, val_data['is_duplicate'].values)

# Score the held-out test split with that threshold
test_q1_embeddings = mnr_model.encode(test_data['question1_clean'].tolist())
test_q2_embeddings = mnr_model.encode(test_data['question2_clean'].tolist())

test_similarities = np.array([
    cosine_similarity([q1_vec], [q2_vec])[0][0]
    for q1_vec, q2_vec in zip(test_q1_embeddings, test_q2_embeddings)
])
test_predictions = predict_from_similarity(test_similarities, best_threshold)

mnr_metrics = calculate_metrics(test_data['is_duplicate'].values, test_predictions)
results['Bi-encoder (MNR Loss)'] = mnr_metrics

print("Multiple Negative Ranking Loss Results:")
for metric, value in mnr_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Step 8: Experiment 5 - Cross-encoder (10% data)
print("\n8. Experiment 5: Cross-encoder (10% training data)")
print("-" * 50)

from sentence_transformers.cross_encoder import CrossEncoder

# Two output classes: 0 = not duplicate, 1 = duplicate.
# NOTE(review): the ms-marco checkpoint presumably ships a single-logit head;
# confirm that the installed sentence-transformers version re-initialises the
# classifier when it is loaded with num_labels=2.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', num_labels=2)

# CrossEncoder.fit consumes a DataLoader of InputExample objects, so the pairs
# are wrapped accordingly (integer class label per pair), using the 10% subset
ce_train_samples = [
    InputExample(texts=[q1, q2], label=int(flag))
    for q1, q2, flag in zip(
        train_subset['question1_clean'],
        train_subset['question2_clean'],
        train_subset['is_duplicate'],
    )
]

print(f"Created {len(ce_train_samples)} cross-encoder training samples (from 10% subset)")

# The batch size is configured on the DataLoader; fit() has no batch_size
# or train_samples parameter
ce_train_dataloader = DataLoader(ce_train_samples, shuffle=True, batch_size=32)

# Warm up for ~10% of the optimizer steps; a fixed 50 steps would exceed the
# 38 batches that the 1200-example subset of the recorded run provides
ce_warmup_steps = max(1, int(0.1 * len(ce_train_dataloader)))

print("Fine-tuning Cross-encoder...")
cross_encoder.fit(
    train_dataloader=ce_train_dataloader,
    epochs=1,
    warmup_steps=ce_warmup_steps,
    show_progress_bar=True
)

# Score every test pair
print("Evaluating Cross-encoder...")
test_pairs = [
    [q1, q2]
    for q1, q2 in zip(test_data['question1_clean'], test_data['question2_clean'])
]
ce_predictions = cross_encoder.predict(test_pairs)

# With num_labels=2 each row holds one score per class, so the predicted label
# is the index of the larger score. Thresholding the matrix at 0.5 would give a
# 2-D array that the binary metrics below cannot consume.
ce_binary_predictions = np.asarray(ce_predictions).argmax(axis=1)

ce_metrics = calculate_metrics(test_data['is_duplicate'].values, ce_binary_predictions)
results['Cross-encoder'] = ce_metrics

print("Cross-encoder Results:")
for metric, value in ce_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Step 9: Results Comparison
print("\n9. Final Results Comparison")
print("=" * 50)

# One row per experiment, one column per metric
results_df = pd.DataFrame(results).T
print(results_df.round(4))

models = list(results.keys())
f1_scores = [results[model]['f1_score'] for model in models]

# Two panels side by side: F1 only (left) and all metrics grouped (right)
fig, (f1_ax, all_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: F1 score per model
bars = f1_ax.bar(range(len(models)), f1_scores)
f1_ax.set_xlabel('Models')
f1_ax.set_ylabel('F1 Score')
f1_ax.set_title('F1 Score Comparison')
f1_ax.set_xticks(range(len(models)))
f1_ax.set_xticklabels(models, rotation=45, ha='right')
f1_ax.grid(axis='y', alpha=0.3)

# Print each F1 value just above its bar
for bar, score in zip(bars, f1_scores):
    f1_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
               f'{score:.3f}', ha='center', va='bottom')

# Right panel: grouped bars, one group per model and one bar per metric
metrics_to_plot = ['f1_score', 'accuracy', 'precision', 'recall']
x_pos = np.arange(len(models))
width = 0.2

for offset, metric in enumerate(metrics_to_plot):
    values = [results[model][metric] for model in models]
    all_ax.bar(x_pos + offset*width, values, width, label=metric.replace('_', ' ').title())

all_ax.set_xlabel('Models')
all_ax.set_ylabel('Score')
all_ax.set_title('All Metrics Comparison')
# Centre each tick label under its group of four bars
all_ax.set_xticks(x_pos + width*1.5)
all_ax.set_xticklabels(models, rotation=45, ha='right')
all_ax.legend()
all_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Report the experiment with the highest test F1
best_model = max(results, key=lambda name: results[name]['f1_score'])
print(f"\nBest performing model: {best_model}")
print(f"Best F1 Score: {results[best_model]['f1_score']:.4f}")

2025-09-07 03:35:01.352897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757216101.709164      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757216101.817880      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Assignment 5: Quora Duplicate Question Detection (Optimized)

1. Loading and Exploring Data
------------------------------
Train dataset shape: (404290, 6)
Test dataset shape: (3563475, 3)

Train columns: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
Test columns: ['test_id', 'question1', 'question2']

Class distribution in training data:
is_duplicate
0    255027
1    149263
Name: count, dtype: int64
Duplicate percentage: 36.92%

Missing values in train:
id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

Sample data:
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which on

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using model: all-MiniLM-L6-v2
Encoding validation questions...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Best threshold on validation: 0.750 (F1: 0.741)
Encoding test questions...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Benchmark Results:
  f1_score: 0.7285
  accuracy: 0.7630
  precision: 0.6269
  recall: 0.8694

5. Experiment 2: Bi-encoder with Cosine Similarity Loss (10% training data)
----------------------------------------------------------------------
Created 1200 training examples (10% of original)
Fine-tuning with Cosine Similarity Loss...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>