In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Locations of the three dataset splits (train / validation / test)
train_file, vali_file, test_file = (
    '/content/train.csv',
    '/content/vali.csv',
    '/content/test.csv',
)

# Read each split into its own DataFrame, in train -> vali -> test order
train_data, vali_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file, vali_file, test_file)
)

# Stack both query columns of every split into a single Series; this is the
# corpus later used to train the word embeddings.
query_columns = [
    split_frame[column_name]
    for split_frame in (train_data, vali_data, test_data)
    for column_name in ('query1', 'query2')
]
all_queries = pd.concat(query_columns)

In [None]:
def _tokenize_query(query):
    """Split one query on whitespace; a missing (None/NaN) query yields no tokens."""
    if query is None or (isinstance(query, float) and np.isnan(query)):
        return []
    return str(query).split()


def _train_word2vec(all_queries, vector_size=100):
    """
    Train a Skip-Gram Word2Vec model on every query in ``all_queries``.

    Args:
        all_queries: Iterable of query strings (e.g. a pandas Series or a list).
        vector_size: Dimensionality of the learned word vectors.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    tokenized_all_queries = [_tokenize_query(query) for query in all_queries]
    return Word2Vec(
        sentences=tokenized_all_queries,
        vector_size=vector_size,  # Larger embedding dimension for richer representations
        min_count=5,              # Exclude very rare words
        sg=1,                     # Use Skip-Gram model
        negative=10,              # Negative sampling for efficiency
        epochs=10                 # Train for more epochs
    )


def generate_combined_embeddings(all_queries, individual_queries, embedding_dim=50, w2v_model=None):
    """
    Embed each query as the mean of the Word2Vec vectors of its in-vocabulary words.

    Args:
        all_queries: Corpus of query strings used to train Word2Vec when
            ``w2v_model`` is not supplied.
        individual_queries: Iterable of query strings to embed.
        embedding_dim: Retained for backward compatibility only. The width of
            the returned vectors is dictated by the Word2Vec model (100 when it
            is trained here), and the zero-vector fallback now always matches
            that width instead of this value.
        w2v_model: Optional already-trained ``Word2Vec`` model. Pass the same
            model for every split so all embeddings come from one model and
            training happens only once.

    Returns:
        ``np.ndarray`` of shape ``(len(individual_queries), vector_size)`` and
        dtype float32. Queries with no in-vocabulary word map to a zero vector.
    """
    if w2v_model is None:
        w2v_model = _train_word2vec(all_queries)

    keyed_vectors = w2v_model.wv
    vector_size = keyed_vectors.vector_size

    embeddings = []
    for query in individual_queries:
        word_vectors = [
            keyed_vectors[word]
            for word in _tokenize_query(query)
            if word in keyed_vectors
        ]
        if word_vectors:
            avg_vector = np.mean(word_vectors, axis=0)
        else:
            # Must have the same width as the trained vectors, otherwise the
            # stacked result becomes a ragged array.
            avg_vector = np.zeros(vector_size, dtype=np.float32)
        embeddings.append(avg_vector)

    # reshape keeps a 2-D (0, vector_size) result even when there are no queries
    return np.array(embeddings, dtype=np.float32).reshape(-1, vector_size)


# Train Word2Vec a single time and reuse it everywhere. Retraining per call is
# six times the work, and separately trained models are not guaranteed to place
# words in the same vector space, which would make train/validation/test
# features inconsistent with each other.
query_w2v_model = _train_word2vec(all_queries)

# Generate embeddings for train, validation, and test sets.
# Each pair is represented by the element-wise SUM of its two query embeddings.
train_query1_embeddings = generate_combined_embeddings(
    all_queries, train_data['query1'].tolist(), w2v_model=query_w2v_model)
train_query2_embeddings = generate_combined_embeddings(
    all_queries, train_data['query2'].tolist(), w2v_model=query_w2v_model)
train_embeddings = train_query1_embeddings + train_query2_embeddings

vali_query1_embeddings = generate_combined_embeddings(
    all_queries, vali_data['query1'].tolist(), w2v_model=query_w2v_model)
vali_query2_embeddings = generate_combined_embeddings(
    all_queries, vali_data['query2'].tolist(), w2v_model=query_w2v_model)
vali_embeddings = vali_query1_embeddings + vali_query2_embeddings

test_query1_embeddings = generate_combined_embeddings(
    all_queries, test_data['query1'].tolist(), w2v_model=query_w2v_model)
test_query2_embeddings = generate_combined_embeddings(
    all_queries, test_data['query2'].tolist(), w2v_model=query_w2v_model)
test_embeddings = test_query1_embeddings + test_query2_embeddings

# Convert embeddings to tensors; labels become float column vectors of shape (N, 1)
x_train = torch.tensor(train_embeddings, dtype=torch.float32)
y_train = torch.tensor(train_data['label'].values, dtype=torch.float32).view(-1, 1)

x_vali = torch.tensor(vali_embeddings, dtype=torch.float32)
y_vali = torch.tensor(vali_data['label'].values, dtype=torch.float32).view(-1, 1)

x_test = torch.tensor(test_embeddings, dtype=torch.float32)
y_test = torch.tensor(test_data['label'].values, dtype=torch.float32).view(-1, 1)

print("Embedding generation complete!")

Embedding generation complete!


In [None]:
class BinaryClassifier(nn.Module):
    """
    One-hidden-layer feed-forward network for binary classification.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU ->
    Linear(hidden_dim -> 1) -> Sigmoid, so each output is a value in (0, 1).
    """

    def __init__(self, input_dim, hidden_dim):
        """
        Args:
            input_dim: Number of features in each input row.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        # Layers are registered in the order they are applied in forward()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid-activated score per row."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Grid search over hidden size and learning rate, scored by validation accuracy.
TUNING_SEED = 42                  # same seed for every configuration -> repeatable, like-for-like comparison
tuning_criterion = nn.BCELoss()   # stateless, so build it once instead of on every step

# Start below any achievable accuracy so the first configuration is always
# recorded; starting at 0 would leave best_hyperparams empty if every
# configuration scored exactly 0.0.
best_accuracy = float('-inf')
best_hyperparams = {}

for hidden_dim in [32, 64, 128]:  # Try different hidden dimensions
    for lr in [0.1, 0.01, 0.001]:     # Try different learning rates
        # Re-seed so weight initialisation does not depend on how many
        # configurations were tried before this one.
        torch.manual_seed(TUNING_SEED)
        model = BinaryClassifier(x_train.shape[1], hidden_dim)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Training (full-batch gradient steps)
        model.train()
        for epoch in range(50):  # Train for fewer epochs to save time during tuning
            optimizer.zero_grad()
            outputs = model(x_train)
            loss = tuning_criterion(outputs, y_train)
            loss.backward()
            optimizer.step()

        # Evaluate on validation set
        model.eval()
        with torch.no_grad():
            predictions = model(x_vali)
            predicted_labels = (predictions > 0.5).float()
            accuracy = (predicted_labels == y_vali).float().mean().item()

        # Track the best hyperparameters
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparams = {'hidden_dim': hidden_dim, 'lr': lr}

print(f"Best Hyperparameters: {best_hyperparams}, Validation Accuracy: {best_accuracy:.4f}")

# Train final model with best hyperparameters
torch.manual_seed(42)  # make the final fit repeatable across Restart & Run All
model = BinaryClassifier(x_train.shape[1], best_hyperparams['hidden_dim'])
optimizer = optim.Adam(model.parameters(), lr=best_hyperparams['lr'])
criterion = nn.BCELoss()  # stateless, so build it once instead of on every step

# Train on the combined training and validation sets
x_combined = torch.cat((x_train, x_vali), dim=0)
y_combined = torch.cat((y_train, y_vali), dim=0)

epochs = 100
model.train()
for epoch in range(epochs):  # Full training (full-batch gradient steps)
    optimizer.zero_grad()
    outputs = model(x_combined)
    loss = criterion(outputs, y_combined)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

# Test the final model exactly once, after training has finished. Scoring the
# test set inside the epoch loop printed a line per epoch and exposed test
# performance during training.
model.eval()
with torch.no_grad():
    predictions = model(x_test)
    predicted_labels = (predictions > 0.5).float()
    accuracy = (predicted_labels == y_test).float().mean().item()

print(f"Final Accuracy: {accuracy:.4f}")

Best Hyperparameters: {'hidden_dim': 64, 'lr': 0.01}, Validation Accuracy: 0.6560
Test Accuracy: 0.5000
Test Accuracy: 0.5300
Test Accuracy: 0.5090
Test Accuracy: 0.5050
Test Accuracy: 0.5280
Test Accuracy: 0.5730
Test Accuracy: 0.6220
Test Accuracy: 0.6230
Test Accuracy: 0.6230
Test Accuracy: 0.6290
Epoch [10/100], Loss: 0.6654
Test Accuracy: 0.6370
Test Accuracy: 0.6390
Test Accuracy: 0.6340
Test Accuracy: 0.6360
Test Accuracy: 0.6400
Test Accuracy: 0.6430
Test Accuracy: 0.6360
Test Accuracy: 0.6380
Test Accuracy: 0.6410
Test Accuracy: 0.6500
Epoch [20/100], Loss: 0.6241
Test Accuracy: 0.6520
Test Accuracy: 0.6550
Test Accuracy: 0.6590
Test Accuracy: 0.6640
Test Accuracy: 0.6660
Test Accuracy: 0.6690
Test Accuracy: 0.6780
Test Accuracy: 0.6760
Test Accuracy: 0.6690
Test Accuracy: 0.6800
Epoch [30/100], Loss: 0.5936
Test Accuracy: 0.6870
Test Accuracy: 0.6720
Test Accuracy: 0.6840
Test Accuracy: 0.6920
Test Accuracy: 0.6860
Test Accuracy: 0.6930
Test Accuracy: 0.6930
Test Accuracy: 0.