In [1]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
from tqdm import tqdm 
import random
import pandas as pd

In [2]:
# Load data
texts = []
labels = []

with open('domain1_train.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        texts.append(data["text"])
        labels.append(data["label"])

# data_with_label_1 = [pair for pair in zip(texts[:9750], labels[:9750]) if pair[1] == 1]
# data_with_label_0 = [pair for pair in zip(texts[-9750:], labels[-9750:]) if pair[1] == 0]

# selected_data_label_1 = random.sample(data_with_label_1, 2150)
# selected_data_label_0 = random.sample(data_with_label_0, 2150)

# selected_data = selected_data_label_1 + selected_data_label_0
# texts, labels = zip(*selected_data)
# texts = list(texts)
# labels = list(labels)

with open('domain2_train.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        texts.append(data["text"])
        labels.append(data["label"])



In [3]:
new_texts = []
for text in texts:
    new_text = [x + 1 for x in text]
    new_texts.append(new_text)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(new_texts, labels, test_size=0.2, random_state=42)

In [65]:

batch_size = 256
from torch.utils.data import DataLoader, TensorDataset
def pad_or_truncate_sequences(sequences, fixed_length, padding_value=0):
    new_sequences = []
    for seq in sequences:
        length = len(seq)
        if length >= fixed_length:
            new_seq = seq[:fixed_length]
        else:
            new_seq = seq + [padding_value] * (fixed_length - length)
        new_sequences.append(torch.tensor(new_seq))
    return torch.stack(new_sequences)  # Stack into a single tensor

fixed_length = 350  # Change this to your desired fixed length

# Pad or truncate training and testing sequences
X_train_padded_sequences = pad_or_truncate_sequences(X_train, fixed_length)
X_test_padded_sequences = pad_or_truncate_sequences(X_test, fixed_length)

# Convert labels to PyTorch tensors
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader
train_data = TensorDataset(X_train_padded_sequences, y_train_tensor)
test_data = TensorDataset(X_test_padded_sequences, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNNLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim, lstm_hidden_dim, output_dim, dropout):
        super(TextCNNLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        
        self.lstm = nn.LSTM(num_filters * len(filter_sizes), lstm_hidden_dim, batch_first=True)
        
        self.fc = nn.Linear(lstm_hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.output = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text).unsqueeze(1)
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool2d(conv, (conv.shape[2], 1)).squeeze(3).squeeze(2) for conv in conved]
        
        cat = torch.cat(pooled, dim=1)
        
        lstm_out, (hn, cn) = self.lstm(cat.unsqueeze(1))
        
        hn = hn.squeeze(0)
        
        dense = self.fc(hn)
        dense = self.dropout(dense)
        
        return self.output(dense)

# Example hyperparameters
vocab_size = 5001
embedding_dim = 128
num_filters = 16
filter_sizes = [3, 5, 7]
hidden_dim = 128
lstm_hidden_dim = 128
output_dim = 1
dropout = 0.6

# Include L2 regularization in your optimizer (this is an example with Adam)
model = TextCNNLSTM(vocab_size, embedding_dim, num_filters, filter_sizes, hidden_dim, lstm_hidden_dim, output_dim, dropout)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()  # Assuming binary classification

In [69]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Training loop
# Hyperparameters and other setup
epochs = 100
patience = 2  # Number of epochs with no improvement to wait before stopping
best_val_loss = float('inf')  # Initialize best validation loss as infinity
counter = 0  # Initialize counter for early stopping
model.to(device)
# Training loop with early stopping
for epoch in range(epochs):
    # Training phase
    model.train()
    with tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            pbar.update(1)

    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            val_loss += loss.item()

            # Compute accuracy
            predicted = torch.round(torch.sigmoid(outputs))  # Assuming binary classification
            total += batch_y.size(0)
            correct += (predicted == batch_y.unsqueeze(1)).sum().item()

    val_loss /= len(test_loader)  # Average validation loss
    val_acc = 100 * correct / total  # Validation accuracy

    print(f'Validation loss: {val_loss}, Validation Accuracy: {val_acc}%')

    # Check early stopping conditions
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0  # Reset counter
        torch.save(model.state_dict(), 'best_model.pth')  # Save best model
    else:
        counter += 1
        print(f'EarlyStopping counter: {counter} out of {patience}')
        if counter >= patience:
            print('Early stopping triggered.')
            break  # Stop training

Epoch 1/100: 100%|██████████| 108/108 [00:10<00:00, 10.68batch/s]


Validation loss: 0.33085234959920246, Validation Accuracy: 87.58720930232558%


Epoch 2/100: 100%|██████████| 108/108 [00:10<00:00, 10.04batch/s]


Validation loss: 0.3011147203268828, Validation Accuracy: 88.99709302325581%


Epoch 3/100: 100%|██████████| 108/108 [00:10<00:00, 10.20batch/s]


Validation loss: 0.30138570732540554, Validation Accuracy: 88.25581395348837%
EarlyStopping counter: 1 out of 2


Epoch 4/100: 100%|██████████| 108/108 [00:10<00:00, 10.72batch/s]


Validation loss: 0.3340665113042902, Validation Accuracy: 86.90406976744185%
EarlyStopping counter: 2 out of 2
Early stopping triggered.


In [70]:
test_ids = []
test_texts = []
# Open file for reading
with open('test_set.json', 'r') as f:
    for line in f:
        # Parse the JSON line into a Python dictionary
        obj = json.loads(line)
        test_ids.append(obj['id'])
        test_texts.append(obj['text'])
        
test_texts = pad_or_truncate_sequences(test_texts, fixed_length)
test_texts_tensor = torch.tensor(test_texts, dtype=torch.long).to(device)
model.eval()
# Make the prediction

with torch.no_grad():
    output = model(test_texts_tensor.to(device))

# Convert logits to probabilities
probabilities = torch.sigmoid(output)

# Threshold probabilities to get binary predictions
predicted_classes = (probabilities >= 0.5).int()

# Move to CPU and convert to NumPy array
predicted_classes = predicted_classes.cpu().numpy().flatten()



  test_texts_tensor = torch.tensor(test_texts, dtype=torch.long).to(device)


In [71]:
predicted_classes

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,

In [72]:
import pandas as pd
output_df = pd.DataFrame({"id":test_ids, "class": predicted_classes})
output_df.to_csv("output.csv", index=False)