# 1. Load Data

In [18]:
# Functions loading data from json file to dataframe
import json

def load_json_data(domain1_train_file, domain2_train_file, test_data_file):
    """Read the three JSON-lines files and return their records as lists.

    Each file is expected to hold one JSON document per line; blank or
    whitespace-only lines are ignored.

    Args:
        domain1_train_file: Path to the domain-1 training file.
        domain2_train_file: Path to the domain-2 training file.
        test_data_file: Path to the test file.

    Returns:
        A tuple ``(domain1_train_data, domain2_train_data, test_data)`` where
        each element is a list of the parsed JSON values, in file order.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    def load_json_lines(file):
        # Pin the encoding: without it open() falls back to the platform's
        # locale encoding, which can mis-decode non-ASCII JSON text.
        with open(file, "r", encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]  # skip blank lines

    domain1_train_data = load_json_lines(domain1_train_file)
    domain2_train_data = load_json_lines(domain2_train_file)
    test_data = load_json_lines(test_data_file)

    return domain1_train_data, domain2_train_data, test_data



import pandas as pd

# Convert to dataframe from list
def convert_to_dataframe(domain1_train_data, domain2_train_data, test_data):
    """Wrap each list of records in its own pandas DataFrame.

    Args:
        domain1_train_data: Records for the domain-1 training set.
        domain2_train_data: Records for the domain-2 training set.
        test_data: Records for the test set.

    Returns:
        A tuple ``(df_train_domain1, df_train_domain2, df_test)`` in the same
        order as the arguments.
    """
    record_sets = (domain1_train_data, domain2_train_data, test_data)
    frame_domain1, frame_domain2, frame_test = map(pd.DataFrame, record_sets)
    return frame_domain1, frame_domain2, frame_test


# Input locations, relative to the notebook's working directory.
domain1_train_file = "domain1_train_data.json"
domain2_train_file = "domain2_train_data.json"
test_data_file = "test_data.json"

# Step 1: parse each file into a list of records.
domain1_train_data, domain2_train_data, test_data = load_json_data(
    domain1_train_file=domain1_train_file,
    domain2_train_file=domain2_train_file,
    test_data_file=test_data_file,
)

# Step 2: turn each list of records into a DataFrame.
df_train_domain1, df_train_domain2, df_test = convert_to_dataframe(
    domain1_train_data=domain1_train_data,
    domain2_train_data=domain2_train_data,
    test_data=test_data,
)

# 2. Process Data

In [19]:
# Undersample domain2 so both labels have the same number of rows.
# The code treats label 1 as the majority class and label 0 as the minority.
df_major = df_train_domain2[df_train_domain2['label'] == 1]
df_minor = df_train_domain2[df_train_domain2['label'] == 0]

# Make the assumption explicit: sampling without replacement below would
# fail anyway if label 1 had fewer rows than label 0.
assert len(df_major) >= len(df_minor), "expected label 1 to be the majority class in domain2"

df_major_down = df_major.sample(len(df_minor), random_state=42)
# Seed the shuffle as well, so a fresh kernel reproduces the same row order.
df_balanced = (
    pd.concat([df_major_down, df_minor])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [20]:
# Pad or truncate every token sequence to the same fixed length
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    """Dataset of fixed-length token-id sequences with optional labels.

    Every sequence in ``df['text']`` is truncated or right-padded to
    ``max_len`` at construction time. If the frame has a ``label`` column,
    items are ``(text_tensor, label_tensor)`` pairs; otherwise each item is
    just ``text_tensor``.

    Args:
        df: DataFrame with a ``text`` column of token-id sequences (lists,
            tuples or other iterables of ints) and optionally a ``label``
            column.
        max_len: Length every sequence is padded or truncated to.
        pad_token: Token id appended to sequences shorter than ``max_len``.
    """

    def __init__(self, df, max_len=512, pad_token=0):
        self.max_len = max_len
        self.pad_token = pad_token
        self.texts = [self._pad_or_truncate(seq) for seq in df['text']]
        self.labels = df['label'].tolist() if 'label' in df.columns else None

    def _pad_or_truncate(self, seq):
        """Return ``seq`` as a new list of exactly ``max_len`` token ids."""
        # Copy into a list first so tuples/arrays pad correctly; list + list
        # concatenation is the only form that appends rather than raising
        # (tuple) or broadcasting (array).
        tokens = list(seq)[:self.max_len]
        return tokens + [self.pad_token] * (self.max_len - len(tokens))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_tensor = torch.tensor(self.texts[idx], dtype=torch.long)
        # Test "has labels" explicitly instead of relying on list truthiness.
        if self.labels is not None:
            # Float labels so they can be fed straight into a BCE-style loss.
            label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
            return text_tensor, label_tensor
        return text_tensor

from torch.utils.data import DataLoader

# Build the training set from domain1 plus the balanced domain2 sample.
# The shuffle is seeded so a fresh kernel reproduces the same row order.
df_combined = (
    pd.concat([df_train_domain1, df_balanced])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
train_dataset = TextDataset(df_combined, max_len=512)
test_dataset = TextDataset(df_test, max_len=512)

# Only the training loader is created here; it reshuffles every epoch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 3. Model Definition

In [21]:
# Basic Transformer implementation
'''
vocab_size=17119+1
emb_dim=128 -> dimensions for word token embeddings
n_heads=4 -> lightweight with good performance
n_layers=2 -> 2 layers of transformer encoders
max_len=512 -> can cover 93.8% of the samples
'''

import torch
import torch.nn as nn

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size=17120, emb_dim=128, n_heads=4, n_layers=2, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.pos_encoding = self._positional_encoding(max_len, emb_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        self.fc = nn.Linear(emb_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def _positional_encoding(self, max_len, d_model):
        pos = torch.arange(0, max_len).unsqueeze(1)
        i = torch.arange(0, d_model, 2)
        angle_rates = 1 / torch.pow(10000, (i.float() / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(pos * angle_rates)
        pe[:, 1::2] = torch.cos(pos * angle_rates)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, x):
        # x: [batch_size, seq_len]
        mask = (x == 0)
        emb = self.embedding(x) + self.pos_encoding[:, :x.size(1)].to(x.device)
        out = self.transformer(emb, src_key_padding_mask=mask)
        out = out.mean(dim=1) 
        out = self.sigmoid(self.fc(out)).squeeze(-1)
        return out


# 4. Model Implementation

In [22]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch before building the model so weight initialisation, dropout and
# the DataLoader shuffle order are repeatable from a fresh kernel.
torch.manual_seed(42)

model = TransformerClassifier(vocab_size=17120).to(device)
# The model already applies a sigmoid, so plain BCELoss is the matching loss.
# No class weighting is used because domain2 was balanced by undersampling.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train
for epoch in range(5):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)  # Output shape: [batch_size]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Update tqdm progress bar with current loss
        progress_bar.set_postfix(loss=loss.item())

    # Report once per epoch, after every batch has been counted. Doing this
    # inside the batch loop divides a partial sum by the full batch count and
    # prints a line per step, which also breaks up the progress bar.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")



Epoch 1, Loss: 32.1812
Epoch 2, Loss: 28.1049
Epoch 3, Loss: 20.5170
Epoch 4, Loss: 15.5824
Epoch 5, Loss: 14.1194


In [25]:
# Inference: switch off dropout and gradient tracking, then threshold the
# predicted probabilities at 0.5 to obtain hard 0/1 labels.
model.eval()
predictions = []
test_loader = DataLoader(test_dataset, batch_size=32)
with torch.no_grad():
    for batch in test_loader:
        probabilities = model(batch.to(device))
        hard_labels = (probabilities > 0.5).long()
        predictions += hard_labels.cpu().tolist()

In [27]:
# Output prediction
# Build the submission as a new frame instead of writing a 'label' column
# into df_test. Mutating df_test is hidden state: re-running the dataset cell
# afterwards would give the test dataset labels, so inference would iterate
# over (text, label) pairs and fail.
assert len(predictions) == len(df_test), "expected exactly one prediction per test row"

df_submission = df_test[['id']].assign(label=predictions)

df_submission.to_csv("submission_trans_v1.csv", index=False)

print(df_submission.shape)          # (4000, 2) in the recorded run
print(df_submission.head())
print(df_submission['label'].value_counts())  # check distribution

(4000, 2)
   id  label
0   0      1
1   1      0
2   2      0
3   3      1
4   4      0
label
1    2273
0    1727
Name: count, dtype: int64
