# 1.Load Data

In [12]:
# Functions loading data from json file to dataframe
import json

def load_json_data(domain1_train_file, domain2_train_file, test_data_file):
    def load_json_lines(file):
        with open(file, "r") as f:
            return [json.loads(line) for line in f if line.strip()]  # avoid empty line

    domain1_train_data = load_json_lines(domain1_train_file)
    domain2_train_data = load_json_lines(domain2_train_file)
    test_data = load_json_lines(test_data_file)

    return domain1_train_data, domain2_train_data, test_data



import pandas as pd

# Convert to dataframe from list
def convert_to_dataframe(domain1_train_data, domain2_train_data, test_data):

    df_train_domain1 = pd.DataFrame(domain1_train_data)

    df_train_domain2 = pd.DataFrame(domain2_train_data)

    df_test = pd.DataFrame(test_data)

    return df_train_domain1, df_train_domain2, df_test


# Load data from json file to dataframe
domain1_train_file = "domain1_train_data.json"
domain2_train_file = "domain2_train_data.json"
test_data_file = "test_data.json"

domain1_train_data, domain2_train_data, test_data = load_json_data(domain1_train_file, domain2_train_file, test_data_file)
df_train_domain1, df_train_domain2, df_test = convert_to_dataframe(domain1_train_data, domain2_train_data, test_data)

# 2.Process Data

In [13]:
# Apply Oversampling to domain2
df_major = df_train_domain2[df_train_domain2['label'] == 1]
df_minor = df_train_domain2[df_train_domain2['label'] == 0]

df_minor_up = df_minor.sample(len(df_major), replace=True, random_state=42)
df_balanced = pd.concat([df_minor_up, df_major]).sample(frac=1).reset_index(drop=True)

In [14]:
# Process text in training texts to same length
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, df, max_len=512, pad_token=0):
        self.max_len = max_len
        self.pad_token = pad_token
        self.texts = [self._pad_or_truncate(seq) for seq in df['text']]
        self.labels = df['label'].tolist() if 'label' in df.columns else None

    def _pad_or_truncate(self, seq):
        if len(seq) < self.max_len:
            return seq + [self.pad_token] * (self.max_len - len(seq))
        else:
            return seq[:self.max_len]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_tensor = torch.tensor(self.texts[idx], dtype=torch.long)
        if self.labels:
            label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
            return text_tensor, label_tensor
        else:
            return text_tensor

# Create tow loaders for Transformer
df_combined = pd.concat([df_train_domain1, df_balanced]).sample(frac=1).reset_index(drop=True)
train_dataset = TextDataset(df_combined, max_len=512)
test_dataset = TextDataset(df_test, max_len=512)

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 3.Model Define

In [15]:
# Basic Transformer implementation
'''
vocab_size=17119+1
emb_dim=128 -> dimensions for word token embeddings
n_heads=4 -> lightweight with good performance
n_layers=2 -> 2 layers of transformer encoders
max_len=512 -> can cover 93.8% of the samples
'''

import torch
import torch.nn as nn

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size=17120, emb_dim=128, n_heads=4, n_layers=2, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.pos_encoding = self._positional_encoding(max_len, emb_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        self.fc = nn.Linear(emb_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def _positional_encoding(self, max_len, d_model):
        pos = torch.arange(0, max_len).unsqueeze(1)
        i = torch.arange(0, d_model, 2)
        angle_rates = 1 / torch.pow(10000, (i.float() / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(pos * angle_rates)
        pe[:, 1::2] = torch.cos(pos * angle_rates)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, x):
        # x: [batch_size, seq_len]
        mask = (x == 0)
        emb = self.embedding(x) + self.pos_encoding[:, :x.size(1)].to(x.device)
        out = self.transformer(emb, src_key_padding_mask=mask)
        out = out.mean(dim=1) 
        out = self.sigmoid(self.fc(out)).squeeze(-1)
        return out


In [None]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(vocab_size=17120).to(device)
criterion = nn.BCELoss()  # already undersampling, no need for pos_weight
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train
for epoch in range(5):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)  # Output shape: [batch_size]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}")

        # Update tqdm progress bar with current loss
        progress_bar.set_postfix(loss=loss.item())
    
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1:   0%|▎                                                                                                                  | 1/329 [00:05<31:26,  5.75s/it, loss=0.721]

Epoch 1, Avg Loss: 0.0022


Epoch 1:   1%|▋                                                                                                                  | 2/329 [00:10<27:29,  5.04s/it, loss=0.728]

Epoch 1, Avg Loss: 0.0044


Epoch 1:   1%|█                                                                                                                  | 3/329 [00:16<31:27,  5.79s/it, loss=0.683]

Epoch 1, Avg Loss: 0.0065


Epoch 1:   1%|█▍                                                                                                                 | 4/329 [00:24<34:24,  6.35s/it, loss=0.704]

Epoch 1, Avg Loss: 0.0086


Epoch 1:   2%|█▋                                                                                                                 | 5/329 [00:29<32:26,  6.01s/it, loss=0.685]

Epoch 1, Avg Loss: 0.0107


Epoch 1:   2%|██                                                                                                                 | 6/329 [00:34<30:19,  5.63s/it, loss=0.685]

Epoch 1, Avg Loss: 0.0128


Epoch 1:   2%|██▍                                                                                                                | 7/329 [00:39<28:36,  5.33s/it, loss=0.687]

Epoch 1, Avg Loss: 0.0149


Epoch 1:   2%|██▊                                                                                                                | 8/329 [00:43<27:04,  5.06s/it, loss=0.707]

Epoch 1, Avg Loss: 0.0170


Epoch 1:   3%|███▏                                                                                                               | 9/329 [00:48<26:32,  4.98s/it, loss=0.698]

Epoch 1, Avg Loss: 0.0191


Epoch 1:   3%|███▍                                                                                                              | 10/329 [00:57<32:51,  6.18s/it, loss=0.696]

Epoch 1, Avg Loss: 0.0213


Epoch 1:   3%|███▊                                                                                                               | 11/329 [01:04<33:43,  6.36s/it, loss=0.69]

Epoch 1, Avg Loss: 0.0234


Epoch 1:   4%|████▏                                                                                                             | 12/329 [01:11<35:19,  6.69s/it, loss=0.674]

Epoch 1, Avg Loss: 0.0254


Epoch 1:   4%|████▌                                                                                                             | 13/329 [01:22<42:34,  8.08s/it, loss=0.671]

Epoch 1, Avg Loss: 0.0274


Epoch 1:   4%|████▊                                                                                                             | 14/329 [01:30<41:19,  7.87s/it, loss=0.635]

Epoch 1, Avg Loss: 0.0294


Epoch 1:   5%|█████▏                                                                                                            | 15/329 [01:37<40:02,  7.65s/it, loss=0.745]

Epoch 1, Avg Loss: 0.0316


Epoch 1:   5%|█████▌                                                                                                            | 16/329 [01:43<38:00,  7.28s/it, loss=0.712]

Epoch 1, Avg Loss: 0.0338


Epoch 1:   5%|█████▉                                                                                                            | 17/329 [01:49<35:30,  6.83s/it, loss=0.769]

Epoch 1, Avg Loss: 0.0361


Epoch 1:   5%|██████▏                                                                                                           | 18/329 [01:56<35:25,  6.83s/it, loss=0.711]

Epoch 1, Avg Loss: 0.0383


Epoch 1:   6%|██████▌                                                                                                           | 19/329 [02:02<33:47,  6.54s/it, loss=0.653]

Epoch 1, Avg Loss: 0.0403


Epoch 1:   6%|██████▉                                                                                                            | 20/329 [02:09<34:21,  6.67s/it, loss=0.67]

Epoch 1, Avg Loss: 0.0423


Epoch 1:   6%|███████▎                                                                                                          | 21/329 [02:15<33:29,  6.52s/it, loss=0.716]

Epoch 1, Avg Loss: 0.0445


Epoch 1:   7%|███████▌                                                                                                          | 22/329 [02:25<38:22,  7.50s/it, loss=0.734]

Epoch 1, Avg Loss: 0.0467


Epoch 1:   7%|███████▉                                                                                                          | 23/329 [02:32<38:13,  7.49s/it, loss=0.687]

Epoch 1, Avg Loss: 0.0488


Epoch 1:   7%|████████▎                                                                                                         | 24/329 [02:39<37:10,  7.31s/it, loss=0.666]

Epoch 1, Avg Loss: 0.0508


Epoch 1:   8%|████████▋                                                                                                         | 25/329 [02:45<35:12,  6.95s/it, loss=0.655]

Epoch 1, Avg Loss: 0.0528


Epoch 1:   8%|█████████                                                                                                         | 26/329 [02:52<34:55,  6.91s/it, loss=0.638]

Epoch 1, Avg Loss: 0.0548


Epoch 1:   8%|█████████▎                                                                                                        | 27/329 [02:57<32:29,  6.45s/it, loss=0.599]

Epoch 1, Avg Loss: 0.0566


Epoch 1:   9%|█████████▋                                                                                                        | 28/329 [03:03<30:57,  6.17s/it, loss=0.643]

Epoch 1, Avg Loss: 0.0585


Epoch 1:   9%|██████████                                                                                                        | 29/329 [03:10<32:29,  6.50s/it, loss=0.579]

Epoch 1, Avg Loss: 0.0603


Epoch 1:   9%|██████████▍                                                                                                       | 30/329 [03:16<31:58,  6.42s/it, loss=0.694]

Epoch 1, Avg Loss: 0.0624


Epoch 1:   9%|██████████▋                                                                                                       | 31/329 [03:22<30:20,  6.11s/it, loss=0.782]

Epoch 1, Avg Loss: 0.0648


Epoch 1:  10%|███████████                                                                                                       | 32/329 [03:27<28:49,  5.82s/it, loss=0.728]

Epoch 1, Avg Loss: 0.0670


Epoch 1:  10%|███████████▍                                                                                                      | 33/329 [03:34<29:51,  6.05s/it, loss=0.637]

Epoch 1, Avg Loss: 0.0689


Epoch 1:  10%|███████████▊                                                                                                      | 34/329 [03:38<28:01,  5.70s/it, loss=0.613]

Epoch 1, Avg Loss: 0.0708


Epoch 1:  11%|████████████▏                                                                                                     | 35/329 [03:43<26:35,  5.43s/it, loss=0.695]

Epoch 1, Avg Loss: 0.0729


Epoch 1:  11%|████████████▌                                                                                                      | 36/329 [03:48<25:40,  5.26s/it, loss=0.64]

Epoch 1, Avg Loss: 0.0749


Epoch 1:  11%|████████████▊                                                                                                     | 37/329 [03:53<25:01,  5.14s/it, loss=0.596]

Epoch 1, Avg Loss: 0.0767


Epoch 1:  12%|█████████████▏                                                                                                    | 38/329 [04:04<34:10,  7.05s/it, loss=0.658]

Epoch 1, Avg Loss: 0.0787


Epoch 1:  12%|█████████████▌                                                                                                    | 39/329 [04:11<33:34,  6.95s/it, loss=0.592]

Epoch 1, Avg Loss: 0.0805


Epoch 1:  12%|█████████████▉                                                                                                     | 40/329 [04:19<35:05,  7.29s/it, loss=0.76]

Epoch 1, Avg Loss: 0.0828


Epoch 1:  12%|██████████████▏                                                                                                   | 41/329 [04:28<37:03,  7.72s/it, loss=0.693]

Epoch 1, Avg Loss: 0.0849


Epoch 1:  13%|██████████████▊                                                                                                     | 42/329 [04:36<37:17,  7.80s/it, loss=0.7]

Epoch 1, Avg Loss: 0.0870


Epoch 1:  13%|██████████████▉                                                                                                   | 43/329 [04:43<35:59,  7.55s/it, loss=0.563]

Epoch 1, Avg Loss: 0.0887


Epoch 1:  13%|███████████████▏                                                                                                  | 44/329 [04:50<35:09,  7.40s/it, loss=0.612]

Epoch 1, Avg Loss: 0.0906


Epoch 1:  14%|███████████████▋                                                                                                   | 45/329 [04:55<31:34,  6.67s/it, loss=0.61]

Epoch 1, Avg Loss: 0.0924


Epoch 1:  14%|███████████████▉                                                                                                  | 46/329 [05:00<29:38,  6.28s/it, loss=0.597]

Epoch 1, Avg Loss: 0.0943


Epoch 1:  14%|████████████████▎                                                                                                 | 47/329 [05:07<29:28,  6.27s/it, loss=0.594]

Epoch 1, Avg Loss: 0.0961


Epoch 1:  15%|████████████████▋                                                                                                 | 48/329 [05:12<27:37,  5.90s/it, loss=0.596]

Epoch 1, Avg Loss: 0.0979


Epoch 1:  15%|████████████████▉                                                                                                 | 49/329 [05:18<27:44,  5.94s/it, loss=0.578]

Epoch 1, Avg Loss: 0.0996


Epoch 1:  15%|█████████████████▎                                                                                                | 50/329 [05:26<30:49,  6.63s/it, loss=0.595]

Epoch 1, Avg Loss: 0.1014


Epoch 1:  16%|█████████████████▋                                                                                                | 51/329 [05:34<32:17,  6.97s/it, loss=0.625]

Epoch 1, Avg Loss: 0.1033


Epoch 1:  16%|██████████████████                                                                                                | 52/329 [05:40<30:55,  6.70s/it, loss=0.562]

Epoch 1, Avg Loss: 0.1050


Epoch 1:  16%|██████████████████▎                                                                                               | 53/329 [05:45<28:53,  6.28s/it, loss=0.538]

Epoch 1, Avg Loss: 0.1067


Epoch 1:  16%|██████████████████▋                                                                                               | 54/329 [05:50<26:55,  5.87s/it, loss=0.547]

Epoch 1, Avg Loss: 0.1083


Epoch 1:  17%|███████████████████                                                                                               | 55/329 [05:55<25:29,  5.58s/it, loss=0.567]

Epoch 1, Avg Loss: 0.1101


Epoch 1:  17%|███████████████████▍                                                                                              | 56/329 [06:00<24:51,  5.46s/it, loss=0.495]

Epoch 1, Avg Loss: 0.1116


Epoch 1:  17%|███████████████████▊                                                                                              | 57/329 [06:05<24:21,  5.37s/it, loss=0.512]

Epoch 1, Avg Loss: 0.1131


Epoch 1:  18%|████████████████████▎                                                                                              | 58/329 [06:10<23:44,  5.26s/it, loss=0.52]

Epoch 1, Avg Loss: 0.1147


Epoch 1:  18%|████████████████████▍                                                                                             | 59/329 [06:16<23:52,  5.31s/it, loss=0.461]

Epoch 1, Avg Loss: 0.1161


Epoch 1:  18%|████████████████████▊                                                                                             | 60/329 [06:22<25:52,  5.77s/it, loss=0.559]

Epoch 1, Avg Loss: 0.1178


Epoch 1:  19%|█████████████████████▏                                                                                            | 61/329 [06:28<24:55,  5.58s/it, loss=0.522]

Epoch 1, Avg Loss: 0.1194


Epoch 1:  19%|█████████████████████▍                                                                                            | 62/329 [06:32<23:54,  5.37s/it, loss=0.488]

Epoch 1, Avg Loss: 0.1209


Epoch 1:  19%|█████████████████████▊                                                                                            | 63/329 [06:37<23:14,  5.24s/it, loss=0.491]

Epoch 1, Avg Loss: 0.1224


Epoch 1:  19%|██████████████████████▏                                                                                           | 64/329 [06:44<24:49,  5.62s/it, loss=0.445]

Epoch 1, Avg Loss: 0.1237


Epoch 1:  20%|██████████████████████▌                                                                                           | 65/329 [06:49<23:48,  5.41s/it, loss=0.368]

Epoch 1, Avg Loss: 0.1248


Epoch 1:  20%|███████████████████████                                                                                            | 66/329 [06:54<23:02,  5.26s/it, loss=0.46]

Epoch 1, Avg Loss: 0.1262


Epoch 1:  20%|███████████████████████▏                                                                                          | 67/329 [06:59<22:29,  5.15s/it, loss=0.385]

Epoch 1, Avg Loss: 0.1274


Epoch 1:  21%|███████████████████████▌                                                                                          | 68/329 [07:04<22:09,  5.09s/it, loss=0.381]

Epoch 1, Avg Loss: 0.1286


Epoch 1:  21%|███████████████████████▉                                                                                          | 69/329 [07:08<21:49,  5.04s/it, loss=0.387]

Epoch 1, Avg Loss: 0.1298


Epoch 1:  21%|████████████████████████▎                                                                                         | 70/329 [07:13<21:37,  5.01s/it, loss=0.381]

Epoch 1, Avg Loss: 0.1309


Epoch 1:  22%|████████████████████████▌                                                                                         | 71/329 [07:18<21:25,  4.98s/it, loss=0.346]

Epoch 1, Avg Loss: 0.1320


Epoch 1:  22%|████████████████████████▉                                                                                         | 72/329 [07:25<22:59,  5.37s/it, loss=0.301]

Epoch 1, Avg Loss: 0.1329


Epoch 1:  22%|█████████████████████████▎                                                                                        | 73/329 [07:29<22:12,  5.21s/it, loss=0.346]

Epoch 1, Avg Loss: 0.1339


Epoch 1:  22%|█████████████████████████▋                                                                                        | 74/329 [07:34<21:32,  5.07s/it, loss=0.252]

Epoch 1, Avg Loss: 0.1347


Epoch 1:  23%|█████████████████████████▉                                                                                        | 75/329 [07:39<21:17,  5.03s/it, loss=0.368]

Epoch 1, Avg Loss: 0.1358


Epoch 1:  23%|██████████████████████████▎                                                                                       | 76/329 [07:44<21:08,  5.01s/it, loss=0.267]

Epoch 1, Avg Loss: 0.1366


Epoch 1:  23%|██████████████████████████▉                                                                                        | 77/329 [07:49<20:30,  4.88s/it, loss=0.24]

Epoch 1, Avg Loss: 0.1374


Epoch 1:  24%|███████████████████████████                                                                                       | 78/329 [07:55<22:31,  5.39s/it, loss=0.238]

Epoch 1, Avg Loss: 0.1381


Epoch 1:  24%|███████████████████████████▎                                                                                      | 79/329 [08:00<21:24,  5.14s/it, loss=0.263]

Epoch 1, Avg Loss: 0.1389


Epoch 1:  24%|███████████████████████████▋                                                                                      | 80/329 [08:05<21:00,  5.06s/it, loss=0.181]

Epoch 1, Avg Loss: 0.1394


Epoch 1:  25%|████████████████████████████                                                                                      | 81/329 [08:11<22:15,  5.38s/it, loss=0.157]

Epoch 1, Avg Loss: 0.1399


Epoch 1:  25%|████████████████████████████▍                                                                                     | 82/329 [08:19<25:37,  6.22s/it, loss=0.205]

Epoch 1, Avg Loss: 0.1405


Epoch 1:  25%|████████████████████████████▊                                                                                     | 83/329 [08:26<26:29,  6.46s/it, loss=0.152]

Epoch 1, Avg Loss: 0.1410


Epoch 1:  26%|█████████████████████████████                                                                                     | 84/329 [08:32<26:12,  6.42s/it, loss=0.182]

Epoch 1, Avg Loss: 0.1415


Epoch 1:  26%|█████████████████████████████▋                                                                                     | 85/329 [08:39<25:58,  6.39s/it, loss=0.17]

Epoch 1, Avg Loss: 0.1421


Epoch 1:  26%|█████████████████████████████▊                                                                                    | 86/329 [08:46<26:35,  6.57s/it, loss=0.177]

Epoch 1, Avg Loss: 0.1426


Epoch 1:  26%|██████████████████████████████▏                                                                                   | 87/329 [08:51<25:10,  6.24s/it, loss=0.169]

Epoch 1, Avg Loss: 0.1431


Epoch 1:  27%|██████████████████████████████▏                                                                                  | 88/329 [08:57<24:15,  6.04s/it, loss=0.0822]

Epoch 1, Avg Loss: 0.1434


Epoch 1:  27%|██████████████████████████████▊                                                                                   | 89/329 [09:05<26:54,  6.73s/it, loss=0.101]

Epoch 1, Avg Loss: 0.1437


Epoch 1:  27%|███████████████████████████████▏                                                                                  | 90/329 [09:12<27:05,  6.80s/it, loss=0.358]

Epoch 1, Avg Loss: 0.1448


Epoch 1:  28%|███████████████████████████████▌                                                                                  | 91/329 [09:17<25:16,  6.37s/it, loss=0.158]

Epoch 1, Avg Loss: 0.1452


Epoch 1:  28%|████████████████████████████████▏                                                                                  | 92/329 [09:23<23:54,  6.05s/it, loss=0.22]

Epoch 1, Avg Loss: 0.1459


Epoch 1:  28%|████████████████████████████████▏                                                                                 | 93/329 [09:28<22:23,  5.69s/it, loss=0.147]

Epoch 1, Avg Loss: 0.1464


Epoch 1:  29%|████████████████████████████████▌                                                                                 | 94/329 [09:32<21:17,  5.44s/it, loss=0.187]

Epoch 1, Avg Loss: 0.1469


In [None]:
model.eval()
predictions = []
with torch.no_grad():
    for inputs in DataLoader(test_dataset, batch_size=32):
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = (outputs > 0.5).long()
        predictions.extend(preds.cpu().tolist())

In [None]:
# Output prediction
df_test['label'] = predictions

df_submission = df_test[['id', 'label']]

df_submission.to_csv("submission_trans_v2.csv", index=False)

print(df_submission.shape)          # (4000, 2)
print(df_submission.head())        
print(df_submission['label'].value_counts())  # check distribution