In [None]:
!pip install scikit-learn
!pip install nltk

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from multiprocessing import Pool
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Make sure the Punkt tokenizer models are present before tokenising
nltk.download('punkt')

# Step 1: Read the train / test splits from the working directory
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# Missing comments become empty strings so every 'text' entry is a str
for _frame in (data_train, data_test):
    _frame['text'] = _frame['text'].fillna('').astype(str)

# Tokenize text with multiprocessing
def tokenize(sentence):
    """Lower-case ``sentence`` and split it into tokens with NLTK's ``word_tokenize``.

    Defined at module level so it can be shipped to ``multiprocessing`` workers.
    """
    lowered = sentence.lower()
    return word_tokenize(lowered)

# Pool.imap defaults to chunksize=1, i.e. one inter-process round trip per row.
# Handing rows to the workers in larger chunks removes most of that overhead;
# imap still yields results in input order, so the token lists are unchanged.
TOKENIZE_CHUNKSIZE = 1000

with Pool() as process_pool:
    tokens_train = list(tqdm(
        process_pool.imap(tokenize, data_train['text'], chunksize=TOKENIZE_CHUNKSIZE),
        total=len(data_train),
    ))
    tokens_test = list(tqdm(
        process_pool.imap(tokenize, data_test['text'], chunksize=TOKENIZE_CHUNKSIZE),
        total=len(data_test),
    ))

# Keep the token lists next to the rows they were computed from
data_train['tokens'] = tokens_train
data_test['tokens'] = tokens_test


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 1804874/1804874 [03:45<00:00, 8013.38it/s]
100%|██████████| 97320/97320 [00:11<00:00, 8386.11it/s]


In [None]:

# Build vocabulary with unique indices.
# Count tokens one document at a time instead of first flattening the whole
# corpus into a single list: the flat list held one entry per token of the
# training set purely as a temporary. Counter keeps first-occurrence order,
# so the resulting indices are the same as with the flattened list.
word_counter = Counter()
for tokenized in tokens_train:
    word_counter.update(tokenized)

# Indices 0 and 1 are reserved for the special tokens; real words start at 2.
word_index = {word: idx for idx, word in enumerate(word_counter, start=2)}
word_index['<PAD>'] = 0
word_index['<UNK>'] = 1

# Convert tokenized text to indices
def map_to_indices(token_list):
    """Translate a list of tokens into vocabulary indices.

    Tokens missing from ``word_index`` are mapped to the ``'<UNK>'`` index.
    """
    indices = []
    for word in token_list:
        indices.append(word_index.get(word, word_index['<UNK>']))
    return indices

# Add an 'indexed' column (vocabulary ids) alongside 'tokens' in both splits
for _frame in (data_train, data_test):
    _frame['indexed'] = _frame['tokens'].apply(map_to_indices)

# Handle class imbalance by resampling the toxicity == 0 rows to the size of
# the toxicity > 0 group.
class_positive = data_train[data_train.toxicity > 0]
class_negative = data_train[data_train.toxicity == 0]

# Only draw with replacement when there are fewer negatives than positives.
# When the negative pool is the larger one (as the training log of this
# notebook indicates), sampling with replacement would duplicate some rows
# while discarding others, and the duplicates could end up on both sides of
# the later train/validation split.
needs_replacement = len(class_negative) < len(class_positive)

# NOTE: the variable keeps its original name even though it may hold a
# down-sampled (not up-sampled) set of negatives.
class_negative_upsampled = resample(
    class_negative,
    replace=needs_replacement,
    n_samples=len(class_positive),
    random_state=42
)

# Merge both groups and shuffle so the classes are interleaved
balanced_data = pd.concat([class_positive, class_negative_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:

# Dataset class for Toxicity Classification
class TextDataset(Dataset):
    """Dataset of fixed-length index sequences paired with label vectors.

    Args:
        sequences: indexable collection of token-index sequences.
        labels: indexable collection of label rows, aligned with ``sequences``.
        max_len: every returned sequence is truncated / right-padded to this length.
        pad_index: index used for padding. When ``None`` (the default) it is
            read from the notebook-level ``word_index['<PAD>']``, which is the
            original behaviour; passing it explicitly lets the class be used
            without that global.
    """

    def __init__(self, sequences, labels, max_len=100, pad_index=None):
        self.sequences = sequences
        self.labels = labels
        self.max_len = max_len
        self.pad_index = word_index['<PAD>'] if pad_index is None else pad_index

    def __len__(self):
        """Number of examples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(LongTensor[max_len], FloatTensor labels)`` for example ``idx``."""
        # Copy into a fresh list so padding never touches the stored sequence,
        # whatever container type it is.
        seq = list(self.sequences[idx][:self.max_len])
        seq.extend([self.pad_index] * (self.max_len - len(seq)))
        return torch.tensor(seq, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

# Prepare DataLoaders
# Label columns predicted jointly (one output per column)
categories = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]

# Hold out 10% of the balanced data for validation
_split = train_test_split(
    balanced_data['indexed'].tolist(),
    balanced_data[categories].values,
    test_size=0.1,
    random_state=42
)
train_features, val_features, train_labels, val_labels = _split

train_set = TextDataset(train_features, train_labels)
val_set = TextDataset(val_features, val_labels)

# The test split gets all-zero placeholder labels so it fits the same Dataset interface
_placeholder_labels = np.zeros((len(data_test), len(categories)))
test_set = TextDataset(data_test['indexed'].tolist(), _placeholder_labels)

# Only the training loader shuffles; validation / test keep row order
_loader_options = {'batch_size': 64, 'num_workers': 4}
train_loader = DataLoader(train_set, shuffle=True, **_loader_options)
val_loader = DataLoader(val_set, shuffle=False, **_loader_options)
test_loader = DataLoader(test_set, shuffle=False, **_loader_options)

In [None]:
# Define the LSTM with Attention Model
class TextClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> additive attention pooling -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimension.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout_rate: dropout applied to the pooled vector and, when
            ``layers > 1``, between LSTM layers.
        pad_index: index of the padding token. When ``None`` (the default) it
            is read from the notebook-level ``word_index['<PAD>']``, which is
            the original behaviour.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, layers=1, bidirectional=True, dropout_rate=0.5, pad_index=None):
        super(TextClassifier, self).__init__()
        self.pad_index = word_index['<PAD>'] if pad_index is None else pad_index
        feature_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=self.pad_index)
        # nn.LSTM only applies dropout between layers, so it is disabled for a single layer
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=layers, bidirectional=bidirectional, dropout=dropout_rate if layers > 1 else 0, batch_first=True)
        self.attn_layer = nn.Linear(feature_dim, 1)
        self.fc = nn.Linear(feature_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def attention(self, lstm_out, mask=None):
        """Pool ``lstm_out`` (batch, seq, features) into (batch, features).

        Args:
            lstm_out: per-timestep LSTM outputs.
            mask: optional bool tensor (batch, seq); ``False`` marks positions
                that must receive zero attention weight (padding). ``None``
                attends over every position, as the original code did.
        """
        scores = torch.tanh(self.attn_layer(lstm_out)).squeeze(2)
        if mask is not None:
            # Use the most negative finite value instead of -inf: a row that is
            # entirely padding then softmaxes to uniform weights, not NaN.
            scores = scores.masked_fill(~mask, torch.finfo(scores.dtype).min)
        weights = torch.softmax(scores, 1)
        context_vector = torch.bmm(weights.unsqueeze(1), lstm_out).squeeze(1)
        return context_vector

    def forward(self, x):
        """Map a (batch, seq) tensor of token indices to (batch, output_dim) logits."""
        # Padding positions carry no text and must not receive attention weight
        mask = x != self.pad_index
        embedded = self.embed(x)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention(lstm_out, mask)
        output = self.dropout(attn_out)
        return self.fc(output)

In [None]:
# Build the model, loss and optimiser
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch so weight initialisation, dropout and DataLoader shuffling are
# repeatable, matching the random_state=42 used for the data preparation.
torch.manual_seed(42)

model = TextClassifier(
    vocab_size=len(word_index),
    embed_dim=128,
    hidden_dim=256,
    output_dim=len(categories),
    layers=3,
    bidirectional=True,
    dropout_rate=0.5
).to(device)

# One independent sigmoid / binary cross-entropy term per category
loss_fn = nn.BCEWithLogitsLoss()
optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [None]:

epochs = 10

# Track the best validation loss so the model ends up with the weights that
# generalised best rather than whatever the final epoch produced: validation
# loss can start rising again while training loss keeps falling.
best_val_loss = float('inf')
best_epoch = 0
best_state = None

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    for seq_batch, label_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch"):
        seq_batch, label_batch = seq_batch.to(device), label_batch.to(device)
        optimiser.zero_grad()
        predictions = model(seq_batch)
        loss = loss_fn(predictions, label_batch)
        loss.backward()
        optimiser.step()
        epoch_loss += loss.item()
    print(f"Training Loss: {epoch_loss / len(train_loader):.4f}")

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    validation_loss = 0
    with torch.no_grad():
        for val_seq, val_label in val_loader:
            val_seq, val_label = val_seq.to(device), val_label.to(device)
            val_predictions = model(val_seq)
            val_loss = loss_fn(val_predictions, val_label)
            validation_loss += val_loss.item()
    mean_val_loss = validation_loss / len(val_loader)
    print(f"Validation Loss: {mean_val_loss:.4f}")

    # Snapshot the weights (cloned to CPU so later optimiser steps cannot
    # alter them) whenever validation improves.
    if mean_val_loss < best_val_loss:
        best_val_loss = mean_val_loss
        best_epoch = epoch + 1
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

# Put the best-performing weights back before the model is saved / used
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from epoch {best_epoch} (validation loss {best_val_loss:.4f})")

Epoch 1/10: 100%|██████████| 15191/15191 [19:04<00:00, 13.27batch/s]

Training Loss: 0.1549





Validation Loss: 0.1432


Epoch 2/10: 100%|██████████| 15191/15191 [18:50<00:00, 13.43batch/s]

Training Loss: 0.1416





Validation Loss: 0.1420


Epoch 3/10: 100%|██████████| 15191/15191 [18:58<00:00, 13.34batch/s]

Training Loss: 0.1388





Validation Loss: 0.1414


Epoch 4/10: 100%|██████████| 15191/15191 [18:58<00:00, 13.34batch/s]

Training Loss: 0.1366





Validation Loss: 0.1418


Epoch 5/10: 100%|██████████| 15191/15191 [18:57<00:00, 13.35batch/s]

Training Loss: 0.1344





Validation Loss: 0.1423


Epoch 6/10: 100%|██████████| 15191/15191 [18:57<00:00, 13.35batch/s]

Training Loss: 0.1324





Validation Loss: 0.1429


Epoch 7/10: 100%|██████████| 15191/15191 [19:47<00:00, 12.79batch/s]

Training Loss: 0.1306





Validation Loss: 0.1445


Epoch 8/10: 100%|██████████| 15191/15191 [19:35<00:00, 12.92batch/s]

Training Loss: 0.1291





Validation Loss: 0.1448


Epoch 9/10: 100%|██████████| 15191/15191 [19:41<00:00, 12.86batch/s]

Training Loss: 0.1278





Validation Loss: 0.1457


Epoch 10/10: 100%|██████████| 15191/15191 [14:21<00:00, 17.62batch/s]

Training Loss: 0.1267





Validation Loss: 0.1458


In [None]:
# ***Save the Model Weights***
# Save the trained model's state dictionary for future use
torch.save(model.state_dict(), 'weights.pth')
# Message fixed: the original had a stray quote and no file name
print("Model weights saved in 'weights.pth'.")

# Step 5: Generate predictions for the test dataset
model.eval()
test_predictions = []
with torch.no_grad():
    _progress = tqdm(test_loader, desc='Generating Predictions', unit='batch')
    # The second element of each batch is the label tensor; it is not needed here
    for test_batch, _ in _progress:
        test_batch = test_batch.to(device)
        logits = model(test_batch)
        # Sigmoid turns each logit into an independent per-category probability
        probabilities = torch.sigmoid(logits)
        for _row in probabilities.cpu().numpy():
            test_predictions.append(_row)

# ***Convert Probabilities to Binary Outputs***
# A category is predicted positive when its probability exceeds the threshold
binary_threshold = 0.5
test_predictions = np.array(test_predictions)
binary_outputs = np.greater(test_predictions, binary_threshold).astype(int)

# Write one row per test example: id first, then one 0/1 column per category
submission_df = pd.DataFrame(binary_outputs, columns=categories)
submission_df.insert(0, 'id', data_test['id'])
submission_df.to_csv('best_pp15.csv', index=False)
print("Predictions on test data saved in 'best_pp15.csv'")


Model weights saved'.


Generating Predictions: 100%|██████████| 1521/1521 [00:18<00:00, 82.69batch/s]


Predictions on test data saved in 'best_pp15.csv'
