In [1]:
!pip uninstall torchtext -y
!pip uninstall torch -y
!pip install torch==2.2.0 torchtext==0.17.0

[0mFound existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12

In [2]:
import re
import string
import zipfile

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

In [4]:
# Read the scraper export and expose its `text` column under the name
# `comment_text`, which is the default column the prediction pipeline reads.
test_df = pd.read_csv(
    "/kaggle/input/rahul2nd/dataset_instagram-scraper_2025-03-30_09-30-50-381.csv"
).rename(columns={'text': 'comment_text'})

In [14]:
# Load the serialized vocabulary object and look up the id of its padding token.
# NOTE: torch.load unpickles the file, so only load artifacts from a trusted source.
vocab = torch.load(
    "/kaggle/input/vocab-file/vocab.pth"
)
PAD_IDX = vocab["<pad>"]

In [15]:
class Improved_BI_LSTM_GloVe(nn.Module):
    """Two-layer bidirectional LSTM classifier with attention pooling.

    Pipeline: embedding -> dropout -> stacked BiLSTM -> learned softmax
    weighting over time steps -> batch norm -> two-layer MLP head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Hidden size of each LSTM direction.
        pad_idx: Token id used for padding (passed as ``padding_idx``).
        output_dim: Number of output units produced by the head.

    The head ends in a plain ``nn.Linear``: ``forward`` returns raw scores,
    no sigmoid/softmax is applied here.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx, output_dim):
        super().__init__()
        # Width of the concatenated forward + backward LSTM states.
        bi_dim = hidden_dim * 2

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        # Stacked bidirectional LSTM; `dropout` acts between the two layers.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=0.3,
        )

        # One scalar score per time step, later normalized into attention weights.
        self.attention = nn.Linear(bi_dim, 1)
        self.bn1 = nn.BatchNorm1d(bi_dim)

        # Classification head.
        self.fc = nn.Sequential(
            nn.Linear(bi_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, output_dim),
        )

        # Kaiming-normal weights for every linear layer (attention first, then
        # the head in order); biases keep their default initialization.
        linear_layers = [
            module for module in (self.attention, *self.fc)
            if isinstance(module, nn.Linear)
        ]
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight)

    def forward(self, text, lengths):
        """Score a padded batch of token ids.

        Args:
            text: Batch-first tensor of token ids (rows are padded sequences).
            lengths: True length of every row; moved to CPU for packing.

        Returns:
            Tensor with one row per sequence and ``output_dim`` raw scores.
        """
        # Embedding lookup followed by dropout that is active only in train mode.
        token_vectors = self.embedding(text)
        token_vectors = F.dropout(token_vectors, p=0.2, training=self.training)

        # Pack so the LSTM skips padded positions; rows need not be length-sorted.
        packed_in = nn.utils.rnn.pack_padded_sequence(
            token_vectors, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_in)
        states, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Attention pooling: softmax across the time axis, then a weighted sum.
        # NOTE(review): the softmax also runs over padded time steps, so a row's
        # weights depend on how much padding its batch carries. Left as-is here
        # to keep outputs identical for already-trained weights.
        step_scores = self.attention(states)
        step_weights = F.softmax(step_scores, dim=1)
        pooled = (step_weights * states).sum(dim=1)

        # Normalize the pooled vector and run the classification head.
        return self.fc(self.bn1(pooled))

In [17]:
import pandas as pd
import torch
import re
import string
import emoji
from nltk.corpus import stopwords
import spacy
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import numpy as np

class ToxicityClassifierPipeline:
    """Inference helper: clean raw text, tokenize it, batch it and label it.

    The pipeline wraps a model that is called as ``model(padded_ids, lengths)``
    and returns one row of six scores per sequence. Scores are thresholded and
    written to the columns listed in ``LABELS``.
    """

    # Output columns, in the order of the model's six output units.
    LABELS = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

    def __init__(self, model, vocab_path="/kaggle/input/vocab-file/vocab.pth"):
        """Move the model to the available device and load text resources.

        Args:
            model: Model to run; it is switched to eval mode.
            vocab_path: Path of the serialized vocabulary. It is read with
                ``torch.load``, which unpickles the file, so only pass
                paths to trusted artifacts.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.model.eval()
        self.tokenizer = get_tokenizer("basic_english")
        self.vocab = torch.load(vocab_path)
        self.PAD_IDX = self.vocab['<pad>']
        # Not referenced by any method of this class; kept so the attribute
        # remains available to existing callers.
        self.nlp = spacy.load("en_core_web_sm")
        self.stpwds = stopwords.words('english')

        # Define preprocessing patterns
        # Punctuation to blank out; '#', '!' and '?' are deliberately kept.
        self.punc = string.punctuation.replace('#', '').replace('!', '').replace('?', '') + "∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—"
        self.patterns = [
            r'\\[nrtbfv\\]',         # \n, \t etc
            '<.*?>',                 # HTML tags
            r'https?://\S+|www\.\S+', # Links
            r'\ufeff',               # BOM characters
            r'^[^a-zA-Z0-9]+$',      # Non-alphanumeric tokens
            r'ｗｗｗ．\S+',            # Full-width URLs
            r'[\uf700-\uf7ff]',      # Unicode private-use chars
            r'^[－—…]+$',            # Special punctuation
            r'[︵︶]'                # CJK parentheses
        ]

        # Chat words mapping (truncated for brevity). Keys are UPPERCASE.
        self.chat_words = {
            "AFAIK": "As Far As I Know",
            "AFK": "Away From Keyboard",
            # ... include all your chat words mapping
        }

        self.time_zone_abbreviations = [
            "UTC", "GMT", "EST", "CST", "PST", "MST",
            "EDT", "CDT", "PDT", "MDT", "CET", "EET",
            "WET", "AEST", "ACST", "AWST", "HST",
            "AKST", "IST", "JST", "KST", "NZST"
        ]

        # Derived lookups, built once instead of once per text / per word.
        # They are snapshots of `punc`, `stpwds` and `time_zone_abbreviations`
        # as of construction time.
        self._punc_table = str.maketrans(self.punc, ' ' * len(self.punc))
        self._drop_words = frozenset(self.time_zone_abbreviations) | frozenset(self.stpwds)

    def preprocess_text(self, text):
        """Apply all preprocessing steps to a single text.

        Steps, in order: regex clean-up, punctuation blanking, removal of
        time-zone abbreviations and stop words (case-sensitive match),
        chat-word expansion, lowercasing, emoji demojizing and whitespace
        collapsing.

        Args:
            text: Raw text. Non-strings and blank strings yield "".

        Returns:
            The cleaned text, possibly empty.
        """
        if not isinstance(text, str) or not text.strip():
            return ""

        # Apply regex patterns
        for regex in self.patterns:
            text = re.sub(regex, '', text)

        # Replace punctuation with spaces
        text = text.translate(self._punc_table)

        # Remove time zones and stopwords
        kept_words = [word for word in text.split() if word not in self._drop_words]

        # Expand chat words. The mapping keys are uppercase, so the lookup must
        # upper-case the word (a lower-cased lookup can never match).
        expanded_words = [self.chat_words.get(word.upper(), word) for word in kept_words]

        # Lowercase and emoji handling
        text = ' '.join(expanded_words).lower()
        text = emoji.demojize(text)
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize_and_numericalize(self, text, max_length=256):
        """Tokenize and convert to numerical tokens.

        Tokens of length <= 1 or >= 25 are discarded, the rest is truncated to
        ``max_length`` tokens and mapped through the vocabulary ('<unk>' for
        unknown tokens).

        Returns:
            1-D ``torch.long`` tensor; it is empty when no token survives.
        """
        if not text:  # Handle empty text
            return torch.empty(0, dtype=torch.long)

        tokens = [token for token in self.tokenizer(text) if 1 < len(token) < 25]
        tokens = tokens[:max_length]
        numericalized = [self.vocab[token] if token in self.vocab else self.vocab['<unk>']
                        for token in tokens]
        return torch.tensor(numericalized, dtype=torch.long)

    def predict_toxicity(self, df, text_column='comment_text', batch_size=64, threshold=0.5):
        """
        Predict toxicity for a DataFrame of texts

        Args:
            df: Input DataFrame containing text to classify
            text_column: Name of column containing text
            batch_size: Batch size for prediction
            threshold: Cut-off compared against the model's raw outputs; an
                output strictly greater than it is labelled 1.
                NOTE(review): no sigmoid is applied here, so if the model
                emits un-squashed scores confirm that 0.5 is the intended
                cut-off for them.

        Returns:
            Copy of ``df`` with one 0/1 column per entry of ``LABELS``. Rows
            whose text is empty after preprocessing or tokenization get 0
            in every label column.
        """
        # Work on a copy so the caller's DataFrame is left untouched.
        result_df = df.copy()

        # Rows that never reach the model keep this all-zero default.
        all_preds = np.zeros((len(df), len(self.LABELS)), dtype=int)

        # Keep the row position next to every sequence that will actually be
        # fed to the model, so each prediction is written back to the row it
        # was computed from even when some texts end up with no tokens.
        row_positions = []
        sequences = []
        for position, text in enumerate(df[text_column]):
            processed = self.preprocess_text(text)
            if not processed:
                continue
            token_ids = self.tokenize_and_numericalize(processed)
            if len(token_ids) == 0:
                continue
            row_positions.append(position)
            sequences.append(token_ids)

        with torch.no_grad():
            for start in range(0, len(sequences), batch_size):
                chunk = sequences[start:start + batch_size]
                chunk_rows = row_positions[start:start + len(chunk)]
                lengths = torch.tensor([len(seq) for seq in chunk])

                # Pad to the longest sequence of this batch.
                padded = pad_sequence(chunk, batch_first=True, padding_value=self.PAD_IDX)
                outputs = self.model(padded.to(self.device), lengths.to(self.device))
                preds = (outputs > threshold).int().cpu().numpy()

                # One prediction row per input row of this batch.
                all_preds[chunk_rows] = preds

        # Add predictions to result DataFrame
        for column, label in enumerate(self.LABELS):
            result_df[label] = all_preds[:, column]

        return result_df

# Example usage:
if __name__ == "__main__":
    # Rebuild the network; these sizes have to match the checkpoint loaded
    # below, otherwise load_state_dict rejects the weights.
    final_model = Improved_BI_LSTM_GloVe(
        vocab_size=len(vocab), embed_dim=100, hidden_dim=256,
        pad_idx=PAD_IDX, output_dim=6,
    )
    # Load the trained weights onto the CPU; the pipeline moves the model to
    # its own device afterwards.
    final_model.load_state_dict(
        torch.load(
            "/kaggle/input/final/pytorch/default/1/final.pth",
            map_location=torch.device('cpu'),
        )
    )

    # Wrap the model in the preprocessing + prediction pipeline.
    pipeline = ToxicityClassifierPipeline(final_model)

    # Read the comments to score and rename `text` to the column name the
    # pipeline reads by default.
    test_df = pd.read_csv(
        "/kaggle/input/rahul2nd/dataset_instagram-scraper_2025-03-30_09-30-50-381.csv"
    ).rename(columns={'text': 'comment_text'})

    # Score every row and show the text next to its six labels.
    results = pipeline.predict_toxicity(test_df)
    print(results.loc[:, [
        'comment_text',
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
    ]])

              comment_text  toxic  severe_toxic  obscene  threat  insult  \
0    Why bro in sea always      0             0        0       0       0   
1     😂😂😂😂😂😂😂 misericórdia      0             0        0       0       0   
2                        🙌      0             0        0       0       0   
3                        🔥      0             0        0       0       0   
4               nigga what      1             1        1       0       1   
5   Vessel of NBA youngboy      0             0        0       0       0   
6                 JAJAJAJA      0             0        0       0       0   
7                      wtf      1             0        1       0       1   
8          fuck that bitch      1             1        1       0       1   
9             Is this real      0             0        0       0       0   
10           kill yourself      1             0        0       1       1   
11         @hyunmin._.x 고고      0             0        0       0       0   
12          

In [19]:
# Final table: every column of `results` except the identity_hate label.
final_ans = results.drop(columns=['identity_hate'])
final_ans

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult
0,18069096703645733,Why bro in sea always,0,0,0,0,0
1,18052302512112156,😂😂😂😂😂😂😂 misericórdia,0,0,0,0,0
2,18119491555435268,🙌,0,0,0,0,0
3,17884548831142635,🔥,0,0,0,0,0
4,18059375443875265,nigga what,1,1,1,0,1
5,18097418647542771,Vessel of NBA youngboy,0,0,0,0,0
6,18071614537687683,JAJAJAJA,0,0,0,0,0
7,17947968803927907,wtf,1,0,1,0,1
8,17916781080051191,fuck that bitch,1,1,1,0,1
9,18036427937403276,Is this real,0,0,0,0,0


In [20]:
# Write the final table without the positional index: by default to_csv adds
# it as an extra unnamed first column, which comes back as "Unnamed: 0" when
# the file is read again.
final_ans.to_csv("result.csv", index=False)