        self.categories = [
            "Any Other Cyber Crime",
            "Crime Against Women & Children",
            "Cryptocurrency Crime",
            "Cyber Attack/ Dependent Crimes",
            "Cyber Terrorism",
            "Hacking Damage to computercomputer system etc",
            "Online Cyber Trafficking",
            "Online Financial Fraud",
            "Online Gambling Betting",
            "Online and Social Media Related Crime",
            "Ransomware",
            "Report Unlawful Content"
        ]
        self.subcategories_map = {
            "Any Other Cyber Crime": ["Other"],
            "Crime Against Women & Children": [
                "Computer Generated CSAM/CSEM",
                "Cyber Blackmailing & Threatening",
                "Sexual Harassment"
            ],
            "Cryptocurrency Crime": ["Cryptocurrency Fraud"],
            "Cyber Attack/ Dependent Crimes": [
                "Data Breach/Theft",
                "Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks",
                "Hacking/Defacement",
                "Malware Attack",
                "Ransomware Attack", 
                "SQL Injection",
                "Tampering with computer source documents"
            ],
            "Cyber Terrorism": ["Cyber Terrorism"],
            "Hacking Damage to computercomputer system etc": [
                "Damage to computer computer systems etc",
                "Email Hacking",
                "Tampering with computer source documents",
                "Unauthorised AccessData Breach",
                "Website DefacementHacking"
            ],
            "Online Cyber Trafficking": ["Online Trafficking"],
            "Online Financial Fraud": [
                "Business Email CompromiseEmail Takeover",
                "DebitCredit Card FraudSim Swap Fraud",
                "DematDepository Fraud",
                "EWallet Related Fraud",
                "Fraud CallVishing",
                "Internet Banking Related Fraud",
                "UPI Related Frauds"
            ],
            "Online Gambling Betting": ["Online Gambling Betting"],
            "Online and Social Media Related Crime": [
                "Cheating by Impersonation",
                "Cyber Bullying Stalking Sexting",
                "EMail Phishing",
                "FakeImpersonating Profile",
                "Impersonating Email",
                "Intimidating Email",
                "Online Job Fraud",
                "Online Matrimonial Fraud",
                "Profile Hacking Identity Theft",
                "Provocative Speech for unlawful acts"
            ],
            "Ransomware": ["Ransomware"],
            "Report Unlawful Content": ["Against Interest of sovereignty or integrity of India"]
        

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

class CybercrimeDataset(Dataset):
    """Torch dataset pairing each text with a category and a sub-category label row.

    Texts are tokenized lazily, one at a time, when an item is requested.
    """

    def __init__(self, texts, category_labels, subcategory_labels, tokenizer, max_length=256):
        """Store the inputs without processing them.

        Args:
            texts: Indexable collection of texts; its ``len()`` is the dataset size.
            category_labels: Indexable collection of per-sample category label rows.
            subcategory_labels: Indexable collection of per-sample sub-category label rows.
            tokenizer: Callable invoked with the text and padding/truncation keyword
                arguments; its result is indexed by ``'input_ids'`` and
                ``'attention_mask'``.
            max_length: Forwarded to the tokenizer as ``max_length``.
        """
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length
        self.category_labels = category_labels
        self.subcategory_labels = subcategory_labels

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at *idx* and return it with its labels as float tensors."""
        # Coerce to str and collapse every run of whitespace to a single space.
        normalized = ' '.join(str(self.texts[idx]).split())

        tokens = self.tokenizer(
            normalized,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D before being handed out.
        item = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['category_labels'] = torch.FloatTensor(self.category_labels[idx])
        item['subcategory_labels'] = torch.FloatTensor(self.subcategory_labels[idx])
        return item

def preprocess_text(text):
    """Return *text* coerced to ``str``, lower-cased, with whitespace runs collapsed.

    Leading and trailing whitespace is removed as a consequence of the
    split/join round trip.
    """
    return ' '.join(str(text).lower().split())

def balance_dataset(df):
    """Rebalance *df* per category by repeating rare classes and down-sampling large ones.

    Each distinct value of ``df['category']`` is handled in order of first
    appearance:

    * categories with fewer than two rows, or listed in
      ``severely_undersampled``, are repeated three times;
    * categories listed in ``majority_classes`` are randomly down-sampled
      (``random_state=42``) to the median per-category row count of *df*, or
      kept whole when they are already smaller than that;
    * every other category is passed through unchanged.

    The distinct categories are printed before processing.

    Args:
        df: DataFrame with a ``category`` column. It is not modified.

    Returns:
        A new DataFrame with the per-category results concatenated and a
        fresh ``0..n-1`` index.
    """
    majority_classes = {
        'Online Financial Fraud', 'Any Other Cyber Crime', 'Crime Against Women & Children',
        'Cyber Attack/ Dependent Crimes', 'Online and Social Media Related Crime',
        'Hacking  Damage to computercomputer system etc', 'Cyber Stalking',
        'Cryptocurrency Crime',
    }
    severely_undersampled = {
        'Cyber Terrorism',
        'Online Cyber Trafficking',
        'Ransomware',
        'Report Unlawful Content',
    }

    # The median class size does not depend on the category being processed,
    # so compute it once instead of once per majority category.
    category_sizes = df['category'].value_counts()
    target_size = int(category_sizes.median()) if len(category_sizes) else 0

    print("Available categories:", df['category'].unique())

    balanced_dfs = []
    for category in df['category'].unique():
        # Bound before the try so the fallback below always has a valid frame.
        category_df = df[df['category'] == category]
        try:
            if len(category_df) < 2 or category in severely_undersampled:
                # Single-row and severely under-sampled categories are tripled.
                balanced_df = pd.concat([category_df] * 3)
            elif category in majority_classes:
                # Cap majority classes at the median class size.
                balanced_df = category_df.sample(
                    n=min(len(category_df), target_size), random_state=42
                )
            else:
                balanced_df = category_df
        except Exception as e:
            # Best effort: keep the category unbalanced rather than lose it.
            print(f"Warning: Category '{category}' skipped due to: {str(e)}")
            balanced_df = category_df
        balanced_dfs.append(balanced_df)

    return pd.concat(balanced_dfs, ignore_index=True)

def create_dataloaders(df, tokenizer, batch_size=16, max_length=256,
                       output_path='balancedtrain.csv'):
    """Balance *df*, save it to CSV, and wrap it in a shuffled ``DataLoader``.

    The text column is lower-cased and stripped, the frame is rebalanced with
    ``balance_dataset``, written to *output_path*, and its ``category`` and
    ``sub_category`` columns are one-hot encoded with two
    ``MultiLabelBinarizer`` instances.

    Args:
        df: DataFrame with ``crimeaditionalinfo``, ``category`` and
            ``sub_category`` columns. It is not modified.
        tokenizer: Tokenizer handed to ``CybercrimeDataset``.
        batch_size: Batch size of the returned loader.
        max_length: Token length handed to ``CybercrimeDataset``.
        output_path: Destination of the balanced CSV.

    Returns:
        Tuple ``(dataloader, mlb_category, mlb_subcategory)`` where the two
        binarizers are already fitted on the balanced labels.
    """
    # Normalise on a derived frame so the caller's DataFrame is left untouched.
    cleaned_df = df.assign(
        crimeaditionalinfo=df['crimeaditionalinfo'].apply(lambda x: str(x).lower().strip())
    )

    balanced_df = balance_dataset(cleaned_df)

    # Persist the balanced frame and report its size and class distribution.
    balanced_df.to_csv(output_path, index=False)
    print(f"Balanced dataset saved to '{output_path}' with {len(balanced_df)} samples")
    print("Category distribution in balanced dataset:")
    print(balanced_df['category'].value_counts())

    # Each row carries exactly one label, wrapped in a list because the
    # binarizer expects an iterable of labels per sample.
    mlb_category = MultiLabelBinarizer()
    mlb_subcategory = MultiLabelBinarizer()
    category_encoded = mlb_category.fit_transform(
        balanced_df['category'].apply(lambda x: [x])
    )
    subcategory_encoded = mlb_subcategory.fit_transform(
        balanced_df['sub_category'].apply(lambda x: [x])
    )

    dataset = CybercrimeDataset(
        balanced_df['crimeaditionalinfo'].values,
        category_encoded,
        subcategory_encoded,
        tokenizer,
        max_length=max_length
    )

    # Plain shuffled loader: class balance comes from the resampled frame
    # above, not from a weighted sampler.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    return dataloader, mlb_category, mlb_subcategory

# Usage example: read the input CSV, fetch the tokenizer, build the loader.

df = pd.read_csv('output_no_duplicates.csv')

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# The two fitted label binarizers are returned alongside the loader.
train_dataloader, mlb_category, mlb_subcategory = create_dataloaders(
    df=df, tokenizer=tokenizer, batch_size=16
)


Available categories: ['Online Financial Fraud' 'Crime Against Women & Children'
 'Hacking  Damage to computercomputer system etc'
 'Online and Social Media Related Crime' 'Any Other Cyber Crime'
 'Cyber Attack/ Dependent Crimes' 'Cryptocurrency Crime'
 'Online Cyber Trafficking' 'Cyber Terrorism' 'Online Gambling  Betting'
 'Ransomware' 'Report Unlawful Content']
Balanced dataset saved to 'balancedtrain.csv' with 8525 samples
Category distribution in balanced dataset:
category
Online Financial Fraud                            1072
Crime Against Women & Children                    1072
Hacking  Damage to computercomputer system etc    1072
Online and Social Media Related Crime             1072
Any Other Cyber Crime                             1072
Cyber Attack/ Dependent Crimes                    1072
Online Cyber Trafficking                           540
Cyber Terrorism                                    480
Cryptocurrency Crime                               466
Online Gambling  Betti

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

class CybercrimeDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and attaches its two label rows."""

    def __init__(self, texts, category_labels, subcategory_labels, tokenizer, max_length=256):
        """Keep references to the inputs; nothing is tokenized until an item is requested.

        Args:
            texts: Indexable collection of texts.
            category_labels: Indexable collection of category label rows.
            subcategory_labels: Indexable collection of sub-category label rows.
            tokenizer: Callable applied to each text in ``__getitem__``.
            max_length: Passed to the tokenizer as ``max_length``.
        """
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.subcategory_labels = subcategory_labels
        self.category_labels = category_labels
        self.texts = texts

    def __len__(self):
        """Return how many texts the dataset holds."""
        sample_count = len(self.texts)
        return sample_count

    def __getitem__(self, idx):
        """Return the encoded text at *idx* together with both label rows as float tensors."""
        # str() coercion followed by split/join squeezes repeated whitespace.
        words = str(self.texts[idx]).split()

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(' '.join(words), **tokenizer_kwargs)

        input_ids = encoded['input_ids'].flatten()
        attention_mask = encoded['attention_mask'].flatten()
        category_row = torch.FloatTensor(self.category_labels[idx])
        subcategory_row = torch.FloatTensor(self.subcategory_labels[idx])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'category_labels': category_row,
            'subcategory_labels': subcategory_row,
        }

def preprocess_text(text):
    """Lower-case the ``str()`` form of *text* and squeeze whitespace to single spaces."""
    lowered = str(text).lower()
    return ' '.join(lowered.split())

def balance_dataset(df):
    """Rebalance *df* per category with fixed replication factors and a majority cap.

    Each distinct value of ``df['category']`` is handled in order of first
    appearance:

    * categories listed in ``multiplication_factors`` are replicated that many
      times, each copy getting ``" variation_<k>"`` appended to its
      ``crimeaditionalinfo`` text so the copies are not exact duplicates;
    * categories listed in ``majority_classes`` are randomly down-sampled
      (``random_state=42``) to at most ``target_size`` rows;
    * every other category is passed through unchanged.

    The distinct categories and the final distribution are printed.

    Args:
        df: DataFrame with ``category`` and ``crimeaditionalinfo`` columns.
            It is not modified.

    Returns:
        A new DataFrame with the per-category results concatenated and a
        fresh ``0..n-1`` index.
    """
    majority_classes = {
        'Online Financial Fraud', 'Any Other Cyber Crime', 'Crime Against Women & Children',
        'Cyber Attack/ Dependent Crimes', 'Online and Social Media Related Crime',
        'Hacking  Damage to computercomputer system etc', 'Cyber Stalking',
        'Cryptocurrency Crime',
    }

    # Replication factor for each severely under-sampled category.
    multiplication_factors = {
        'Report Unlawful Content': 350,
        'Ransomware': 8,
        'Cyber Terrorism': 3,
        'Online Cyber Trafficking': 3,
    }

    # Row cap applied to majority classes.
    target_size = 1072

    print("Available categories:", df['category'].unique())

    balanced_dfs = []
    for category in df['category'].unique():
        # Bound before the try so the fallback below always has a valid frame.
        category_df = df[df['category'] == category]
        try:
            if category in multiplication_factors:
                factor = multiplication_factors[category]
                # Format with an f-string rather than ``+`` so non-string text
                # (e.g. NaN) is coerced instead of raising TypeError, which
                # previously left the whole category un-augmented.
                balanced_df = pd.concat([
                    category_df.assign(
                        crimeaditionalinfo=category_df['crimeaditionalinfo'].map(
                            lambda text, tag=copy_idx: f"{text} variation_{tag}"
                        )
                    )
                    for copy_idx in range(factor)
                ])
            elif category in majority_classes:
                balanced_df = category_df.sample(
                    n=min(len(category_df), target_size), random_state=42
                )
            else:
                balanced_df = category_df
        except Exception as e:
            # Best effort: keep the category unbalanced rather than lose it.
            print(f"Warning: Category '{category}' skipped due to: {str(e)}")
            balanced_df = category_df
        balanced_dfs.append(balanced_df)

    final_df = pd.concat(balanced_dfs, ignore_index=True)

    print("\nFinal category distribution:")
    print(final_df['category'].value_counts())

    return final_df

def create_dataloaders(df, tokenizer, batch_size=16, max_length=256,
                       output_path='balancedtrain.csv'):
    """Balance *df*, save it to CSV, and wrap it in a shuffled ``DataLoader``.

    The text column is lower-cased and stripped, the frame is rebalanced with
    ``balance_dataset``, written to *output_path*, and its ``category`` and
    ``sub_category`` columns are one-hot encoded with two
    ``MultiLabelBinarizer`` instances.

    Args:
        df: DataFrame with ``crimeaditionalinfo``, ``category`` and
            ``sub_category`` columns. It is not modified.
        tokenizer: Tokenizer handed to ``CybercrimeDataset``.
        batch_size: Batch size of the returned loader.
        max_length: Token length handed to ``CybercrimeDataset``.
        output_path: Destination of the balanced CSV.

    Returns:
        Tuple ``(dataloader, mlb_category, mlb_subcategory)`` where the two
        binarizers are already fitted on the balanced labels.
    """
    # Normalise on a derived frame so the caller's DataFrame is left untouched.
    cleaned_df = df.assign(
        crimeaditionalinfo=df['crimeaditionalinfo'].apply(lambda x: str(x).lower().strip())
    )

    balanced_df = balance_dataset(cleaned_df)

    # Persist the balanced frame and report its size and class distribution.
    balanced_df.to_csv(output_path, index=False)
    print(f"Balanced dataset saved to '{output_path}' with {len(balanced_df)} samples")
    print("Category distribution in balanced dataset:")
    print(balanced_df['category'].value_counts())

    # Each row carries exactly one label, wrapped in a list because the
    # binarizer expects an iterable of labels per sample.
    mlb_category = MultiLabelBinarizer()
    mlb_subcategory = MultiLabelBinarizer()
    category_encoded = mlb_category.fit_transform(
        balanced_df['category'].apply(lambda x: [x])
    )
    subcategory_encoded = mlb_subcategory.fit_transform(
        balanced_df['sub_category'].apply(lambda x: [x])
    )

    dataset = CybercrimeDataset(
        balanced_df['crimeaditionalinfo'].values,
        category_encoded,
        subcategory_encoded,
        tokenizer,
        max_length=max_length
    )

    # Plain shuffled loader: class balance comes from the resampled frame
    # above, not from a weighted sampler.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    return dataloader, mlb_category, mlb_subcategory

# Usage example:
if __name__ == "__main__":
    # Read the input CSV into a DataFrame.
    df = pd.read_csv('output_no_duplicates.csv')

    # Fetch the tokenizer by its pretrained name.
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Build the loader; the two fitted label binarizers come back with it.
    train_dataloader, mlb_category, mlb_subcategory = create_dataloaders(
        df=df, tokenizer=tokenizer, batch_size=16
    )

Available categories: ['Online Financial Fraud' 'Crime Against Women & Children'
 'Hacking  Damage to computercomputer system etc'
 'Online and Social Media Related Crime' 'Any Other Cyber Crime'
 'Cyber Attack/ Dependent Crimes' 'Cryptocurrency Crime'
 'Online Cyber Trafficking' 'Cyber Terrorism' 'Online Gambling  Betting'
 'Ransomware' 'Report Unlawful Content']

Final category distribution:
category
Online Financial Fraud                            1072
Crime Against Women & Children                    1072
Hacking  Damage to computercomputer system etc    1072
Online and Social Media Related Crime             1072
Any Other Cyber Crime                             1072
Cyber Attack/ Dependent Crimes                    1072
Online Cyber Trafficking                           540
Cyber Terrorism                                    480
Cryptocurrency Crime                               466
Ransomware                                         448
Online Gambling  Betting                    

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

class CybercrimeDataset(Dataset):
    """Dataset yielding tokenized text plus category and sub-category label tensors."""

    def __init__(self, texts, category_labels, subcategory_labels, tokenizer, max_length=256):
        """Remember the texts, both label collections, the tokenizer and its length limit."""
        (
            self.texts,
            self.category_labels,
            self.subcategory_labels,
            self.tokenizer,
            self.max_length,
        ) = (texts, category_labels, subcategory_labels, tokenizer, max_length)

    def __len__(self):
        """Return the size of the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the sample at *idx*.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` from the
            tokenizer, and ``category_labels`` / ``subcategory_labels`` as
            ``torch.FloatTensor`` built from the stored label rows.
        """
        raw_text = str(self.texts[idx])
        # Collapse consecutive whitespace into single spaces before tokenizing.
        cleaned_text = ' '.join(raw_text.split())

        batch = self.tokenizer(
            cleaned_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        sample = {}
        for field in ('input_ids', 'attention_mask'):
            sample[field] = batch[field].flatten()
        sample['category_labels'] = torch.FloatTensor(self.category_labels[idx])
        sample['subcategory_labels'] = torch.FloatTensor(self.subcategory_labels[idx])
        return sample

def preprocess_text(text):
    """Normalise *text*: ``str()`` coercion, lower-casing, single-space word separation."""
    tokens = str(text).lower().split()
    return ' '.join(tokens)

def balance_dataset(df, target_size=5000):
    """Resample every category of *df* towards *target_size* rows.

    Each distinct value of ``df['category']`` is handled in order of first
    appearance:

    * categories with more than *target_size* rows are randomly down-sampled
      (``random_state=42``) to exactly *target_size*;
    * categories with fewer rows are replicated ``ceil(target_size / count)``
      times, each copy getting ``" variation_<k>"`` appended to its
      ``crimeaditionalinfo`` text, then trimmed back to *target_size* by
      random sampling if the replication overshot;
    * categories already at *target_size* (or with no matching rows) are
      passed through unchanged.

    The distinct categories and the final distribution are printed.

    Args:
        df: DataFrame with ``category`` and ``crimeaditionalinfo`` columns.
            It is not modified.
        target_size: Desired number of rows per category; a positive integer.

    Returns:
        A new DataFrame with the per-category results concatenated and a
        fresh ``0..n-1`` index.

    Raises:
        ValueError: If *target_size* is smaller than 1.
    """
    if target_size < 1:
        raise ValueError("target_size must be a positive integer")

    print("Available categories:", df['category'].unique())

    balanced_dfs = []
    for category in df['category'].unique():
        # Bound before the try so the fallback below always has a valid frame.
        category_df = df[df['category'] == category]
        current_count = len(category_df)
        try:
            if current_count > target_size:
                balanced_df = category_df.sample(n=target_size, random_state=42)
            elif 0 < current_count < target_size:
                # Integer ceiling division; the ``0 <`` guard above keeps an
                # empty selection (e.g. a NaN category) from dividing by zero.
                factor = -(-target_size // current_count)
                balanced_df = pd.concat([
                    category_df.assign(
                        crimeaditionalinfo=category_df['crimeaditionalinfo'].map(
                            lambda text, tag=copy_idx: f"{text} variation_{tag}"
                        )
                    )
                    for copy_idx in range(factor)
                ])
                # Replication works in whole copies, so trim any overshoot.
                if len(balanced_df) > target_size:
                    balanced_df = balanced_df.sample(n=target_size, random_state=42)
            else:
                balanced_df = category_df
        except Exception as e:
            # Best effort: keep the category unbalanced rather than lose it.
            print(f"Warning: Category '{category}' skipped due to: {str(e)}")
            balanced_df = category_df
        balanced_dfs.append(balanced_df)

    final_df = pd.concat(balanced_dfs, ignore_index=True)

    print("\nFinal category distribution:")
    print(final_df['category'].value_counts())

    return final_df

def create_dataloaders(df, tokenizer, batch_size=16, max_length=256,
                       output_path='balancedtrain.csv'):
    """Balance *df*, save it to CSV, and wrap it in a shuffled ``DataLoader``.

    The text column is lower-cased and stripped, the frame is rebalanced with
    ``balance_dataset``, written to *output_path*, and its ``category`` and
    ``sub_category`` columns are one-hot encoded with two
    ``MultiLabelBinarizer`` instances.

    Args:
        df: DataFrame with ``crimeaditionalinfo``, ``category`` and
            ``sub_category`` columns. It is not modified.
        tokenizer: Tokenizer handed to ``CybercrimeDataset``.
        batch_size: Batch size of the returned loader.
        max_length: Token length handed to ``CybercrimeDataset``.
        output_path: Destination of the balanced CSV.

    Returns:
        Tuple ``(dataloader, mlb_category, mlb_subcategory)`` where the two
        binarizers are already fitted on the balanced labels.
    """
    # Normalise on a derived frame so the caller's DataFrame is left untouched.
    cleaned_df = df.assign(
        crimeaditionalinfo=df['crimeaditionalinfo'].apply(lambda x: str(x).lower().strip())
    )

    balanced_df = balance_dataset(cleaned_df)

    # Persist the balanced frame and report its size and class distribution.
    balanced_df.to_csv(output_path, index=False)
    print(f"Balanced dataset saved to '{output_path}' with {len(balanced_df)} samples")
    print("Category distribution in balanced dataset:")
    print(balanced_df['category'].value_counts())

    # Each row carries exactly one label, wrapped in a list because the
    # binarizer expects an iterable of labels per sample.
    mlb_category = MultiLabelBinarizer()
    mlb_subcategory = MultiLabelBinarizer()
    category_encoded = mlb_category.fit_transform(
        balanced_df['category'].apply(lambda x: [x])
    )
    subcategory_encoded = mlb_subcategory.fit_transform(
        balanced_df['sub_category'].apply(lambda x: [x])
    )

    dataset = CybercrimeDataset(
        balanced_df['crimeaditionalinfo'].values,
        category_encoded,
        subcategory_encoded,
        tokenizer,
        max_length=max_length
    )

    # Plain shuffled loader: class balance comes from the resampled frame
    # above, not from a weighted sampler.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    return dataloader, mlb_category, mlb_subcategory

# Usage example:
if __name__ == "__main__":
    # Input CSV -> DataFrame.
    df = pd.read_csv('output_no_duplicates.csv')

    # Tokenizer looked up by its pretrained name.
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Loader plus the two label binarizers fitted while building it.
    train_dataloader, mlb_category, mlb_subcategory = create_dataloaders(
        df=df, tokenizer=tokenizer, batch_size=16
    )

Available categories: ['Online Financial Fraud' 'Crime Against Women & Children'
 'Hacking  Damage to computercomputer system etc'
 'Online and Social Media Related Crime' 'Any Other Cyber Crime'
 'Cyber Attack/ Dependent Crimes' 'Cryptocurrency Crime'
 'Online Cyber Trafficking' 'Cyber Terrorism' 'Online Gambling  Betting'
 'Ransomware' 'Report Unlawful Content']

Final category distribution:
category
Online Financial Fraud                            5000
Crime Against Women & Children                    5000
Hacking  Damage to computercomputer system etc    5000
Online and Social Media Related Crime             5000
Any Other Cyber Crime                             5000
Cyber Attack/ Dependent Crimes                    5000
Cryptocurrency Crime                              5000
Online Cyber Trafficking                          5000
Cyber Terrorism                                   5000
Online Gambling  Betting                          5000
Ransomware                                  