In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("Device name:", "No GPU detected")


CUDA available: True
Device name: NVIDIA A100-SXM4-40GB


In [None]:
import torch

# Report total / allocated / reserved memory (in units of 1e9 bytes) for CUDA device 0.
# The CUDA queries are guarded so the cell also runs on a machine without a GPU.
if torch.cuda.is_available():
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")
    print(f"Allocated GPU Memory: {torch.cuda.memory_allocated() / 1e9} GB")
    print(f"Cached GPU Memory: {torch.cuda.memory_reserved() / 1e9} GB")
else:
    print("No GPU detected; skipping GPU memory report.")


Total GPU Memory: 42.481811456 GB
Allocated GPU Memory: 41.851105792 GB
Cached GPU Memory: 41.892708352 GB


In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(file_path, scaler=None):
    """
    Preprocess data for model training. Includes feature engineering, text cleaning,
    and preparation of `clean_text` for model input.

    Args:
        file_path: CSV path (anything accepted by ``pd.read_csv``). The file must
            contain a ``comment_text`` column and the six label columns
            ``toxic``, ``severe_toxic``, ``obscene``, ``threat``, ``insult``,
            ``identity_hate``.
        scaler: Optional scikit-learn style scaler for the numerical features.
            If ``None`` (default) a new ``StandardScaler`` is fit on this file.
            If an unfitted scaler is passed it is fit in place, so the caller
            can reuse it. If an already fitted scaler is passed (it has a
            ``scale_`` attribute) it is only used to transform, which lets
            evaluation data be scaled with training statistics.

    Returns:
        tuple: ``(X_numerical, X_text, y_labels)`` where ``X_numerical`` is a
        float32 array with one column per engineered feature, ``X_text`` is a
        list of cleaned strings and ``y_labels`` is a float32 array with one
        column per label.
    """
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    # astype(str) keeps the .str accessor usable even if a column was parsed as non-text.
    df['comment_text'] = df['comment_text'].fillna("").astype(str).str.strip()

    # Patterns are compiled once and reused for every row.
    repeat_pattern = re.compile(r'(.)\1{2,}')
    tag_pattern = re.compile(r'<.*?>')
    punct_pattern = re.compile(r'[^\w\s]')

    # Step 1: Feature Engineering
    print("Performing feature engineering...")
    df['text_length'] = df['comment_text'].str.len()
    df['word_count'] = df['comment_text'].str.split().str.len()
    df['sentence_count'] = df['comment_text'].str.count(r'[.!?]')
    df['repeated_characters'] = df['comment_text'].apply(lambda x: len(repeat_pattern.findall(x)))
    df['special_char_count'] = df['comment_text'].apply(
        lambda x: sum(1 for char in x if not char.isalnum() and not char.isspace())
    )
    df['capital_ratio'] = df['comment_text'].apply(
        lambda x: sum(1 for char in x if char.isupper()) / len(x) if len(x) > 0 else 0
    )

    # Hateword Features
    print("Calculating hateword features...")
    immigrant_discrimination_words = [
        'alien', 'illegal alien', 'wetback', 'border hopper', 'invader', 'job stealer',
        'criminal immigrant', 'refugee', 'illegal', 'migrant scum', 'asylum seeker',
        'invaders', 'freeloader', 'drain on resources', 'third-worlder', 'outlander',
        'economic migrant', 'foreign parasite', 'country leech', 'boat people', 'expat',
        'border crosser', 'foreign invader', 'refugee trash', 'immigrant menace'
    ]

    regional_discrimination_words = [
        'foreigner', 'outsider', 'non-native', 'gook', 'chinaman', 'ching chong',
        'paki', 'sand niggar', 'mexican invader', 'muslim terrorist', 'anchor baby',
        'cholo', 'beaner', 'turbanhead', 'terrorist', 'dirty foreigner', 'eastern european scum',
        'third-worlder', 'redneck', 'hillbilly', 'yank', 'limey', 'chink', 'kraut',
        'nip', 'wetback', 'border jumper', 'gringo', 'spic', 'cracker', 'honky'
    ]

    hate_words = immigrant_discrimination_words + regional_discrimination_words

    def preprocess_text_for_hatewords(text):
        return punct_pattern.sub('', text).lower()

    # Normalise the lexicon exactly like the text (punctuation stripped, lower-cased),
    # otherwise hyphenated entries such as 'third-worlder' could never match.
    normalized_terms = {
        ' '.join(preprocess_text_for_hatewords(term).split()) for term in hate_words
    }
    normalized_terms.discard('')

    # One alternation, longest entries first, so that multi-word phrases are matched
    # as phrases (a whitespace split can never match them) and a phrase is counted
    # once rather than once per contained word.
    hate_pattern = re.compile(
        r'\b(?:'
        + '|'.join(
            r'\s+'.join(re.escape(word) for word in term.split())
            for term in sorted(normalized_terms, key=lambda term: (-len(term), term))
        )
        + r')\b'
    )

    def count_hate_words(text):
        return len(hate_pattern.findall(text))

    df['cleaned_for_hatewords'] = df['comment_text'].apply(preprocess_text_for_hatewords)
    df['hate_word_count'] = df['cleaned_for_hatewords'].apply(count_hate_words)
    # Rows with zero words get a ratio of 0 instead of a division by zero.
    word_count = df['word_count']
    df['hate_word_ratio'] = (df['hate_word_count'] / word_count.where(word_count > 0)).fillna(0.0)

    # Text Cleaning for Model Input
    print("Cleaning text...")
    def clean_text(text):
        text = tag_pattern.sub('', text)      # drop HTML-like tags
        text = punct_pattern.sub('', text)    # drop punctuation
        return ' '.join(text.lower().split())  # lower-case and collapse whitespace

    df['clean_text'] = df['comment_text'].apply(clean_text)

    # Step 2: Scaling Numerical Features
    print("Preparing numerical features...")
    feature_columns = [
        'text_length', 'word_count', 'sentence_count',
        'repeated_characters', 'special_char_count',
        'capital_ratio', 'hate_word_count', 'hate_word_ratio'
    ]
    if scaler is None:
        scaler = StandardScaler()
    if hasattr(scaler, "scale_"):
        # Already fitted: reuse the existing statistics.
        scaled_features = scaler.transform(df[feature_columns])
    else:
        scaled_features = scaler.fit_transform(df[feature_columns])

    # Prepare `X` with both numerical features and `clean_text`
    print("Preparing final feature set...")
    X_numerical = np.asarray(scaled_features, dtype=np.float32)
    X_text = df['clean_text'].tolist()  # Keep `clean_text` as a separate column for tokenizer
    y_labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.astype(np.float32)

    return X_numerical, X_text, y_labels



In [None]:
# Build the numerical features, cleaned texts and label matrix from the CSV.
# NOTE(review): the absolute "/content/..." path is presumably specific to the
# authoring environment — confirm or make it configurable.
X_numerical,X_text,y_labels = preprocess_data("/content/train3000.csv")

Loading dataset...
Performing feature engineering...
Calculating hateword features...
Cleaning text...
Preparing numerical features...
Preparing final feature set...


In [None]:
from transformers import pipeline
from tqdm import tqdm

def generate_topics(X_text, batch_size=64, device=0, topics=None):
    """
    Generate topic predictions for given text data.

    Args:
        X_text (list of str): List of cleaned text data.
        batch_size (int): Batch size for processing.
        device (int): GPU device index or -1 for CPU.
        topics (list of str, optional): Candidate labels for the zero-shot
            classifier. Defaults to the predefined topic list below.

    Returns:
        list of str: Predicted topics for each input text (the highest-scoring
        candidate label). An empty input yields an empty list without loading
        the model.
    """
    # Predefined topics
    if topics is None:
        topics = ["crime", "employment", "social benefits", "immigration policies", "violence", "other"]

    texts = list(X_text)
    if not texts:
        # Nothing to classify: skip the model load entirely.
        return []

    # Initialize zero-shot classification pipeline
    classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=device
    )

    print("Generating topics...")
    predictions = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Classifying Topics"):
        batch = texts[start:start + batch_size]
        results = classifier(batch, candidate_labels=list(topics), multi_label=False)
        if isinstance(results, dict):
            # A single result may come back as a bare dict rather than a list.
            results = [results]
        # Use the highest-scoring label as the topic
        predictions.extend(result['labels'][0] for result in results)

    return predictions
# Classify every cleaned text with the default arguments, then echo the result
# as the cell output.
prediction = generate_topics(X_text)
prediction



Generating topics...


Classifying Topics: 100%|██████████| 1/1 [00:09<00:00,  9.12s/it]


['other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'crime',
 'crime',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'violence',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'employment',
 'other',
 'other',
 'other',
 'crime',
 'other',
 'social benefits',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'other',
 'violence',
 'other',
 'other',
 'other',
 'other',
 'violence',
 'crime',
 'other',
 'other',
 'other',
 'other',
 'other']

In [None]:
def generate_sentiments(X_text, batch_size=64, device=0, max_len=512):
    """
    Generate sentiment predictions for given text data with three classes (negative, neutral, positive).

    Args:
        X_text (list of str): List of cleaned text data.
        batch_size (int): Batch size for processing.
        device (int): GPU device index or -1 for CPU.
        max_len (int): Maximum sequence length, in tokens, passed to the
            tokenizer for truncation (default is 512).

    Returns:
        list of str: Predicted sentiments for each input text (e.g., "NEGATIVE", "NEUTRAL", "POSITIVE").
        An empty input yields an empty list without loading the model.
    """
    texts = list(X_text)
    if not texts:
        # Nothing to classify: skip the model download/load entirely.
        return []

    from transformers import pipeline

    # Use a model supporting three-class sentiment classification
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        device=device
    )

    # Convert model's output to simpler labels (negative, neutral, positive)
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }

    print("Generating sentiments with three classes...")
    sentiments = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Classifying Sentiments"):
        batch = texts[start:start + batch_size]
        # Truncate at the token level: slicing the string counts characters, which
        # neither matches the documented meaning of max_len nor guarantees the
        # tokenized sequence fits the model.
        results = sentiment_analyzer(batch, truncation=True, max_length=max_len)
        # An unrecognised label is passed through upper-cased instead of being
        # silently reported as "POSITIVE".
        sentiments.extend(
            label_names.get(res['label'], str(res['label']).upper())
            for res in results
        )

    return sentiments




# Classify every cleaned text with the default arguments, then echo the result
# as the cell output.
sentiments = generate_sentiments(X_text)
sentiments




config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Generating sentiments with three classes...


Classifying Sentiments: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]


['NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'POSITIVE',
 'NEGATIVE',
 'NEUTRAL',
 'POSITIVE',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEGATIVE',
 'NEGATIVE',
 'NEUTRAL']

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
import numpy as np


# Dataset class
class CustomDataset(Dataset):
    """Index-aligned view over token ids, attention masks, numerical features and labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``numerical_features`` and ``labels``, all taken at the same index.
    """

    def __init__(self, input_ids, attention_masks, numerical_features, labels):
        self.input_ids, self.attention_masks = input_ids, attention_masks
        self.numerical_features, self.labels = numerical_features, labels

    def __len__(self):
        # The label container defines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            numerical_features=self.numerical_features[idx],
            labels=self.labels[idx],
        )
        return sample


# Model class
class DistilBERTMultiLabel(nn.Module):
    """DistilBERT encoder fused with a numerical-feature branch for multi-label logits.

    The first-token (CLS) embedding and the numerical features are each projected
    to 128 dimensions, passed through ReLU, concatenated, and mapped to
    ``num_labels`` raw logits (no sigmoid is applied here).

    Args:
        bert_model_name: Name or path passed to ``DistilBertModel.from_pretrained``.
        numerical_feature_dim: Number of numerical features per sample.
        num_labels: Number of output logits.
    """

    def __init__(self, bert_model_name, numerical_feature_dim, num_labels):
        super().__init__()
        branch_dim = 128
        self.bert = DistilBertModel.from_pretrained(bert_model_name)
        # Take the encoder width from its config instead of hard-coding 768, so
        # DistilBERT checkpoints with a different hidden size also work.
        self.fc_text = nn.Linear(self.bert.config.dim, branch_dim)
        self.fc_numerical = nn.Linear(numerical_feature_dim, branch_dim)
        self.fc_output = nn.Linear(2 * branch_dim, num_labels)

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return logits with one row per sample and one column per label."""
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embeddings = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding

        # Functional ReLU avoids building a new nn.ReLU module on every call.
        text_out = torch.relu(self.fc_text(text_embeddings))
        num_out = torch.relu(self.fc_numerical(numerical_features))

        combined = torch.cat((text_out, num_out), dim=1)
        return self.fc_output(combined)


# Training function
def train_model_with_optimizations(model, train_loader, optimizer, criterion, epochs, device):
    """Train ``model`` for ``epochs`` passes over ``train_loader`` with mixed precision.

    Args:
        model: Module called as ``model(input_ids, attention_mask, numerical_features)``.
        train_loader: Iterable of batches; each batch is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``numerical_features`` and ``labels``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        list of float: Mean batch loss of each epoch (``nan`` for an epoch that
        saw no batches).
    """
    # Autocast and gradient scaling only apply on CUDA; disabling them elsewhere
    # keeps the CPU fallback free of "CUDA is not available" warnings.
    use_amp = torch.device(device).type == "cuda"

    model.train()
    scaler = GradScaler(enabled=use_amp)
    epoch_losses = []

    for epoch in range(epochs):
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        total_loss = 0.0
        num_batches = 0

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_features = batch['numerical_features'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                logits = model(input_ids, attention_mask, numerical_features)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once; each .item() call forces a device sync.
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            loop.set_postfix(loss=batch_loss)

        # Guard against an empty loader instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

    return epoch_losses


# Evaluation function
def evaluate_model(model, val_loader, device):
    """Run ``model`` over ``val_loader`` and collect thresholded predictions.

    The model is switched to eval mode and gradients are disabled. Sigmoid
    outputs above 0.5 count as positive.

    Returns:
        tuple: ``(predictions, labels)`` as NumPy arrays stacked row-wise over
        all batches; ``predictions`` is boolean.
    """
    batch_keys = ('input_ids', 'attention_mask', 'numerical_features', 'labels')
    pred_chunks = []
    label_chunks = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            # Move every tensor of the batch to the target device.
            on_device = {key: batch[key].to(device) for key in batch_keys}

            scores = torch.sigmoid(
                model(
                    on_device['input_ids'],
                    on_device['attention_mask'],
                    on_device['numerical_features'],
                )
            )
            positive = scores > 0.5

            pred_chunks.append(positive.cpu().numpy())
            label_chunks.append(on_device['labels'].cpu().numpy())

    stacked_preds = np.vstack(pred_chunks)
    stacked_labels = np.vstack(label_chunks)
    return stacked_preds, stacked_labels


# Training and evaluation pipeline
def run_training_and_evaluation(X_numerical, X_text, y_labels, batch_size=16, epochs=3, max_length=512,
                                val_fraction=0.0, seed=42):
    """Tokenize the texts, train a ``DistilBERTMultiLabel`` model and evaluate it.

    Args:
        X_numerical: 2-D array of numerical features, one row per sample.
        X_text: List of texts, aligned with ``X_numerical``.
        y_labels: 2-D array of label values, one row per sample.
        batch_size: Batch size for both training and evaluation loaders.
        epochs: Number of training epochs.
        max_length: Token length every text is padded/truncated to.
        val_fraction: Fraction of samples held out for evaluation. With the
            default ``0.0`` nothing is held out and evaluation runs on the
            training samples themselves, so the returned predictions are not
            an estimate of generalisation.
        seed: Seed for the hold-out split (only used when ``val_fraction > 0``).

    Returns:
        tuple: ``(model, predictions, true_labels)`` where the last two come
        from ``evaluate_model`` on the evaluation samples.

    Raises:
        ValueError: If ``val_fraction`` is outside ``[0, 1)`` or leaves no
            training samples.
    """
    if not 0.0 <= val_fraction < 1.0:
        raise ValueError("val_fraction must be in the range [0, 1).")

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    tokenized_data = tokenizer(
        list(X_text),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    input_ids = tokenized_data["input_ids"]
    attention_masks = tokenized_data["attention_mask"]

    # Synchronize dataset sizes
    min_length = min(len(X_text), input_ids.shape[0], X_numerical.shape[0], y_labels.shape[0])
    input_ids = input_ids[:min_length]
    attention_masks = attention_masks[:min_length]
    X_numerical = X_numerical[:min_length]
    y_labels = y_labels[:min_length]

    dataset = CustomDataset(
        input_ids,
        attention_masks,
        torch.tensor(X_numerical, dtype=torch.float),
        torch.tensor(y_labels, dtype=torch.float),
    )

    if val_fraction > 0.0:
        # Seeded random hold-out so evaluation uses samples the model never trained on.
        n_val = max(1, int(round(len(dataset) * val_fraction)))
        n_train = len(dataset) - n_val
        if n_train < 1:
            raise ValueError("val_fraction leaves no samples for training.")
        train_set, val_set = torch.utils.data.random_split(
            dataset, [n_train, n_val], generator=torch.Generator().manual_seed(seed)
        )
    else:
        train_set = val_set = dataset

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBERTMultiLabel("distilbert-base-uncased", X_numerical.shape[1], y_labels.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()

    print("Starting training...")
    train_model_with_optimizations(model, train_loader, optimizer, criterion, epochs, device)

    print("Evaluating...")
    val_loader = DataLoader(val_set, batch_size=batch_size)
    predictions, true_labels = evaluate_model(model, val_loader, device)

    return model, predictions, true_labels
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from transformers import DistilBertTokenizer

# Custom evaluation function
def evaluate_model_with_metrics(model, test_loader, device):
    """
    Evaluate the model and compute several performance metrics.

    All metrics are micro-averaged by pooling every (sample, label) decision:

    * ``accuracy``: fraction of pooled decisions that are correct.
    * ``precision`` / ``recall`` / ``f1_score``: computed for the positive
      class over the pooled decisions.
    * ``roc_auc``: computed from the sigmoid probabilities; ``nan`` when
      scikit-learn cannot compute it (it raises ``ValueError``, e.g. when the
      labels contain a single class).

    Args:
        model: Module called as ``model(input_ids, attention_mask, numerical_features)``.
        test_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``numerical_features`` and ``labels``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(metrics, all_preds, all_labels)`` where ``metrics`` is a dict,
        ``all_preds`` is the boolean array of probabilities thresholded at 0.5
        and ``all_labels`` is the stacked label array.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        loop = tqdm(test_loader, desc="Evaluating")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerical_features = batch['numerical_features'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask, numerical_features)
            # Keep the probabilities: ROC AUC needs scores, not hard decisions.
            all_probs.append(torch.sigmoid(logits).float().cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_probs = np.vstack(all_probs)
    all_labels = np.vstack(all_labels)
    all_preds = all_probs > 0.5

    y_true = all_labels.astype(int).ravel()
    y_pred = all_preds.astype(int).ravel()

    try:
        roc_auc = roc_auc_score(y_true, all_probs.ravel())
    except ValueError:
        roc_auc = float("nan")

    # average="binary" scores the positive class only. With average="micro" on
    # flattened 0/1 arrays scikit-learn averages over both classes, which makes
    # precision, recall and F1 all collapse to the accuracy.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0, average="binary"),
        "recall": recall_score(y_true, y_pred, zero_division=0, average="binary"),
        "f1_score": f1_score(y_true, y_pred, zero_division=0, average="binary"),
        "roc_auc": roc_auc
    }

    return metrics, all_preds, all_labels


# Evaluation entry point
def run_evaluation_on_test_data(test_file_path, model_path, batch_size=16, max_length=512):
    """
    Load a saved model and run the evaluation on test data.

    Args:
        test_file_path: CSV path handed to ``preprocess_data``.
        model_path: Path of a state dict saved with ``torch.save(model.state_dict(), ...)``.
        batch_size: Batch size of the evaluation loader.
        max_length: Token length every text is padded/truncated to.

    Returns:
        tuple: ``(metrics, predictions, true_labels)`` as returned by
        ``evaluate_model_with_metrics``.
    """
    # Load the test data.
    # NOTE(review): preprocess_data(test_file_path) standardises the numerical
    # features with statistics computed from this file, not from the training
    # data — confirm that this is intended.
    X_numerical, X_text, y_labels = preprocess_data(test_file_path)

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    tokenized_data = tokenizer(
        X_text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    input_ids = tokenized_data["input_ids"]
    attention_masks = tokenized_data["attention_mask"]

    # Synchronize dataset sizes
    min_length = min(len(X_text), input_ids.shape[0], X_numerical.shape[0], y_labels.shape[0])
    input_ids = input_ids[:min_length]
    attention_masks = attention_masks[:min_length]
    X_numerical = X_numerical[:min_length]
    y_labels = y_labels[:min_length]

    # Build the test-set DataLoader
    test_dataset = CustomDataset(
        input_ids,
        attention_masks,
        torch.tensor(X_numerical, dtype=torch.float),
        torch.tensor(y_labels, dtype=torch.float)
    )
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Load the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBERTMultiLabel("distilbert-base-uncased", X_numerical.shape[1], y_labels.shape[1]).to(device)
    # map_location lets a checkpoint saved from a GPU run load on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    # Evaluate the model
    metrics, predictions, true_labels = evaluate_model_with_metrics(model, test_loader, device)
    return metrics, predictions, true_labels


In [None]:
# Model training invocation
X_numerical, X_text, y_labels = preprocess_data("train.csv")  # replace with the actual data path

batch_size = 16  # adjust according to available GPU memory
epochs = 3       # number of training epochs

# Run training and evaluation
model, predictions, true_labels = run_training_and_evaluation(X_numerical, X_text, y_labels, batch_size=batch_size, epochs=epochs)

# Save the model
torch.save(model.state_dict(), "distilbert_multilabel_model.pth")
print("Model saved as distilbert_multilabel_model.pth")
# Evaluation invocation
# NOTE(review): "train50.csv" is presumably a subset of the training data — if so,
# the metrics below are not a held-out estimate; confirm.
test_file_path = "train50.csv"  # test data path
model_path = "distilbert_multilabel_model.pth"  # model path

# Run the evaluation (this rebinds `predictions` and `true_labels` from the training call above)
metrics, predictions, true_labels = run_evaluation_on_test_data(test_file_path, model_path)

# Print the evaluation metrics
print("Evaluation Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")


Loading dataset...
Performing feature engineering...
Calculating hateword features...
Cleaning text...
Preparing numerical features...
Preparing final feature set...




Starting training...


Epoch 1/3: 100%|██████████| 9974/9974 [09:17<00:00, 17.90it/s, loss=0.0163]


Epoch 1, Loss: 0.0497


Epoch 2/3: 100%|██████████| 9974/9974 [09:17<00:00, 17.89it/s, loss=0.000318]


Epoch 2, Loss: 0.0334


Epoch 3/3: 100%|██████████| 9974/9974 [09:16<00:00, 17.93it/s, loss=0.0797]


Epoch 3, Loss: 0.0263
Evaluating...


Evaluating: 100%|██████████| 9974/9974 [09:37<00:00, 17.27it/s]


Model saved as distilbert_multilabel_model.pth
Loading dataset...
Performing feature engineering...
Calculating hateword features...
Cleaning text...
Preparing numerical features...
Preparing final feature set...


Evaluating: 100%|██████████| 4/4 [00:00<00:00, 19.01it/s]

Evaluation Metrics:
accuracy: 0.9933
precision: 0.9933
recall: 0.9933
f1_score: 0.9933
roc_auc: 0.9625





In [None]:
import numpy as np

def calculate_toxicity_score(predictions, weights=None):
    """
    Calculate the Toxicity Score based on the predictions of six labels.

    Args:
        predictions (np.ndarray): A 2D array of shape (n_samples, 6) where each row contains
                                  the six predicted values for a sample. A single sample
                                  given as a 1D sequence of six values is treated as one
                                  row, and an empty 1D input as zero samples.
        weights (list or np.ndarray, optional): A list or array of weights for each label.
                                                If None, equal weights will be used. Weights
                                                that do not sum to 1 are normalized.

    Returns:
        np.ndarray: A 1D array containing the toxicity scores for each sample.

    Raises:
        ValueError: If predictions do not have six columns, if weights do not
            have exactly six values, or if the weights do not have a non-zero,
            finite sum.
    """
    num_labels = 6

    # Ensure predictions is a 2D float array (boolean/int predictions are accepted).
    predictions = np.asarray(predictions, dtype=float)
    if predictions.ndim == 1:
        # One bare sample becomes a single row; an empty input becomes zero rows.
        predictions = predictions.reshape(1, -1) if predictions.size else predictions.reshape(0, num_labels)

    if predictions.ndim != 2 or predictions.shape[1] != num_labels:
        raise ValueError("Predictions must have exactly 6 columns, one for each label.")

    if weights is None:
        # Equal weight for each label
        weights = np.full(num_labels, 1.0 / num_labels)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (num_labels,):
            raise ValueError("Weights must have exactly 6 values, one for each label.")
        total = weights.sum()
        # Normalizing by a zero or non-finite sum would silently produce NaN scores.
        if not np.isfinite(total) or np.isclose(total, 0.0):
            raise ValueError("Weights must have a non-zero, finite sum.")
        if not np.isclose(total, 1.0):
            weights = weights / total  # Normalize weights to sum to 1

    # Compute the toxicity score as a weighted average
    return predictions @ weights


300

In [None]:
!pip freeze > requirements.txt


In [None]:
pip freeze


absl-py==1.4.0
accelerate==1.1.1
aiohappyeyeballs==2.4.3
aiohttp==3.11.2
aiosignal==1.3.1
alabaster==1.0.0
albucore==0.0.19
albumentations==1.4.20
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.20.0
astropy==6.1.6
astropy-iers-data==0.2024.11.18.0.35.2
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==24.2.0
audioread==3.0.1
autograd==1.7.0
babel==2.16.0
backcall==0.2.0
beautifulsoup4==4.12.3
bigframes==1.27.0
bigquery-magics==0.4.0
bleach==6.2.0
blinker==1.9.0
blis==0.7.11
blosc2==2.7.1
bokeh==3.6.1
Bottleneck==1.4.2
bqplot==0.12.43
branca==0.8.0
CacheControl==0.14.1
cachetools==5.5.0
catalogue==2.0.10
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
chex==0.1.87
clarabel==0.9.0
click==8.1.7
cloudpathlib==0.20.0
cloudpickle==3.1.0
cmake==3.30.5
cmdstanpy==1.2.4
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contourpy==1.