In [None]:
import pandas as pd

## Task
You are predicting the probability that a comment is toxic. A toxic comment would receive a 1.0. A benign, non-toxic comment would receive a 0.0. In the test set, all comments are classified as either a 1.0 or a 0.0.

## Files
- **jigsaw-toxic-comment-train.csv** - data from our first competition. The dataset is made up of English comments from Wikipedia’s talk page edits. (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
- **jigsaw-unintended-bias-train.csv** - data from our second competition. This is an expanded version of the Civil Comments dataset with a range of additional labels. (https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification)
- **sample_submission.csv** - a sample submission file in the correct format
- **test.csv** - comments from Wikipedia talk pages in different non-English languages.
- **test_labels.csv** - ground truth labels for the test data (data added after the competition deadline)
- **validation.csv** - comments from Wikipedia talk pages in different non-English languages.
- **jigsaw-toxic-comment-train-processed-seqlen128.csv** - training data preprocessed for BERT
- **jigsaw-unintended-bias-train-processed-seqlen128.csv** - training data preprocessed for BERT
- **validation-processed-seqlen128.csv** - validation data preprocessed for BERT
- **test-processed-seqlen128.csv** - test data preprocessed for BERT

## Columns
- **id** - identifier within each file.
- **comment_text** - the text of the comment to be classified.
- **lang** - the language of the comment.
- **toxic** - whether or not the comment is classified as toxic. (Does not exist in test.csv.)

# Data 1

In [None]:
# Load the training data from the first Jigsaw competition and report its size.
df = pd.read_csv(
    './jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
)
print(f"Total: {df.shape[0]:,}")
# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# For every label column, count the rows whose value exceeds 0.5 and print
# that count together with its percentage of all rows in `df`.
n_rows = len(df)
for label in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    is_positive = df[label].gt(0.5)
    total = is_positive.sum()
    pct = total / n_rows * 100
    print(f"# of {label}: {total} ({pct:.2f}%)")

# Data 2

In [None]:
# Each label value is the fraction of human raters who believed the attribute
# applied to the given comment. For evaluation, test set examples with
# target >= 0.5 are considered to be in the positive class (toxic).
bias_columns = [
    'id', 'comment_text', 'toxic', 'severe_toxicity', 'obscene', 'threat',
    'insult', 'identity_attack', 'sexual_explicit', 'toxicity_annotator_count',
]
# Parse only the columns that are kept, instead of loading every column of the
# file and discarding the rest afterwards. `usecols` returns columns in file
# order, so re-select with the list to keep the order given above.
df = pd.read_csv(
    './jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv',
    usecols=bias_columns,
)[bias_columns]
print(f"Total: {len(df):,}")
# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# Count the positive examples per label and print each count with its
# percentage of all rows in `df`.
# The labels in this dataset are rater fractions, and a value >= 0.5 counts as
# the positive class, so the comparison is inclusive: a strict `> 0.5` would
# drop every comment that sits exactly on the threshold.
for label in ['toxic','severe_toxicity','obscene','threat','insult','identity_attack','sexual_explicit']:
    total = (df[label] >= 0.5).sum()
    pct = total / len(df) * 100
    print(f"# of {label}: {total} ({pct:.2f}%)")

# Validation Data

In [None]:
# Load the validation data and report how many rows it has.
df = pd.read_csv(
    './jigsaw-multilingual-toxic-comment-classification/validation.csv'
)
print(f"Total: {df.shape[0]:,}")
# Last expression of the cell: preview the first rows.
df.head()

# Test Data

In [None]:
# Load the test data, then report its size and the number of rows per language.
df = pd.read_csv(
    './jigsaw-multilingual-toxic-comment-classification/test.csv'
)
print(f"Total: {df.shape[0]:,}")
lang_counts = df['lang'].value_counts().to_dict()
print("Value Counts:", lang_counts)
# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# Load the ground-truth labels for the test data and report the row count.
df = pd.read_csv(
    './jigsaw-multilingual-toxic-comment-classification/test_labels.csv'
)
print(f"Total: {df.shape[0]:,}")
# Last expression of the cell: preview the first rows.
df.head()