### Prepare Sample Data For Toxicity Detection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# read the preprocessed training data from a CSV in the working directory
# source: https://www.kaggle.com/fizzbuzz/cleaned-toxic-comments
df = pd.read_csv('train_preprocessed.csv')

In [3]:
# explore df: column names, dtypes and non-null counts
df.info()
# shuffle the DataFrame rows; the fixed seed keeps the row order (and the
# full-dataset CSV exported at the end of the notebook) reproducible across
# kernel restarts
df = df.sample(frac=1, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   comment_text   159571 non-null  object 
 1   id             159571 non-null  object 
 2   identity_hate  159571 non-null  float64
 3   insult         159571 non-null  float64
 4   obscene        159571 non-null  float64
 5   set            159571 non-null  object 
 6   severe_toxic   159571 non-null  float64
 7   threat         159571 non-null  float64
 8   toxic          159571 non-null  float64
 9   toxicity       159571 non-null  float64
dtypes: float64(7), object(3)
memory usage: 12.2+ MB


In [4]:
# preview the first 5 rows of the shuffled frame
df.head()

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity
99262,holy grail some commentators claim that wolfr...,12fc2ac75b0fc9c8,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
150755,maybe you should be civil towards other people...,718629963d505de1,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
71675,your unsourced talk of racism in this edit sum...,bfe78785c17d8ad1,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
133485,agreed we really should try to stick to the s...,ca1a4d166545f2af,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
83712,verse problem i dont ever remember there being...,e005a13ed8da5b69,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0


In [5]:
# toxic label distribution, as fractions of all rows (normalize=True)
df['toxic'].value_counts(normalize=True)

0.0    0.904156
1.0    0.095844
Name: toxic, dtype: float64

#### Extracting Sample

In [6]:
# using train test split 
from sklearn.model_selection import train_test_split

def getSampleData(df, filename, rowsCount, random_state=None):
    """Write a stratified random sample of ``df`` to a CSV file.

    The sample is the "test" part of a ``train_test_split`` stratified on the
    ``toxic`` column, so it keeps (as closely as possible) the same share of
    toxic / non-toxic rows as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Must contain the columns 'id', 'comment_text',
        'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
        and 'toxic'.
    filename : str
        Path of the CSV file to write (written without the index).
    rowsCount : int
        Number of rows the sample should contain.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random, as before.

    Returns
    -------
    None
        The sample is written to ``filename``; its shape and label counts
        are printed.
    """
    feature_columns = ['id', 'comment_text', 'severe_toxic', 'obscene',
                       'threat', 'insult', 'identity_hate']

    # Pass the absolute number of rows instead of the fraction
    # rowsCount / len(df): scikit-learn multiplies a fraction back by the
    # number of rows and rounds up, so float rounding could produce
    # rowsCount + 1 rows.
    # Splitting the DataFrame directly (rather than its .values) keeps the
    # column dtypes instead of turning everything into object dtype.
    _, df_sample = train_test_split(
        df[feature_columns + ['toxic']],
        stratify=df['toxic'],
        test_size=int(rowsCount),
        random_state=random_state)
    df_sample = df_sample.reset_index(drop=True)

    # print shape of the feature part and of the label part
    print(df_sample[feature_columns].shape, df_sample['toxic'].shape)

    # checking the distribution of toxic column
    print(df_sample['toxic'].value_counts())

    # exporting the sample dataset; use the `filename` argument, not a
    # notebook-level global
    df_sample.to_csv(filename, index=False)

    print('sample created')


In [7]:
# small sample: 100 stratified rows written to toxic_data_sample.csv
rows = 100
fileName='toxic_data_sample.csv'
getSampleData(df, fileName, rows)

(100, 7) (100,)
0.0    90
1.0    10
Name: toxic, dtype: int64
sample created


In [8]:
# mid size sample: 1000 stratified rows written to toxic_data_mid.csv
rows = 1000
fileName='toxic_data_mid.csv'
getSampleData(df, fileName, rows)

(1000, 7) (1000,)
0.0    904
1.0     96
Name: toxic, dtype: int64
sample created


## Full Dataset

In [9]:
# preview the frame (all 10 original columns) before it is exported
df.head()

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity
99262,holy grail some commentators claim that wolfr...,12fc2ac75b0fc9c8,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
150755,maybe you should be civil towards other people...,718629963d505de1,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
71675,your unsourced talk of racism in this edit sum...,bfe78785c17d8ad1,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
133485,agreed we really should try to stick to the s...,ca1a4d166545f2af,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0
83712,verse problem i dont ever remember there being...,e005a13ed8da5b69,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0


In [10]:
# keep the id, the comment text and the six label columns (this drops 'set'
# and 'toxicity'), then export the whole frame as CSV without the index
df = df.loc[:, ['id', 'comment_text', 'severe_toxic', 'obscene',
                'threat', 'insult', 'identity_hate', 'toxic']]
df.to_csv('toxic_data_full.csv', index=False)