In [1]:
#Take a look on this
#https://medium.com/@sebastiannorena/finding-correlation-between-many-variables-multidimensional-dataset-with-python-5deb3f39ffb3
    
import pandas as pd
import numpy as np
import nltk
import re
import os

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from zipfile import ZipFile


lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

nltk.download('stopwords')
nltk.download('wordnet')


TEXT_COL_NAME = 'Text'
LABEL_COL_NAME = 'Label'

TOXIC_LABEL = 'TOXIC'
HEALTHY_LABEL = 'HEALTHY'

THRESHOLD = 0.3


DEFAULT_DISTRIBUTION = (50000,50000)


def exportCSV(df,name,distribution=DEFAULT_DISTRIBUTION):
    dis_positive = distribution[0]
    dis_negative = distribution[1]
    
    ds_positive  = df[df[LABEL_COL_NAME] == TOXIC_LABEL]
    ds_negative  = df[df[LABEL_COL_NAME] == HEALTHY_LABEL]
    
    ds_positive_len = len(ds_positive)
    ds_negative_len = len(ds_negative)
    
    i_ds_positive = np.arange(ds_positive_len)
    np.random.shuffle(i_ds_positive)
    
    i_ds_negative = np.arange(ds_negative_len)
    np.random.shuffle(i_ds_negative)
    
    max_positives = ds_positive_len
    max_negatives = ds_negative_len
    
    if(dis_positive < max_positives):
        max_positives = dis_positive

    if(dis_negative < max_negatives):
        max_negatives = dis_negative
        
    i_ds_positive = i_ds_positive[:max_positives]
    i_ds_negative = i_ds_negative[:max_negatives]
    
    out_df_positives = ds_positive.iloc[i_ds_positive]
    out_df_negatives = ds_negative.iloc[i_ds_negative]
    
    
    print("Total of positives: ",ds_positive_len,"Total of negatives: ", ds_negative_len)
    print("Total of positives exported: ",max_positives,"Total of negatives exported: ", max_negatives)
    
    
    out_df = pd.concat([out_df_positives,out_df_negatives])
    out_df_len = len(out_df)
    i_out_df = np.arange(out_df_len)
    np.random.shuffle(i_out_df)
    out_df = out_df.iloc[i_out_df]

    csv_name = name+'.csv'
    zip_name = name+'.zip'
    
    try:
        os.remove(csv_name)
    except:
        print("Error while deleting file ", csv_name)
        
    try:
        os.remove(zip_name)
    except:
        print("Error while deleting file ", zip_name)
    
    out_df.to_csv(csv_name, index=False)
    
    zipObj = ZipFile(zip_name, 'w')
    zipObj.write(csv_name)
    zipObj.close()
    
def setLabels(df):
    positives = df[LABEL_COL_NAME] == 1
    negatives = df[LABEL_COL_NAME] == 0
    
    df.loc[positives, LABEL_COL_NAME] = TOXIC_LABEL
    df.loc[negatives, LABEL_COL_NAME] = HEALTHY_LABEL
    
    return df


[nltk_data] Downloading package stopwords to /Users/a/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/a/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
dataset = pd.read_csv("./dataset.csv")
validation_ds = pd.read_csv("./validation_dataset.csv")

In [3]:
dataset.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [4]:
dataset.describe()

Unnamed: 0,id,target,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,...,parent_id,article_id,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
count,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,405130.0,405130.0,405130.0,...,1026228.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0
mean,3738434.0,0.1030173,0.004582099,0.01387721,0.02263571,0.08115273,0.009311271,0.011964,0.003205,0.001884,...,3722687.0,281359.7,0.2779269,0.04420696,0.1091173,2.446167,0.5843688,0.006605974,1.439019,8.784694
std,2445187.0,0.1970757,0.02286128,0.06460419,0.07873156,0.1760657,0.04942218,0.087166,0.050193,0.026077,...,2450261.0,103929.3,1.055313,0.2449359,0.4555363,4.727924,1.866589,0.04529782,17.87041,43.50086
min,59848.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,61006.0,2006.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,796975.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,796018.8,160120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
50%,5223774.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5222993.0,332126.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
75%,5769854.0,0.1666667,0.0,0.0,0.0,0.09090909,0.0,0.0,0.0,0.0,...,5775758.0,366237.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,6.0
max,6334010.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,6333965.0,399541.0,102.0,21.0,31.0,300.0,187.0,1.0,1866.0,4936.0


Como ai lab solo acepta dos columnas, texto y class entonces eliminaremos todas las columnas excepto comment_text y target

In [5]:
dataset2cols = dataset.loc[:, dataset.columns.intersection(['target','comment_text'])]

Cambiamos el orden de las columnas para que target esté después de comment text

In [6]:
dataset2cols = dataset2cols.reindex(columns=['comment_text','target'])

In [7]:
dataset2cols.head()

Unnamed: 0,comment_text,target
0,"This is so cool. It's like, 'would you want yo...",0.0
1,Thank you!! This would make my life a lot less...,0.0
2,This is such an urgent design problem; kudos t...,0.0
3,Is this something I'll be able to install on m...,0.0
4,haha you guys are a bunch of losers.,0.893617


Ahora como ai lab solo acepta valores discretos tendremos que especificar un treshold para definir nuestras dos clases: no ofensivo (0) / ofensivo (1)

In [8]:
dataset2cols[dataset2cols.target >= THRESHOLD]

Unnamed: 0,comment_text,target
4,haha you guys are a bunch of losers.,0.893617
5,ur a sh*tty comment.,0.666667
6,hahahahahahahahhha suck it.,0.457627
11,This is a great story. Man. I wonder if the pe...,0.440000
13,It's ridiculous that these guys are being call...,0.600000
...,...,...
1804852,Xi and his comrades must be smirking over Trum...,0.300000
1804856,Believing in God or not believing in God are p...,0.500000
1804857,"I take your point, but I think you're shooting...",0.700000
1804866,There's no whine like Alberta whine!,0.300000


In [9]:
dataset2cols.insert(2, LABEL_COL_NAME, np.zeros(len(dataset2cols)), True)

In [10]:
dataset2cols

Unnamed: 0,comment_text,target,Label
0,"This is so cool. It's like, 'would you want yo...",0.000000,0.0
1,Thank you!! This would make my life a lot less...,0.000000,0.0
2,This is such an urgent design problem; kudos t...,0.000000,0.0
3,Is this something I'll be able to install on m...,0.000000,0.0
4,haha you guys are a bunch of losers.,0.893617,0.0
...,...,...,...
1804869,"Maybe the tax on ""things"" would be collected w...",0.000000,0.0
1804870,What do you call people who STILL think the di...,0.000000,0.0
1804871,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.0
1804872,Anyone who is quoted as having the following e...,0.621212,0.0


In [11]:
dataset2cols[LABEL_COL_NAME].values[dataset2cols.target >= THRESHOLD] = 1

In [12]:
dataset2cols[dataset2cols.target >= THRESHOLD].describe()

Unnamed: 0,target,Label
count,266089.0,266089.0
mean,0.517449,1.0
std,0.186804,0.0
min,0.3,1.0
25%,0.4,1.0
50%,0.5,1.0
75%,0.644737,1.0
max,1.0,1.0


In [13]:
finalDataset = dataset2cols.reindex(columns=['comment_text',LABEL_COL_NAME])
finalDataset = finalDataset.rename(columns={"comment_text": TEXT_COL_NAME})
finalDataset = setLabels(finalDataset)

In [14]:
finalDataset.head()

Unnamed: 0,Text,Label
0,"This is so cool. It's like, 'would you want yo...",HEALTHY
1,Thank you!! This would make my life a lot less...,HEALTHY
2,This is such an urgent design problem; kudos t...,HEALTHY
3,Is this something I'll be able to install on m...,HEALTHY
4,haha you guys are a bunch of losers.,TOXIC


In [15]:
exportCSV(finalDataset,'train_small_300',(300,300))
exportCSV(finalDataset,'train_small_3k',(3000,3000))
exportCSV(finalDataset,'train_small_30k',(30000,30000))
exportCSV(finalDataset,'train_small_50k',(50000,50000))
exportCSV(finalDataset,'train_small_100k',(100000,100000))

Total of positives:  266089 Total of negatives:  1538785
Total of positives exported:  300 Total of negatives exported:  300
Total of positives:  266089 Total of negatives:  1538785
Total of positives exported:  3000 Total of negatives exported:  3000
Total of positives:  266089 Total of negatives:  1538785
Total of positives exported:  30000 Total of negatives exported:  30000
Total of positives:  266089 Total of negatives:  1538785
Total of positives exported:  50000 Total of negatives exported:  50000
Total of positives:  266089 Total of negatives:  1538785
Total of positives exported:  100000 Total of negatives exported:  100000
Error while deleting file  train_small_100k.csv
Error while deleting file  train_small_100k.zip


Despues de tener nuestro dataset listo  y exportado vamos ahora a trabajar nuestro dataset de validacion que se encuentra acá https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data

Si en el dataset de validacion la clase es 2 entonces es no ofensivo


In [16]:
validation_ds.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [17]:
validation_ds = validation_ds.loc[:, validation_ds.columns.intersection(['tweet','class'])]

In [18]:
validation_ds.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [19]:
validation_ds.insert(2, LABEL_COL_NAME, np.ones(len(validation_ds)), True)

In [20]:
validation_ds.head()

Unnamed: 0,class,tweet,Label
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,1.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0


In [21]:
validation_ds[LABEL_COL_NAME].values[validation_ds['class'] == 2] = 0

In [22]:
validation_ds.head()

Unnamed: 0,class,tweet,Label
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0


In [23]:
validation_ds = validation_ds.rename(columns={"tweet": TEXT_COL_NAME})

In [24]:
validation_ds = validation_ds.loc[:, validation_ds.columns.intersection([TEXT_COL_NAME,LABEL_COL_NAME])]

In [25]:
validation_ds.head()

Unnamed: 0,Text,Label
0,!!! RT @mayasolovely: As a woman you shouldn't...,0.0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1.0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1.0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0


Ahora necesitaremos limpiar nuestro texto de caracteres indeseados para eso usaremos una funcion implementada acá https://stackoverflow.com/questions/54396405/how-can-i-preprocess-nlp-text-lowercase-remove-special-characters-remove-numb


In [26]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    return " ".join(tokens)


In [27]:
validation_ds[TEXT_COL_NAME] = validation_ds[TEXT_COL_NAME].map(lambda s:preprocess(s)) 
validation_ds = setLabels(validation_ds)
validation_ds.head()

Unnamed: 0,Text,Label
0,rt mayasolovely as a woman you shouldn t compl...,HEALTHY
1,rt mleew boy dats cold tyga dwn bad for cuffin...,TOXIC
2,rt urkindofbrand dawg rt sbabylife you ever fu...,TOXIC
3,rt c_g_anderson viva_based she look like a tranny,TOXIC
4,rt shenikaroberts the shit you hear about me m...,TOXIC


In [28]:
exportCSV(validation_ds,'validation_small',(20000,4000))

Total of positives:  20620 Total of negatives:  4163
Total of positives exported:  20000 Total of negatives exported:  4000


BOW
TFIDF
FastText
DistilBERT
BERT
