In [1]:
# Experiment version number; it is interpolated into the prediction column names
# and into the (currently commented-out) model save paths further down.
VERSION = 12

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel,BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
import re
import string
import numpy as np
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Let pandas render up to 150 rows before truncating displayed frames/series.
pd.set_option("display.max_rows", 150)

In [4]:
# Characters that the cleaning step replaces with a space: an Arabic/typographic
# set defined here plus Python's built-in ASCII punctuation.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = "".join([arabic_punctuations, english_punctuations])

In [5]:
def clean_text(text):
    """Strip noise from a raw product string and lower-case it.

    Steps, applied in this order:
      1. remove the literal token "nan" in any casing,
      2. replace URLs, ``&nbsp;``, HTML tags and ``@mentions`` with a space,
      3. unwrap ``#hashtags`` (keep the word, drop the ``#``),
      4. replace every character of ``punctuations_list`` with a space,
      5. turn newlines into spaces and collapse whitespace runs,
      6. squeeze 3+ repetitions of the same character down to 2,
      7. delete digit runs that end on a word boundary (with preceding spaces),
      8. right-strip and lower-case.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Only trailing whitespace is stripped, so a single
        leading space can remain.
    """
    # All patterns are raw strings: in a plain literal "\b" is a backspace
    # character (not a word boundary) and "\S" / "\." are invalid escapes.

    # remove the literal "nan" token, case-insensitively
    text = re.sub(r'\b[nN][aA][nN]\b', '', text)

    # remove urls
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

    # remove &nbsp;
    text = re.sub(r'&nbsp;', ' ', text)

    # remove html tags
    text = re.sub(r'<.*?>+', ' ', text)

    # remove @user mentions
    text = re.sub(r'@[^\s]+', ' ', text)

    # replace #word with word
    text = re.sub(r'#([^\s]+)', r'\1', text)

    # remove punctuation (escaped so that ], ^, - and \ are taken literally)
    text = re.sub('[%s]' % re.escape(punctuations_list), ' ', text)

    # remove new lines
    text = re.sub(r'\n', ' ', text)

    # collapse multiple whitespace characters into one space
    text = re.sub(r'\s+', ' ', text)

    # replace 3 or more consecutive identical characters by 2
    text = re.sub(r"(.)\1\1+", r"\1\1", text)

    # remove numbers (together with the spaces before them), then right-strip
    text = re.sub(r'\s*[0-9]+\b', '', text).rstrip()

    # lower case
    return text.lower()

In [6]:
def normalize_arabic(text):
    """Fold Arabic letter variants onto one canonical letter each.

    The substitutions are applied one after another, in the order listed.

    Args:
        text: the string to normalise.

    Returns:
        The string with the listed letters replaced; everything else is untouched.
    """
    substitutions = (
        ("[إأآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),       # alef maqsura -> yeh
        ("ة", "ه"),       # teh marbuta -> heh
        ("گ", "ك"),       # gaf -> kaf
    )
    for pattern, letter in substitutions:
        text = re.sub(pattern, letter, text)
    return text

In [7]:
def remove_emojis(text):
    """Delete every character that falls in the code-point ranges listed below.

    NOTE(review): the ranges overlap heavily. ``U+24C2-U+1F251`` together with
    ``U+10000-U+10FFFF`` already covers everything from U+24C2 upwards, so this
    strips far more than emoji (e.g. CJK and the Arabic presentation-form
    blocks) - confirm that this is intended.

    Args:
        text: the string to filter.

    Returns:
        ``text`` with all matching characters removed.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # (listed twice in the character class; harmless)
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # all supplementary planes
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",                 # zero-width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",                 # variation selector-16
        u"\u3030",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", re.UNICODE)
    return matcher.sub('', text)

In [35]:
def preprocess_data(text):
    """Run the full text pipeline: clean, normalise Arabic letters, drop emojis.

    Args:
        text: raw input string.

    Returns:
        The string after ``clean_text``, ``normalize_arabic`` and
        ``remove_emojis`` have been applied, in that order.
    """
    cleaned = clean_text(text)
    normalised = normalize_arabic(cleaned)
    return remove_emojis(normalised)

# Read Data

In [76]:
# Load the train/validation splits and the three held-out test sets from CSV.
# Paths are relative to the notebook's working directory.
train      = pd.read_csv("datasets/train_dataset/train_data_v3.csv")
validation = pd.read_csv("datasets/validation_dataset/validation_data_v3.csv")
test_1     = pd.read_csv("datasets/test_dataset/test_data_Nov_v1.csv")
test_2     = pd.read_csv("datasets/test_dataset/test_data_Dec_v1.csv")
test_3     = pd.read_csv("datasets/test_dataset/test_data_Nov_Dec_Oct_v3.csv")

In [77]:
# Inspect the December test set's column names before renaming them below.
test_2.columns

Index(['created_at', 'product_id', 'product_name', 'product_description',
       'product_type', 'product_status', 'brand_name', 'brand_description',
       'day', 'model_prediction', 'manual_label'],
      dtype='object')

## Rename Columns in test data

In [78]:
# Align the test sets' text column names with the training schema
# (`final_product_name` / `final_product_description`).
# test_1 is expected to carry `*_cleaned` columns; rename() ignores names that are absent.
test_1 = test_1.rename(columns={"product_name_cleaned":"final_product_name",
                         "product_description_cleaned":"final_product_description"})

test_2 = test_2.rename(columns={"product_name":"final_product_name",
                         "product_description":"final_product_description"})

# Label 2 marks products whose toxicity still needs confirmation; fold them into label 0.
test_2['manual_label'] = test_2['manual_label'].replace([2],[0]) # products need to confirm if they are toxic or not

In [79]:
# Larger training set, used further down to train the second ("big") model.
all_train_data = pd.read_csv("datasets/train_dataset/all_train_data_v3.csv")

In [80]:
# Row counts of every loaded frame, in load order.
len(train), len(validation), len(test_1), len(test_2), len(test_3), len(all_train_data)

(65123, 23189, 2731, 1166, 80779, 117127)

# Drop unnecessary columns

In [81]:
# Reduce every frame to the two text fields plus the label. test_1 and test_2
# store the label under a different name, so it is renamed to `toxicity`.
_TEXT_COLS = ['final_product_name', 'final_product_description']
_KEEP_COLS = _TEXT_COLS + ['toxicity']

train          = train[_KEEP_COLS]
validation     = validation[_KEEP_COLS]
test_1         = test_1[_TEXT_COLS + ['original_label']].rename(columns={'original_label': 'toxicity'})
test_2         = test_2[_TEXT_COLS + ['manual_label']].rename(columns={'manual_label': 'toxicity'})
test_3         = test_3[_KEEP_COLS]
all_train_data = all_train_data[_KEEP_COLS]

In [82]:
# Report the size of each frame after column selection.
# NOTE(review): the line labelled "test" prints the size of the validation split.
print("train {}".format(len(train)))        
print("test {}".format(len(validation)))                   
print("all_train {}".format(len(all_train_data)))          
print("Nov {}".format(len(test_1)))                      
print("Dec {}".format(len(test_2)))                      
print("new Nov Dec {}".format(len(test_3)))

train 65123
test 23189
all_train 117127
Nov 2731
Dec 1166
new Nov Dec 80779


In [83]:
# Tag every frame with the name of its split so the frames can be concatenated,
# preprocessed together and separated again by tag afterwards.
# `.assign` returns a new frame rather than writing a column into the result of
# an earlier column selection, which can raise pandas' SettingWithCopyWarning.
train          = train.assign(tag="train")
validation     = validation.assign(tag="validation")
all_train_data = all_train_data.assign(tag="all_train")
test_1         = test_1.assign(tag="test_1")
test_2         = test_2.assign(tag="test_2")
test_3         = test_3.assign(tag="test_3")

In [84]:
# Stack all splits into one frame (fresh 0..n-1 index) so preprocessing runs once;
# the `tag` column records which split each row came from.
data = pd.concat([train, validation, all_train_data, test_1, test_2, test_3], ignore_index=True)
print(data.tag.value_counts())
len(data), data.columns

tag
all_train     117127
test_3         80779
train          65123
validation     23189
test_1          2731
test_2          1166
Name: count, dtype: int64


(290115,
 Index(['final_product_name', 'final_product_description', 'toxicity', 'tag'], dtype='object'))

In [85]:
# Drop rows that have no product name (with a single-column subset, how='all'
# behaves the same as the default).
data = data.dropna(subset=['final_product_name'], how='all')
print("size of data {}".format(len(data)))
# Renumber rows 0..n-1; the positional lookups used later rely on this.
data = data.reset_index(drop= True)
# Replace every remaining NaN, in any column, with an empty string.
data = data.fillna('')

size of data 289999


In [86]:
# Rows left per split after dropping products without a name.
data.tag.value_counts()

tag
all_train     117069
test_3         80779
train          65065
validation     23189
test_1          2731
test_2          1166
Name: count, dtype: int64

In [91]:
# Join name and description into one free-text field per product.
# Iterating the two columns with zip walks them positionally, so the result does
# not depend on the index being exactly 0..n-1 and avoids two label lookups per row.
data['soup_of_text'] = [
    f"{name} {description}"
    for name, description in zip(data['final_product_name'], data['final_product_description'])
]

# Cleaned / normalised text that is later passed to the tokenizer.
data['soup_of_text_clean'] = data['soup_of_text'].apply(preprocess_data)

# Store labels as 64-bit integers (raises if a label cannot be converted).
data['toxicity'] = data['toxicity'].astype('int64')

In [92]:
# Class balance (toxicity 0/1 counts) within each split.
data.groupby(['tag']).toxicity.value_counts()

tag         toxicity
all_train   0           75070
            1           41999
test_1      0            2122
            1             609
test_2      0             695
            1             471
test_3      0           48897
            1           31882
train       0           34751
            1           30314
validation  1           11681
            0           11508
Name: count, dtype: int64

In [93]:
def _rows_tagged(tag_name):
    """Return the rows of `data` whose `tag` equals `tag_name`, re-indexed from 0."""
    return data[data['tag'] == tag_name].reset_index(drop=True)


# Separate the preprocessed frame back into its original splits.
train_data   = _rows_tagged('train')
val_data     = _rows_tagged('validation')
all_train    = _rows_tagged('all_train')
test_1_data  = _rows_tagged('test_1')
test_2_data  = _rows_tagged('test_2')
test_3_data  = _rows_tagged('test_3')

In [94]:
# Sanity check: sizes of the re-separated splits (all_train is not included here).
len(train_data),len(val_data), len(test_1_data), len(test_2_data), len(test_3_data)

(65065, 23189, 2731, 1166, 80779)

In [96]:
# For each split, pull out the cleaned text (model input) and the integer label (target).
train_texts  = train_data['soup_of_text_clean']
train_labels = train_data['toxicity']

val_texts    = val_data['soup_of_text_clean']
val_labels   = val_data['toxicity']

# Larger training set used for the second model.
all_train_texts  = all_train['soup_of_text_clean']
all_train_labels = all_train['toxicity']

# test_1 / test_2 / test_3 come from the Nov, Dec and Nov_Dec_Oct CSV files respectively.
test_1_text   = test_1_data['soup_of_text_clean']
test_1_labels = test_1_data['toxicity']

test_2_text   = test_2_data['soup_of_text_clean']
test_2_labels = test_2_data['toxicity']

test_3_text   = test_3_data['soup_of_text_clean']
test_3_labels = test_3_data['toxicity']

In [97]:
# Checkpoint shared by the tokenizer and both classifiers.
# NOTE(review): roberta-base is, to my knowledge, pretrained on English text while the
# preprocessing above targets Arabic - confirm this is the intended checkpoint.
BERT_MODEL_NAME = 'roberta-base'
# NOTE(review): `bert` is not referenced again in the visible cells; it looks like an
# unnecessary extra model load - confirm before removing.
bert = AutoModel.from_pretrained(BERT_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
# Two independent binary classifiers: model1 is trained on `train`, model2 on `all_train`.
model1 = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=2)
model2 = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=2)

model.safetensors: 100%|███████████████| 499M/499M [00:11<00:00, 45.1MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassi

In [41]:
# get length of all the messages in the train set
#seq_len = [len(tokenizer.encode(i)) for i in train_texts]
#pd.Series(seq_len).hist(bins = 30)

In [98]:
# Maximum number of tokens kept per example; longer inputs are truncated by the tokenizer calls below.
max_seq_len = 200

In [99]:
def _encode(texts):
    """Tokenize a Series of strings with truncation to `max_seq_len` and padding enabled."""
    return tokenizer(texts.to_list(),
                     truncation=True,
                     padding=True,
                     max_length=max_seq_len)


# Every split is tokenized with identical settings.
train_encodings  = _encode(train_texts)
val_encodings    = _encode(val_texts)
all_encodings    = _encode(all_train_texts)
test_1_encodings = _encode(test_1_text)
test_2_encodings = _encode(test_2_text)
test_3_encodings = _encode(test_3_text)

In [100]:
class TweetDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to a per-example sequence
            (anything exposing ``.items()``, such as the tokenizer output).
        labels: object exposing ``.to_list()`` (e.g. a pandas Series), one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a plain list for positional access.
        self.labels = labels.to_list()

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [101]:
# Wrap each split's encodings and labels in a TweetDataset for use with the Trainer.
train_dataset     = TweetDataset(train_encodings, train_labels)
val_dataset       = TweetDataset(val_encodings, val_labels)
all_train_dataset = TweetDataset(all_encodings, all_train_labels)
test_1_dataset    = TweetDataset(test_1_encodings, test_1_labels)
test_2_dataset    = TweetDataset(test_2_encodings, test_2_labels)
test_3_dataset    = TweetDataset(test_3_encodings, test_3_labels)

In [102]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

    Args:
        pred: object with ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The result tuple is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [103]:
# Hyper-parameters shared by both trainers below.
training_args = TrainingArguments(
    output_dir='./results',            # directory the Trainer writes to
    num_train_epochs=20,
    do_train=True,
    do_eval=False,                     # evaluation is done afterwards via predict()
    per_device_train_batch_size=32,
    warmup_steps=500,
    learning_rate = 5e-5,
    weight_decay=0.01,
    logging_strategy='epoch'           # log the training loss once per epoch
    )

In [104]:
# Trainer for the "small" model: model1 fine-tuned on the `train` split only.
# No eval_dataset is supplied; the held-out sets are scored later with predict().
trainer1 = Trainer(
    model=model1,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune model1 on the `train` split (long-running cell).
trainer1.train()



Step,Training Loss
2034,0.2434
4068,0.1189
6102,0.0937
8136,0.0781
10170,0.0719


In [None]:
# Score the validation set and the three test sets with the small model.
val_scores    = trainer1.predict(val_dataset)
test_1_scores = trainer1.predict(test_1_dataset)
test_2_scores = trainer1.predict(test_2_dataset)
test_3_scores = trainer1.predict(test_3_dataset)

In [None]:
# Element 0 of each predict() result holds the per-class scores; arg-max over
# axis 1 gives the predicted class id per example.
val_pred    = np.argmax(val_scores[0], axis=1)
test_1_pred = np.argmax(test_1_scores[0], axis=1)
test_2_pred = np.argmax(test_2_scores[0], axis=1)
test_3_pred = np.argmax(test_3_scores[0], axis=1)

In [None]:
# Store the small model's predictions next to the data, in a column named after VERSION.
col = 'v{}_small_model_results'.format(VERSION)

val_data[col]    = val_pred
test_1_data[col] = test_1_pred
test_2_data[col] = test_2_pred
test_3_data[col] = test_3_pred

In [None]:
# Classification reports for the small model (trained on `train`).
# Each report pairs a split's ground-truth labels with the predictions made for
# that same split; the label variables are the `*_labels` series defined earlier.
print("train over {}, and validate over {}".format(len(train_dataset), len(val_dataset)))
print(classification_report(val_labels, val_pred))

print("train over {}, and test over {} (NOV)".format(len(train_dataset), len(test_1_dataset)))
print(classification_report(test_1_labels, test_1_pred))

print("train over {}, and test over {} (DEC)".format(len(train_dataset), len(test_2_dataset)))
print(classification_report(test_2_labels, test_2_pred))

In [None]:
# Small-model report on test_3 (loaded from the Nov_Dec_Oct CSV), scored against its own labels.
print("train over {}, and test over {} (NOV/DEC/OCT)".format(len(train_dataset), len(test_3_dataset)))
print(classification_report(test_3_labels, test_3_pred))

In [None]:
#trainer1.save_model('models/toxic_model_v{}_less'.format(VERSION))

In [None]:
# Trainer for the "big" model: model2 fine-tuned on the larger `all_train` set.
# NOTE(review): this reuses `training_args`, so it writes to the same
# output_dir ('./results') as trainer1 - confirm that overwriting is acceptable.
trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=all_train_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune model2 on the `all_train` set (long-running cell).
trainer2.train()

In [None]:
# Score the three test sets with the big model.
test_1_scores_2 = trainer2.predict(test_1_dataset)
test_2_scores_2 = trainer2.predict(test_2_dataset)
test_3_scores_2 = trainer2.predict(test_3_dataset)

In [None]:
# Predicted class id per example: arg-max over the per-class scores (element 0 of each result).
test_1_pred_2 = np.argmax(test_1_scores_2[0], axis=1)
test_2_pred_2 = np.argmax(test_2_scores_2[0], axis=1)
test_3_pred_2 = np.argmax(test_3_scores_2[0], axis=1)

In [None]:
# Store the big model's predictions next to the data, in a column named after VERSION.
col = "v{}_big_model_results".format(VERSION)

test_1_data[col] = test_1_pred_2
test_2_data[col] = test_2_pred_2
test_3_data[col] = test_3_pred_2

In [None]:
# Classification reports for the big model (trained on `all_train`).
# Each report pairs a test set's own labels with the big model's predictions for it.
print("train over {}, and test over {} (NOV)".format(len(all_train_dataset), len(test_1_dataset)))
print(classification_report(test_1_labels, test_1_pred_2))

print("train over {}, and test over {} (DEC)".format(len(all_train_dataset), len(test_2_dataset)))
print(classification_report(test_2_labels, test_2_pred_2))

# test_3 was loaded from the Nov_Dec_Oct CSV.
print("train over {}, and test over {} (NOV/DEC/OCT)".format(len(all_train_dataset), len(test_3_dataset)))
print(classification_report(test_3_labels, test_3_pred_2))

In [None]:
#trainer2.save_model('models/toxic_model_v{}'.format(VERSION))