In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
import torch.nn as nn
import numpy as np
import warnings
import transformers
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
import pandas as pd

In [4]:
# Import dataset from https://huggingface.co/datasets/badmatr11x/hate-offensive-speech
train = pd.read_parquet('/content/drive/MyDrive/sentiment_analysis/english-offensive-lang-dataset/train-00000-of-00001-b57a122b095e5ed1.parquet')
test  = pd.read_parquet('/content/drive/MyDrive/sentiment_analysis/english-offensive-lang-dataset/test-00000-of-00001-10d11e25d2e9ec6e.parquet')
valid = pd.read_parquet('/content/drive/MyDrive/sentiment_analysis/english-offensive-lang-dataset/validation-00000-of-00001-9ea89a9fc1c6b387.parquet')
frames = [train, test, valid]
df = pd.concat(frames)
df.rename(columns={'tweet':'text'}, inplace=True)
df = df[:10000]

In [5]:
# Preprocess text
def filter(text):
    final_text = ''
    for word in text.split():
        if word.startswith('@'):
            continue
        elif word == 'RT':
            continue
        elif word[-3:] in ['com', 'org']:
            continue
        elif word.startswith('pic') or word.startswith('http') or word.startswith('www'):
            continue
        elif word.startswith('!') or word.startswith('&') or word.startswith('-'):
            continue
        else:
            final_text += word+' '
    return final_text

df['text'] = df['text'].apply(filter)
df

Unnamed: 0,label,text
0,2,where's his other half??? i want to indulge my...
1,2,easier for barrack hussein obama to blame guns...
2,2,As soon as a Democrat gets a chance to ? the p...
3,1,GOOD Music Young Money fags
4,2,mass shooting at orlando gay nightclub: what w...
...,...,...
9995,1,Yu a bitch made nigga
9996,2,can #lighttherapy help with or #depression? #a...
9997,2,i am lucky. #i_am #positive #affirmation
9998,2,people actually believe the shit spewing out o...


In [6]:
# Labeling offensive and hatespeech as 1
for i in df.index:
    if df.label.iloc[i] == 0 or df.label.iloc[i] == 1:
        df.at[i, 'label'] = 0
    else:
        df.at[i, 'label'] = 1

In [7]:
# Import pretrained tokenizer and model from https://huggingface.co/cardiffnlp/twitter-roberta-base-offensive
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-offensive")
bert = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-offensive",return_dict=False)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-offensive were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Shuffle dataset
from sklearn.utils import shuffle
df = shuffle(df)

In [9]:
# Split data into train, test, valid
from sklearn.model_selection import train_test_split

train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'],
																	random_state = 2021,
																	test_size = 0.2,
																	stratify = df['label'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
																random_state = 2021,
																test_size = 0.5,
																stratify = temp_labels)

In [10]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [12]:
class Arch(nn.Module):

    def __init__(self, bert):
      
      super(Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)

      # dense layer 3 (Output layer)
      self.fc3 = nn.Linear(512,2)

      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

      x = self.fc1(cls_hs)

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc3(x)

      # apply softmax activation
      x = self.softmax(x)

      return x

In [13]:
# pass the pre-trained model to our define architecture
model = Arch(bert)

In [14]:
#load weights of the model
path = '/content/drive/MyDrive/english_offensive_language.pt'
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [15]:
# get predictions for test data
with torch.no_grad():
  preds = model(test_seq, test_mask)
  preds = preds.detach().cpu().numpy()

# Labeling
predicted_label = []
for pred in preds:
  predicted_label.append(np.argmax(pred))

In [16]:
# confusion matrix
pd.crosstab(test_y, predicted_label)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,369,36
1,29,566


In [17]:
from sklearn.metrics import classification_report
target_names = ['OFFENSIVE', 'NOT OFFENSIVE']
print(classification_report(test_y, predicted_label, target_names=target_names))

               precision    recall  f1-score   support

    OFFENSIVE       0.93      0.91      0.92       405
NOT OFFENSIVE       0.94      0.95      0.95       595

     accuracy                           0.94      1000
    macro avg       0.93      0.93      0.93      1000
 weighted avg       0.93      0.94      0.93      1000

