In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
import torch.nn as nn
import numpy as np
import warnings
import transformers
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
import pandas as pd

In [4]:
# Import pretrained tokenizer and model from https://huggingface.co/YSKartal/bert-base-turkish-cased-turkish_offensive_trained_model
tokenizer = AutoTokenizer.from_pretrained("YSKartal/bert-base-turkish-cased-turkish_offensive_trained_model")
bert = AutoModel.from_pretrained("YSKartal/bert-base-turkish-cased-turkish_offensive_trained_model",return_dict=False, from_tf=True)

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [5]:
# Import datasets
train = pd.read_csv('/content/drive/MyDrive/sentiment_analysis/turkish-offensive-lang-dataset/train.csv')
test = pd.read_csv('/content/drive/MyDrive/sentiment_analysis/turkish-offensive-lang-dataset/test.csv')
valid = pd.read_csv('/content/drive/MyDrive/sentiment_analysis/turkish-offensive-lang-dataset/valid.csv')

In [6]:
frames = [train, test, valid]
df = pd.concat(frames)
df = df[:10000]

In [7]:
# Preprocess text
def filter(text):
    final_text = ''
    for word in text.split():
        if word.startswith('@'):
            continue
        elif word[-3:] in ['com', 'org']:
            continue
        elif word.startswith('pic') or word.startswith('http') or word.startswith('www'):
            continue
        else:
            final_text += word+' '
    return final_text

df['text'] = df['text'].apply(filter)
df

Unnamed: 0,id,text,label
0,26418.0,Gerçekten sizin hikayelerinizi izleyerek mi ye...,0
1,14473.0,Çoook çok bi baklava bi sen zaten,0
2,16107.0,"1) Sn. DÜKEL; Atatürk'ün, Karma E. M. ile başl...",0
3,45908.0,Konfederasyonumuzun Aile ve Sosyal Politikalar...,0
4,12878.0,Hakemler tarih yazıyorlar / 9 kişiye karşı 3-2...,1
...,...,...,...
9995,35585.0,"sahne ışık görüntü işleme hepsi çok hoş olmuş,...",0
9996,26400.0,Hayır 2017 içinde aynı seleri söylüyodunuz 201...,0
9997,43302.0,klübümüz 6 puanı garantilemek için görüşmelere...,0
9998,13619.0,Ben sana içimden geçenleri deli gibi anlatmak ...,0


In [8]:
# Shuffle dataset
from sklearn.utils import shuffle
df = shuffle(df)

In [9]:
# Split data into train, test, valid
from sklearn.model_selection import train_test_split

train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'],
																	random_state = 2021,
																	test_size = 0.2,
																	stratify = df['label'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
																random_state = 2021,
																test_size = 0.5,
																stratify = temp_labels)

In [10]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



In [11]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [12]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)

      # dense layer 3 (Output layer)
      self.fc3 = nn.Linear(512,2)

      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

      x = self.fc1(cls_hs)

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc3(x)

      # apply softmax activation
      x = self.softmax(x)

      return x

In [13]:
# pass the pre-trained Roberta to our define architecture
model = BERT_Arch(bert)

In [14]:
#load weights of the model
path = '/content/drive/MyDrive/turkish_offensive_language.pt'
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [15]:
# get predictions for test data
with torch.no_grad():
  preds = model(test_seq, test_mask)
  preds = preds.detach().cpu().numpy()

# Labeling
predicted_label = []
for pred in preds:
  predicted_label.append(np.argmax(pred))

In [16]:
# confusion matrix
pd.crosstab(test_y, predicted_label)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,678,10
1,14,298


In [17]:
from sklearn.metrics import classification_report
target_names = ['OFFENSIVE', 'NOT OFFENSIVE']
print(classification_report(test_y, predicted_label, target_names=target_names))

               precision    recall  f1-score   support

    OFFENSIVE       0.98      0.99      0.98       688
NOT OFFENSIVE       0.97      0.96      0.96       312

     accuracy                           0.98      1000
    macro avg       0.97      0.97      0.97      1000
 weighted avg       0.98      0.98      0.98      1000

