In [1]:
! pip install accelerate -U
! pip install transformers
! pip install datasets

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Import data

In [3]:
ROOT_DIR = '/content/'
train_df = pd.read_csv(ROOT_DIR + "train.csv")
test_df = pd.read_csv(ROOT_DIR + "test.csv")

train_df["label"] = train_df['off']
train_df = train_df.rename(columns={'comment': 'text'})
train_df['off'].replace({1: 'OF', 0: 'NOF'},inplace = True)
train_df.drop(columns = "Unnamed: 0",inplace = True)

test_df["label"] = test_df['off']
test_df = test_df.rename(columns={'comment': 'text'})
test_df['off'].replace({1: 'OF', 0: 'NOF'},inplace = True)
test_df.drop(columns = "Unnamed: 0",inplace = True)

train_df.head()

Unnamed: 0,text,off,label
0,فنانين الكبت والفساد .عقلية جنسية لا غير. العف...,OF,1
1,الدعارة هربت منها في المحمدية و سكنت في بوزنيق...,OF,1
2,كون غير خريتي و مدرتيش هادشي,OF,1
3,لا حول ولا قوة الا بالله العلي العظيم لا حول و...,NOF,0
4,الله يرحم الوالدين عندي مشكل كبيييير و محتاجة ...,NOF,0


## Create variables to use later

In [4]:
id2label = {1: 'OF', 0: 'NOF'}
label2id = {'OF' : 1, 'NOF' : 0}
NUM_LABELS= len(id2label)

In [5]:
NUM_LABELS

2

# Create Dataset object and Datalaoder

In [6]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

class MyDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_seq_length=256):

        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        row = self.data.iloc[idx]
        text = row["text"]
        label = row["label"]

        #tokenize the text
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors='pt'
        )

        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }


# Specify the checkpoint and tokenizer
checkpoint = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

max_seq_length = 256
batch_size = 32

#create train dataloader
train_dataset = MyDataset(train_df, tokenizer, max_seq_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

#create test dataloader
test_dataset = MyDataset(test_df, tokenizer, max_seq_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Model

In [7]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoModel, AutoConfig, AutoTokenizer
from tqdm import tqdm


class MyTopicPredictionModel(nn.Module):
    def __init__(self, checkpoint, num_topics):
        super(MyTopicPredictionModel, self).__init__()

        self.num_topics = num_topics

        self.bert = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, output_hidden_states=True))
        self.dropout = nn.Dropout(0.1)
        self.lstm1 = nn.LSTM(self.bert.config.hidden_size, 512, num_layers=4, dropout=0.1, bidirectional=False, batch_first=True)
        self.classifier = nn.Linear(512, num_topics)

    def forward(self, input_ids=None, attention_mask=None, labels=None):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        sequence_outputs = self.dropout(last_hidden_state)
        lstm_out1, _ = self.lstm1(sequence_outputs)
        logits = F.softmax(self.classifier(lstm_out1[:, -1, :]))

        return logits


model = MyTopicPredictionModel(checkpoint='UBC-NLP/MARBERT', num_topics=NUM_LABELS)

# Freeze weights
for param in model.bert.parameters():
  param.requires_grad = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

MyTopicPredictionModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [8]:
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-5 )
criterion = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):

  model.train()

  total_loss = 0.0
  total_acc = 0.0

  correct_predictions = 0
  total_predictions = 0

  progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
  for batch in progress_bar:

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].to(device)

    # Predict
    logits = model(input_ids=input_ids, attention_mask=attention_mask)

    # calculate and save loss
    loss = criterion(logits, labels)
    total_loss += loss.item()

    # Optimize
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # Calculate and save accuracy
    predicted_labels = torch.argmax(logits, dim=1)
    correct_predictions += (predicted_labels == labels).sum().item()
    total_predictions += labels.size(0)
    accuracy = correct_predictions / total_predictions
    total_acc += accuracy

  average_loss = total_loss / len(train_dataloader)
  average_accuarcy = total_acc / len(train_dataloader)


  model.eval()
  with torch.inference_mode():

    total_test_loss = 0.0
    total_test_acc = 0.0

    correct_test_predictions = 0
    total_test_predictions = 0

    progress_bar = tqdm(test_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in progress_bar:

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # predict
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # calculate and save loss
        test_loss = criterion(logits, labels)
        total_test_loss += test_loss.item()

        # Calculate and save accuracy
        predicted_labels = torch.argmax(logits, dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.size(0)
        test_accuracy = correct_predictions / total_predictions
        total_test_acc += test_accuracy

    average_test_loss = total_test_loss / len(test_dataloader)
    average_test_accuarcy = total_test_acc / len(test_dataloader)


    progress_bar.set_postfix({"Train loss": loss.item(),
                              "Train accuracy": accuracy,
                              "Test loss": test_loss.item(),
                              "Test accuracy": test_accuracy})


    print(f"Epoch [{epoch + 1}/{num_epochs}] | Train Loss: {average_loss:.4f} | Test Loss: {average_test_loss:.4f} | Train Accuracy: {average_accuarcy:.4f} | Test Accuracy: {average_test_accuarcy:.4f} ")


  logits = F.softmax(self.classifier(lstm_out1[:, -1, :]))
Epoch 1/10: 100%|██████████| 201/201 [02:20<00:00,  1.43it/s]
Epoch 1/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [1/10] | Train Loss: 0.6902 | Test Loss: 0.6809 | Train Accuracy: 0.5422 | Test Accuracy: 0.5398 


Epoch 2/10: 100%|██████████| 201/201 [02:34<00:00,  1.31it/s]
Epoch 2/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [2/10] | Train Loss: 0.6732 | Test Loss: 0.6617 | Train Accuracy: 0.5676 | Test Accuracy: 0.5824 


Epoch 3/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 3/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [3/10] | Train Loss: 0.5859 | Test Loss: 0.5482 | Train Accuracy: 0.6625 | Test Accuracy: 0.7118 


Epoch 4/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 4/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [4/10] | Train Loss: 0.5441 | Test Loss: 0.5351 | Train Accuracy: 0.7637 | Test Accuracy: 0.7564 


Epoch 5/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 5/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [5/10] | Train Loss: 0.5358 | Test Loss: 0.5230 | Train Accuracy: 0.7597 | Test Accuracy: 0.7655 


Epoch 6/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 6/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [6/10] | Train Loss: 0.5276 | Test Loss: 0.5393 | Train Accuracy: 0.7703 | Test Accuracy: 0.7703 


Epoch 7/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 7/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [7/10] | Train Loss: 0.5244 | Test Loss: 0.5262 | Train Accuracy: 0.7810 | Test Accuracy: 0.7775 


Epoch 8/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 8/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [8/10] | Train Loss: 0.5235 | Test Loss: 0.5239 | Train Accuracy: 0.7773 | Test Accuracy: 0.7751 


Epoch 9/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 9/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]


Epoch [9/10] | Train Loss: 0.5244 | Test Loss: 0.5477 | Train Accuracy: 0.7785 | Test Accuracy: 0.7728 


Epoch 10/10: 100%|██████████| 201/201 [02:34<00:00,  1.30it/s]
Epoch 10/10: 100%|██████████| 51/51 [00:30<00:00,  1.67it/s]

Epoch [10/10] | Train Loss: 0.5168 | Test Loss: 0.5110 | Train Accuracy: 0.7849 | Test Accuracy: 0.7857 





# Test the model

In [10]:
model.eval()


test_predictions = []
test_true_labels = []
test_texts = []


with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)  # True labels

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(logits, dim=1) # predicted labels


        predicted_labels = predicted_labels.cpu().numpy()
        true_labels = labels.cpu().numpy()
        text_batch = batch["input_ids"]

        # Save values in lists
        test_predictions.extend(predicted_labels.tolist())
        test_true_labels.extend(true_labels.tolist())
        test_texts.extend(text_batch)


# label 1 corresponds to 'OF' offensive and label 0 corresponds to 'NOF' not offensive:
test_true_labels = [id2label[label] for label in test_true_labels]
test_predictions = [id2label[label] for label in test_predictions]

  logits = F.softmax(self.classifier(lstm_out1[:, -1, :]))


In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('UBC-NLP/MARBERT')

# Convert token IDs back to human-readable text
decoded_texts = [tokenizer.decode(text, skip_special_tokens=True) for text in test_texts]


In [12]:
import pandas as pd

data = {
    'text': decoded_texts,
    'true_Label': test_true_labels,
    'predicted_Label': test_predictions
}


results = pd.DataFrame(data)

results.head()

Unnamed: 0,text,true_Label,predicted_Label
0,تبهديل تسولو البنات في الشارع,OF,OF
1,انا مستعد بي مسعدة بسيطة في وجه اللة ربي يادير...,NOF,NOF
2,لاحول ولاقوة الا بالله العلي العظيم,NOF,NOF
3,حتى هاد العايلة لي جالس وسط منهوم خص يدلمهوم ا...,OF,OF
4,لا حول ولا قوة الا بالله اش هذا المستوى اصلا ر...,OF,OF


In [14]:
test_accuracy = (results["true_Label"] == results["predicted_Label"]).sum()/ results.shape[0]
print(f"Test accuracy : {test_accuracy*100:.2f} %")

Test accuracy : 79.31 %


In [None]:
final_results = results[results["predicted_Label"]=='OF'][["text","predicted_Label"]]

In [None]:
final_results.head()

Unnamed: 0,text,predicted_Label
3,حتى هاد العايلة لي جالس وسط منهوم خص يدلمهوم ا...,OF
4,لا حول ولا قوة الا بالله اش هذا المستوى اصلا ر...,OF
5,لا حول ولا قوة الا بالله العلي العظيم واتوب ال...,OF
6,صحافيه تقول جبوها من جوطيه ومدام قنانه جبوها م...,OF
7,وا ناري اشنو واقع في هاد البلاد الوالدين طغاو ...,OF


In [9]:
torch.save(model.state_dict(), '/content/model.pth')

# Create files to deploy in app

In [None]:
%%writefile model.py

import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoModel, AutoConfig, AutoTokenizer
from tqdm import tqdm

def create_model(checkpoint = "UBC-NLP/MARBERT" , NUM_LABELS = 2 ):
  class MyTopicPredictionModel(nn.Module):
      def __init__(self, checkpoint, num_topics):
          super(MyTopicPredictionModel, self).__init__()

          self.num_topics = num_topics

          self.bert = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, output_hidden_states=True))
          self.dropout = nn.Dropout(0.1)
          self.lstm1 = nn.LSTM(self.bert.config.hidden_size, 512, num_layers=4, dropout=0.1, bidirectional=False, batch_first=True)
          self.classifier = nn.Linear(512, num_topics)

      def forward(self, input_ids=None, attention_mask=None, labels=None):

          outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
          last_hidden_state = outputs.last_hidden_state
          sequence_outputs = self.dropout(last_hidden_state)
          lstm_out1, _ = self.lstm1(sequence_outputs)
          logits = F.softmax(self.classifier(lstm_out1[:, -1, :]))

          return logits


  model = MyTopicPredictionModel(checkpoint=checkpoint, num_topics=NUM_LABELS)

  # Freeze weights
  for param in model.bert.parameters():
    param.requires_grad = False

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  return model

def create_tokenizer(checkpoint= "UBC-NLP/MARBERT"):
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  return tokenizer

Overwriting model.py


In [None]:
%%writefile predict.py

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification


#predict labels for texts
def predict_labels(texts, tokenizer, model, max_seq_length=256, batch_size=32):
    # Tokenize texts
    inputs = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt'
    )

    # Create DataLoader
    dataset = torch.utils.data.TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # Predict labels
    model.eval()
    predicted_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to('cpu')
            attention_mask = batch[1].to('cpu')
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_class = torch.argmax(outputs, dim=1)
            predicted_labels.extend(predicted_class.cpu().numpy())

    return predicted_labels

#add predicted labels to dataframe
def predictions_dataframe(df, tokenizer, model, id2label  = {1: 'OF', 0: 'NOF'} ,max_seq_length=256, batch_size=32):
    texts = df['text'].tolist()
    predicted_labels = predict_labels(texts, tokenizer, model, max_seq_length, batch_size)
    predicted_labels = [id2label[label] for label in predicted_labels]
    df['predicted_label'] = predicted_labels
    return df[df['predicted_label'] == "OF"]


Overwriting predict.py


In [6]:
from model import create_model #, create_tokenizer
from web_scraping import get_comments_from_url  , youtube_data_api
from preprocessing import preprocess_arabic_text
from predict import predictions_dataframe
import pandas as pd
import torch

#import Model
model = create_model()
model.load_state_dict(torch.load("/content/model.pth"))

# import tokenizer
checkpoint= "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Data scraping and preprocessing
DEVELOPER_KEY = "AIzaSyAjxeM_uYL3XtSnr5EZjXMeuVod__CV3fo"

YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

youtube = youtube_data_api(YOUTUBE_API_SERVICE_NAME , YOUTUBE_API_VERSION  , DEVELOPER_KEY)
#YouTube video URL here
youtube_url = "https://youtu.be/IrkFeijDREk?si=8bNXdys7TmQFdjrO"

#get comments from the video URL
video_comments_data = get_comments_from_url(youtube , url = youtube_url, max_comments=100)

#df from the collected comments data
video_comments_df = pd.DataFrame(video_comments_data, columns=["text"])

video_comments_df['Text_pro'] = video_comments_df['text'].apply(preprocess_arabic_text)

data = pd.DataFrame({"text" : video_comments_df['Text_pro']})


# predict

results = predictions_dataframe(data, tokenizer, model)

results

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [None]:

%%writefile app.py

def predict_from_ytb(youtube_url):
    import pandas as pd
    import torch
    from transformers import AutoTokenizer

    from model import create_model
    from web_scraping import get_comments_from_url  , youtube_data_api
    from preprocessing import preprocess_arabic_text
    from predict import predictions_dataframe


    #import Model
    model = create_model()
    model.load_state_dict(torch.load("model.pth",map_location=torch.device('cpu')))

    # import tokenizer
    checkpoint= "UBC-NLP/MARBERT"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Data scraping and preprocessing
    DEVELOPER_KEY = "AIzaSyAjxeM_uYL3XtSnr5EZjXMeuVod__CV3fo"

    YOUTUBE_API_SERVICE_NAME = "youtube"
    YOUTUBE_API_VERSION = "v3"

    youtube = youtube_data_api(YOUTUBE_API_SERVICE_NAME , YOUTUBE_API_VERSION  , DEVELOPER_KEY)

    #get comments from the video URL
    video_comments_data = get_comments_from_url(youtube , url = youtube_url, max_comments=100)

    #df from the collected comments data
    video_comments_df = pd.DataFrame(video_comments_data, columns=["text"])

    video_comments_df['Text_pro'] = video_comments_df['text'].apply(preprocess_arabic_text)

    data = pd.DataFrame({"text" : video_comments_df['Text_pro']})


    # predict

    results = predictions_dataframe(data, tokenizer, model)

    return results

import gradio as gr
import pandas as pd

# Create the Gradio interface
gr.Interface(fn=predict_from_ytb,
             inputs=gr.inputs.Textbox(lines=5, label="Enter youtube link"),
             outputs= gr.outputs.Dataframe(label="results"),
             title="Toxic comments detection").launch()
