In [1]:
!pip install tensorflow
!pip install scikit-learn



In [91]:
!pip install torch



In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification

In [77]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
df = pd.read_csv("data.csv")

In [98]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Removing Stopwords
    stop_words = set(stopwords.words('arabic'))
    tokens = [word for word in tokens if word not in stop_words]

    # Removing Punctuation and Special Characters
    tokens = [word for word in tokens if word.isalnum()]

    # Stemming or Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Filter only Arabic words using regex
    arabic_words = [word for word in lemmatized_tokens if re.fullmatch('[\u0600-\u06FF]+', word)]

    return " ".join(arabic_words)

In [None]:
df['preprossing'] = df['content'].apply(preprocess_text)

In [81]:
df = df.drop('content', axis=1)

In [104]:
df = df.dropna()

In [109]:
from sklearn.model_selection import train_test_split

# Assuming df contains your dataset with 'target' and 'content' columns
X = df['preprocessing']
y = df['label']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [119]:
# Load the pre-trained MARBERT model and tokenizer
model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Assuming binary classification (change num_labels accordingly)

# Tokenize and encode the training data
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train.tolist())
)

# Tokenize and encode the validation data
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=128)
val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(y_val.tolist())
)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training parameters
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset) * 3)
epochs = 3

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss}")

# Evaluation on validation set
model.eval()
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
predictions, true_labels = [], []

for batch in tqdm(val_dataloader, desc="Validation"):
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Calculate accuracy on validation set
accuracy = accuracy_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 875/875 [02:56<00:00,  4.94it/s]


Epoch 1/3, Average Loss: 0.4549241573171956


Epoch 2/3: 100%|██████████| 875/875 [02:57<00:00,  4.93it/s]


Epoch 2/3, Average Loss: 0.2433783157542348


Epoch 3/3: 100%|██████████| 875/875 [02:57<00:00,  4.93it/s]


Epoch 3/3, Average Loss: 0.08926058690582535


Validation: 100%|██████████| 188/188 [00:11<00:00, 16.73it/s]

Validation Accuracy: 83.87%



