# AraBERT Sentiment Analysis Project


## 1) Data Loading and Preparation

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
try:
    import transformers, datasets, torch, nbformat
    print('Dependencies already installed.')
except Exception as e:
    print('Installing dependencies... (may take several minutes)')
    !pip install -q transformers datasets "torch>=1.13" arabert python-arabic-reshaper farasa

print('\nPython environment:')
import sys, torch
print('Python', sys.version.split()[0])
print('Torch available:', torch.cuda.is_available(), 'GPU count:', torch.cuda.device_count())

Dependencies already installed.

Python environment:
Python 3.12.12
Torch available: True GPU count: 1


In [17]:
import os, pandas as pd

base_dir = '/content/drive/MyDrive/nlp_project/data/raw'
os.makedirs(base_dir, exist_ok=True)
print('Put your raw .txt files under', base_dir, 'in Positive/Negative subfolders.')

data = []
for label_folder in ['Positive', 'Negative']:
    folder_path = os.path.join(base_dir, label_folder)
    if not os.path.exists(folder_path):
        print('Warning — folder does not exist:', folder_path)
        continue
    label = 1 if label_folder.lower() == 'positive' else 0
    print(f'Reading {folder_path} → label {label}')
    for file in os.listdir(folder_path):
        if file.endswith('.txt'):
            file_path = os.path.join(folder_path, file)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read().strip()
                if text:
                    data.append({'tweet': text, 'label': label})
                else:
                    print('Skipped empty file:', file_path)

df = pd.DataFrame(data)
print('\nTotal samples:', len(df))
save_dir = '/content/data/processed'
os.makedirs(save_dir, exist_ok=True)
labeled_csv = os.path.join(save_dir, '/content/arabic_tweets_labeled.csv')
df.to_csv(labeled_csv, index=False, encoding='utf-8')
print('Saved labeled CSV to', labeled_csv)

Put your raw .txt files under /content/drive/MyDrive/nlp_project/data/raw in Positive/Negative subfolders.
Reading /content/drive/MyDrive/nlp_project/data/raw/Positive → label 1
Reading /content/drive/MyDrive/nlp_project/data/raw/Negative → label 0
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative234.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative352.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative178.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative473.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative294.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative191.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative184.txt
Skipped empty file: /content/drive/MyDrive/nlp_project/data/raw/Negative/negative176.txt
Skipped empty file: /content/drive/MyDr

## 2) Data Cleaning

In [18]:
import pandas as pd, re, os
input_csv = '/content/arabic_tweets_labeled.csv'
if not os.path.exists(input_csv):
    raise FileNotFoundError('Labeled CSV not found. Run the previous cell or change path.')

df = pd.read_csv(input_csv)

def clean_arabic_text(text):
    text = str(text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[@#]\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[\u064B-\u0652]", "", text)
    text = re.sub(r"[^\w\s\u0600-\u06FF]", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['tweet'] = df['tweet'].apply(clean_arabic_text)
df = df[df['tweet'].str.strip() != '']
cleaned_csv = '/content/arabic_tweets_cleaned.csv'
df.to_csv(cleaned_csv, index=False, encoding='utf-8')
print('Cleaned saved to', cleaned_csv)
print('Remaining samples:', len(df))
df.head(5)

Cleaned saved to /content/arabic_tweets_cleaned.csv
Remaining samples: 1984


Unnamed: 0,tweet,label
0,و الله و بالنهاية يكونو رائعون,1
1,الله ياريت يبقى فى اخلاق كدة فى الوقت دة,1
2,جامدة جد,1
3,صباح الدرب للجنة,1
4,بارك الله لكما وبارك عليكما وجمع بينكما على خير,1


## 3) Data Splitting

In [19]:
import pandas as pd, os
from sklearn.model_selection import train_test_split

cleaned_csv = '/content/arabic_tweets_cleaned.csv'
df = pd.read_csv(cleaned_csv)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

os.makedirs('/content/data/processed', exist_ok=True)
train_df.to_csv('/content/data/processed/train.csv', index=False)
test_df.to_csv('/content/data/processed/test.csv', index=False)
print('Train size:', len(train_df), 'Test size:', len(test_df))
print('Train distribution:\n', train_df['label'].value_counts())
print('Test distribution:\n', test_df['label'].value_counts())

Train size: 1587 Test size: 397
Train distribution:
 label
1    800
0    787
Name: count, dtype: int64
Test distribution:
 label
1    200
0    197
Name: count, dtype: int64


In [20]:
!pip install -q --upgrade transformers accelerate datasets

## 4) AraBERT Model Training

In [21]:
import os
import random
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


MODEL_NAME = "aubmindlab/bert-base-arabertv2"
TRAIN_CSV = "/content/data/processed/train.csv"
TEST_CSV = "/content/data/processed/test.csv"
OUTPUT_DIR = "/content/models/arabertv2-sentiment"
BATCH_SIZE = 8
NUM_EPOCHS = 3
MAX_LENGTH = 128
SEED = 42


def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
set_seed(SEED)

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    return tokenizer(
        examples["tweet"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False
    )

train_ds = train_ds.map(preprocess_function, batched=True, remove_columns=["tweet"])
test_ds = test_ds.map(preprocess_function, batched=True, remove_columns=["tweet"])


data_collator = DataCollatorWithPadding(tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", zero_division=0)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    do_eval=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Training finished. Model saved to:", OUTPUT_DIR)


Map:   0%|          | 0/1587 [00:00<?, ? examples/s]

Map:   0%|          | 0/397 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 31}.


Step,Training Loss
50,0.4504
100,0.3653
150,0.3882
200,0.3537
250,0.2413
300,0.1523
350,0.191
400,0.1515
450,0.1101
500,0.101


Training finished. Model saved to: /content/models/arabertv2-sentiment


## 5) ML Model: TF-IDF + Logistic Regression

In [22]:
# =========================
# ML MODEL: TF-IDF + Logistic Regression
# =========================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), analyzer="word")
X_train = tfidf.fit_transform(train_df["tweet"])
X_test = tfidf.transform(test_df["tweet"])

y_train = train_df["label"]
y_test = test_df["label"]

# Logistic Regression model
ml_model = LogisticRegression(max_iter=1000)
ml_model.fit(X_train, y_train)

# Predictions
y_pred = ml_model.predict(X_test)

# Evaluation
acc = accuracy_score(y_test, y_pred)
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)

print("ML Model Results")
print("Accuracy:", acc)
print("Precision:", p)
print("Recall:", r)
print("F1-score:", f1)


ML Model Results
Accuracy: 0.8664987405541562
Precision: 0.8656716417910447
Recall: 0.87
F1-score: 0.8678304239401496


## 6) LSTM Model

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

MAX_WORDS = 20000
MAX_LEN = 128 # Increased from 100

tokenizer_lstm = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer_lstm.fit_on_texts(train_df["tweet"])

X_train_seq = tokenizer_lstm.texts_to_sequences(train_df["tweet"])
X_test_seq = tokenizer_lstm.texts_to_sequences(test_df["tweet"])

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

y_train = train_df["label"].values
y_test = test_df["label"].values

# Build LSTM model
lstm_model = Sequential([
    Embedding(MAX_WORDS, 200), # Increased embedding dim from 128 to 200
    LSTM(256, return_sequences=False), # Increased LSTM units from 128 to 256
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

lstm_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

lstm_model.fit(X_train_pad, y_train, validation_split=0.1, epochs=10, batch_size=32) # Increased epochs from 5 to 10

# Evaluation
y_pred_prob = lstm_model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int).reshape(-1)

acc = accuracy_score(y_test, y_pred)
p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)

print("LSTM Results")
print("Accuracy:", acc)
print("Precision:", p)
print("Recall:", r)
print("F1-score:", f1)


Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.5204 - loss: 0.6955 - val_accuracy: 0.4025 - val_loss: 0.7034
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.4725 - loss: 0.6960 - val_accuracy: 0.4025 - val_loss: 0.6936
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.4928 - loss: 0.6951 - val_accuracy: 0.4025 - val_loss: 0.6948
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.4635 - loss: 0.6948 - val_accuracy: 0.4025 - val_loss: 0.6939
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.4675 - loss: 0.6950 - val_accuracy: 0.4025 - val_loss: 0.6941
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.5025 - loss: 0.6938 - val_accuracy: 0.4025 - val_loss: 0.6946
Epoch 7/10
[1m45/45[0m [32m━━━━

## 7) AraBERT Model Evaluation

In [24]:
import pandas as pd, os, json, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

MODEL_DIR = '/content/models/arabertv2-sentiment'
TEST_CSV = '/content/data/processed/test.csv'

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

test_df = pd.read_csv(TEST_CSV)
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

def preprocess(examples):
    return tokenizer(examples['tweet'], truncation=True, max_length=128)

test_ds = test_ds.map(preprocess, batched=True, remove_columns=['tweet'])

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tokenizer=tokenizer)

loader = DataLoader(test_ds, batch_size=8, collate_fn=collator)
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in loader:
        labels = batch.pop('labels') if 'labels' in batch else None
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        if labels is not None:
            all_labels.extend(labels.cpu().numpy())

if all_labels:
    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    metrics = {'accuracy': acc, 'precision': p, 'recall': r, 'f1': f1}
    print('Test metrics:', metrics)
    with open(os.path.join(MODEL_DIR, 'test_metrics.json'), 'w', encoding='utf-8') as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)
else:
    print('No labels in test set — only saved predictions.')

# Save predictions
test_df['preds'] = all_preds
test_df.to_csv(os.path.join(MODEL_DIR, 'test_predictions.csv'), index=False)
print('Predictions saved to', os.path.join(MODEL_DIR, 'test_predictions.csv'))

Map:   0%|          | 0/397 [00:00<?, ? examples/s]

Test metrics: {'accuracy': 0.9319899244332494, 'precision': 0.9346733668341709, 'recall': 0.93, 'f1': 0.9323308270676691}
Predictions saved to /content/models/arabertv2-sentiment/test_predictions.csv


## 8) AraBERT Prediction Helper

In [25]:
# === 7) Prediction helper ===
# Load saved model directory then run predict(text)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, numpy as np

MODEL_DIR = '/content/models/arabertv2-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

def predict(text):
    enc = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        out = model(**enc)
    pred = int(torch.argmax(out.logits, dim=1).cpu().numpy()[0])
    probs = torch.nn.functional.softmax(out.logits, dim=1).cpu().numpy()[0].tolist()
    label_name = 'Positive' if pred==1 else 'Negative'
    return {'label': label_name, 'pred': pred, 'probs': probs}


## 9) Interactive Prediction

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_PATH = "/content/models/arabertv2-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)


def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()

    label_map = {0: "Negative ❌", 1: "Positive ✅"}
    return label_map[predicted_class_id]

# ==== Try it manually ====
while True:
    text = input("اكتب جملة للتجربة (أو اكتب q للخروج): ")
    if text.lower() == "q":
        break
    print("التصنيف:", predict_sentiment(text))
    print("-----------------------------------")


اكتب جملة للتجربة (أو اكتب q للخروج): طيب
التصنيف: Positive ✅
-----------------------------------
اكتب جملة للتجربة (أو اكتب q للخروج): شرير
التصنيف: Negative ❌
-----------------------------------
اكتب جملة للتجربة (أو اكتب q للخروج): q
