In [None]:
!pip install transformers datasets torch scikit-learn


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pickle
import zipfile
import os

# Seed the global RNGs before any model is built: the classifier head is
# randomly initialised at load time (see the "newly initialized" warning the
# model-loading cell prints), so without a fixed seed two runs of this
# notebook start from different weights.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Read the labelled training set and report its size and class balance.
df = pd.read_csv('train.csv')  # Replace with your dataset path
print("Dataset shape:", df.shape)
print("\nLabel distribution:")
print(df['label'].value_counts())

# Turn the string labels into integer class ids, stored in a new column
# next to the original labels.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Class name -> integer id, built from the fitted encoder itself so it
# always reflects the encoder's own ordering.
class_names = label_encoder.classes_
label_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print("Label mapping:", label_mapping)

num_labels = len(class_names)
print(f"Number of classes: {num_labels}")


Dataset shape: (5873, 3)

Label distribution:
label
not_applicable    3149
hope              1812
hate               912
Name: count, dtype: int64
Label mapping: {'hate': np.int64(0), 'hope': np.int64(1), 'not_applicable': np.int64(2)}
Number of classes: 3


In [None]:
# Checkpoint to fine-tune. Other checkpoints noted by the author:
#   aubmindlab/bert-large-arabertv02
#   CAMeL-Lab/bert-base-arabic-camelbert-da
#   araelectra-base-discriminator
# NOTE(review): the AraBERT model card appears to describe the "v2" (as
# opposed to "v02") checkpoints as expecting Farasa pre-segmented text --
# confirm whether feeding raw text to this checkpoint is intended.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the class names in the model config so the saved/pushed model can
# decode its own predictions without the separately pickled LabelEncoder.
# Plain int/str values keep the config JSON-serialisable.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification"
)


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class ArabicSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Each item is a dict with three entries:

    * ``input_ids``      -- flattened token ids produced by ``tokenizer``
    * ``attention_mask`` -- flattened attention mask produced by ``tokenizer``
    * ``labels``         -- the item's label as a ``torch.long`` tensor

    Args:
        texts: Indexable, sized collection of raw texts.
        labels: Indexable, sized collection of integer class ids, aligned
            with ``texts``.
        tokenizer: Callable invoked with a single text plus the keyword
            arguments shown in ``__getitem__``; must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    # Placeholder used for missing/empty texts. It is the same string the
    # inference helper in this notebook substitutes, so training and
    # prediction treat blank rows alike instead of training on "nan"/"None".
    MISSING_TEXT = "empty text"

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of training, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @classmethod
    def _normalize_text(cls, text):
        """Return ``text`` as a string, replacing None/NaN/'' with the placeholder."""
        # ``text != text`` is only true for NaN; restricted to floats so
        # arbitrary objects are never compared with themselves.
        is_missing = text is None or (isinstance(text, float) and text != text)
        if is_missing or text == '':
            return cls.MISSING_TEXT
        return str(text)

    def __getitem__(self, idx):
        text = self._normalize_text(self.texts[idx])
        label = self.labels[idx]
        # Items are padded to ``max_length`` here, so every item has the
        # same length regardless of the collator used downstream.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch axis; flatten it away
        # so each item is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Materialise the text and encoded-label columns as plain lists and wrap
# them in the tokenizing dataset.
train_texts = df['text'].tolist()
train_labels = df['label_encoded'].tolist()
train_dataset = ArabicSentimentDataset(train_texts, train_labels, tokenizer)

# Collator handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``; the
            predicted class is taken as the argmax of ``predictions``
            along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # Index 0/1/2 of the result are precision/recall/F1; the fourth
    # element is not used here.
    prf = precision_recall_fscore_support(gold, predicted, average='weighted')

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1]
    }


In [None]:
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    # NOTE(review): machine-specific absolute path; checkpoints are written
    # here at the end of every epoch (save_strategy='epoch').
    output_dir='/mnt/c/Users/T2410260/model/Arabert-cache',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy='epoch',

    load_best_model_at_end=False,
    push_to_hub=False,
    # The string "none" is what disables logging integrations; the Python
    # value None is treated as "not set" and falls back to the library
    # default (all installed integrations in transformers v4).
    report_to="none",
    dataloader_pin_memory=False
)


In [None]:
# Collect everything the Trainer needs in one place, then build it.
# NOTE(review): no eval_dataset is supplied, so compute_metrics is
# presumably never invoked during this run -- confirm whether a held-out
# split was intended.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

print("Starting training...")
# Left as the cell's last expression so the notebook displays the result.
trainer.train()




Starting training...


Step,Training Loss
100,1.0143
200,0.8318
300,0.8173
400,0.7726
500,0.7793
600,0.7604
700,0.7554
800,0.6639
900,0.6495
1000,0.6325


TrainOutput(global_step=1472, training_loss=0.6823376922503762, metrics={'train_runtime': 92.4602, 'train_samples_per_second': 254.077, 'train_steps_per_second': 15.92, 'total_flos': 1545265102316544.0, 'train_loss': 0.6823376922503762, 'epoch': 4.0})

In [None]:
# Model and tokenizer are written to the same directory.
# NOTE(review): the directory name mentions "araelectra" while the
# model-loading cell uses aubmindlab/bert-base-arabertv2 -- confirm the
# name is intended.
save_dir = '/mnt/c/Users/T2410260/model/fine_tuned_araelectra-base-discriminator'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# The fitted label encoder is pickled into the current working directory;
# it is needed to map predicted class ids back to label names.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Model, tokenizer, and label encoder saved successfully!")

Model, tokenizer, and label encoder saved successfully!


In [None]:
import pandas as pd
import torch
import zipfile
import os

def predict_validation_set(model, tokenizer, label_encoder, validation_csv_path, output_path='submission.csv', max_length=128, batch_size=32):
    """Predict a label for every row of a CSV and write a submission file.

    Reads ``validation_csv_path``, runs ``model`` over the ``text`` column in
    batches, writes an ``id``/``prediction`` CSV to ``output_path`` and a zip
    archive containing that CSV next to it.

    Args:
        model: Module called as ``model(**inputs)``; its result must expose
            ``.logits``. It is switched to eval mode and left there.
        tokenizer: Callable that accepts a list of texts plus the keyword
            arguments used below and returns a mapping of tensors.
        label_encoder: Object whose ``inverse_transform`` maps predicted
            class ids back to label names.
        validation_csv_path: CSV file with at least ``id`` and ``text`` columns.
        output_path: Destination of the submission CSV.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass (positive integer).

    Returns:
        pandas.DataFrame with the columns ``id`` and ``prediction``.

    Raises:
        ValueError: If ``batch_size`` is not positive or a required column
            is missing from the CSV.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load validation data
    val_df = pd.read_csv(validation_csv_path)
    print(f"Loaded {len(val_df)} samples from validation set")

    # Ensure required columns exist
    if 'id' not in val_df.columns or 'text' not in val_df.columns:
        raise ValueError("Validation CSV must contain 'id' and 'text' columns")

    # Set model to evaluation mode
    model.eval()

    # The model's device does not change between batches, so look it up once.
    device = next(model.parameters()).device

    predictions = []
    total_batches = (len(val_df) + batch_size - 1) // batch_size

    print("Making predictions...")

    with torch.no_grad():
        for batch_number, start in enumerate(range(0, len(val_df), batch_size), start=1):
            batch_texts = val_df['text'].iloc[start:start + batch_size].tolist()

            # Replace NaN or empty texts with a placeholder so the tokenizer
            # always receives a non-empty string.
            batch_texts = [str(text) if pd.notna(text) and text != '' else "empty text" for text in batch_texts]

            # Tokenize batch (padded to the longest text in the batch)
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )

            # Inputs must live on the same device as the model weights
            inputs = {k: v.to(device) for k, v in inputs.items()}

            logits = model(**inputs).logits

            # Highest-scoring class per row, mapped back to label names
            predicted_classes = torch.argmax(logits, dim=-1).cpu().numpy()
            predictions.extend(label_encoder.inverse_transform(predicted_classes))

            # Progress update
            print(f"Processed batch {batch_number}/{total_batches}")

    # Create submission dataframe
    submission_df = pd.DataFrame({
        'id': val_df['id'],
        'prediction': predictions
    })

    # Save to CSV
    submission_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Predictions saved to {output_path}")

    # Derive the archive name from the file extension only. A plain
    # str.replace('.csv', '.zip') returns the CSV path itself when there is
    # no '.csv' suffix (the zip would then overwrite the CSV) and also
    # rewrites '.csv' occurring inside directory names.
    zip_path = os.path.splitext(output_path)[0] + '.zip'
    if os.path.abspath(zip_path) == os.path.abspath(output_path):
        # output_path already ends in '.zip'; never write the archive over it.
        zip_path = output_path + '.zip'
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(output_path, os.path.basename(output_path))

    print(f"Submission zip file created: {zip_path}")

    # Display sample predictions
    print("\nSample predictions:")
    print(submission_df.head(10))

    # Display prediction distribution
    print("\nPrediction distribution:")
    print(submission_df['prediction'].value_counts())

    return submission_df

# Run inference on the validation file with the fine-tuned model and write
# the submission CSV (plus its zip archive).
validation_csv_path = 'validation.csv'  # Replace with your validation file path
submission_df = predict_validation_set(
    model,
    tokenizer,
    label_encoder,
    validation_csv_path,
    output_path='submission008.csv',
)


Loaded 1476 samples from validation set
Making predictions...
Processed batch 1/47
Processed batch 2/47
Processed batch 3/47
Processed batch 4/47
Processed batch 5/47
Processed batch 6/47
Processed batch 7/47
Processed batch 8/47
Processed batch 9/47
Processed batch 10/47
Processed batch 11/47
Processed batch 12/47
Processed batch 13/47
Processed batch 14/47
Processed batch 15/47
Processed batch 16/47
Processed batch 17/47
Processed batch 18/47
Processed batch 19/47
Processed batch 20/47
Processed batch 21/47
Processed batch 22/47
Processed batch 23/47
Processed batch 24/47
Processed batch 25/47
Processed batch 26/47
Processed batch 27/47
Processed batch 28/47
Processed batch 29/47
Processed batch 30/47
Processed batch 31/47
Processed batch 32/47
Processed batch 33/47
Processed batch 34/47
Processed batch 35/47
Processed batch 36/47
Processed batch 37/47
Processed batch 38/47
Processed batch 39/47
Processed batch 40/47
Processed batch 41/47
Processed batch 42/47
Processed batch 43/47
P

In [None]:
from huggingface_hub import notebook_login

# Log in to Hugging Face; the call renders a widget in the cell output.
# Presumably required before the push_to_hub calls in the next cell -- confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Target repository on the Hugging Face Hub.
# NOTE(review): the repository name and commit message mention
# "araelectra-base-discriminator", while the model-loading cell uses
# aubmindlab/bert-base-arabertv2 -- confirm the published name is intended.
repo_name = "Ash2749/araelectra-base-discriminator-t1"

# Upload the model weights/config; private=False makes the repo public
# (set to True if you want a private repository).
model.push_to_hub(
    repo_name,
    commit_message="Fine-tuned araelectra-base-discriminator BERT for Arabic sentiment analysis",
    private=False,
)

# Upload the tokenizer files to the same repository.
tokenizer.push_to_hub(repo_name, commit_message="Tokenizer for Arabic sentiment analysis model")

print(f"Model and tokenizer successfully pushed to: https://huggingface.co/{repo_name}")

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Model and tokenizer successfully pushed to: https://huggingface.co/Ash2749/araelectra-base-discriminator-t1
