In [3]:
# Cell 1: Import Libraries
import os
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import re
import string
import arabic_reshaper
from bidi.algorithm import get_display

In [9]:
!pip install transformers torch accelerate




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [12]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in the saved file
# and in version control. Read it from the HF_TOKEN environment variable, or
# prompt for it without echoing when the variable is not set.
# NOTE: a token that was previously hard-coded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\PC USER\.cache\huggingface\token
Login successful


In [13]:
# Sanity check: print the Hugging Face account name for the current login.
!huggingface-cli whoami

AichaESSALMI1


In [14]:
from transformers import AutoTokenizer
import transformers
import torch

# NOTE: despite its name, `model` holds the Hub checkpoint id (a string),
# not a model object.
# Alternative (non-chat) checkpoint: meta-llama/Llama-2-7b-hf
model = "meta-llama/Llama-2-7b-chat-hf"

# `token=True` tells the library to use the locally stored Hugging Face
# credential; it replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model, token=True)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [15]:
  # Cell 3: Dataset Loading Function
def load_arabic_dataset(base_path):
    """Load a directory-per-class text corpus.

    Every sub-directory of ``base_path`` is treated as one class. Each file
    directly inside a class directory whose name ends with ``.txt`` is read
    as UTF-8 and becomes one sample labelled with the directory name.

    Directories and files are visited in sorted order, so the returned lists
    (and anything that depends on their order, such as a seeded train/test
    split) are identical on every run and platform.

    Args:
        base_path: Root directory of the corpus.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists holding the file
        contents and the matching class names.

    Files that cannot be opened or decoded are reported on stdout and
    skipped; they do not abort the load.
    """
    texts = []
    labels = []

    print("Starting to load dataset...")

    # Collect the candidate files first so the progress denominator counts
    # exactly the files that will be attempted (*.txt, one level deep) instead
    # of every file anywhere under base_path.
    class_files = []
    for class_name in sorted(os.listdir(base_path)):
        class_path = os.path.join(base_path, class_name)
        if not os.path.isdir(class_path):
            continue
        filenames = sorted(
            name for name in os.listdir(class_path) if name.endswith('.txt')
        )
        class_files.append((class_name, class_path, filenames))

    total_files = sum(len(filenames) for _, _, filenames in class_files)
    processed_files = 0

    for class_name, class_path, filenames in class_files:
        print(f"\nProcessing class: {class_name}")

        for filename in filenames:
            file_path = os.path.join(class_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading file {file_path}: {e}")
                continue

            texts.append(text)
            labels.append(class_name)

            processed_files += 1
            if processed_files % 100 == 0:
                print(f"Processed {processed_files}/{total_files} files")

    print(f"\nCompleted loading {processed_files} files from {len(set(labels))} classes")
    return texts, labels

In [17]:
# Cell 4: LLaMA Embedding Function
def get_llama_embedding(text, tokenizer, model, device):
    """Embed one text as the mean of the model's last hidden states.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` without gradient tracking, and the last-layer hidden states are
    averaged over the token positions that the attention mask marks as real,
    so padded positions do not dilute the vector.

    Args:
        text: The text to embed.
        tokenizer: Callable tokenizer returning a mapping of tensors
            (``input_ids`` and, normally, ``attention_mask``).
        model: Model whose output exposes ``last_hidden_state``.
        device: Device the input tensors are moved to; the model is assumed
            to already live there.

    Returns:
        A flattened float32 NumPy array (one vector for a single input text).

    Side effects:
        If the tokenizer has no pad token, its pad token is set to its EOS
        token.
    """
    # Asking a Hugging Face tokenizer to pad when it has no pad token raises
    # ValueError. Some checkpoints (LLaMA-2 appears to be one of them) ship
    # without one, so fall back to EOS.
    if getattr(tokenizer, "pad_token", None) is None:
        eos_token = getattr(tokenizer, "eos_token", None)
        if eos_token is not None:
            tokenizer.pad_token = eos_token

    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                       padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Pool in float32 so a half-precision model cannot overflow while summing
    # over the sequence, and so the result is ready for NumPy consumers.
    hidden = outputs.last_hidden_state.float()
    mask = inputs.get("attention_mask")
    if mask is None:
        embeddings = hidden.mean(dim=1)
    else:
        mask = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-masked row.
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    return embeddings.cpu().numpy().flatten()

In [18]:
# Cell 5: Main Training Function
def train_model(base_path):
    """Train an XGBoost classifier on LLaMA text embeddings.

    Steps: load the corpus under ``base_path``, preprocess every text,
    integer-encode the class names, embed each text with the LLaMA-2 chat
    checkpoint, hold out a stratified 20% test split, fit the classifier,
    print a classification report and save the results.

    Args:
        base_path: Corpus root, forwarded to ``load_arabic_dataset``.

    Returns:
        ``(xgb_model, label_encoder, tokenizer, model)``: the fitted
        classifier, the fitted ``LabelEncoder`` and the tokenizer/model that
        produced the embeddings.

    Side effects:
        Writes ``llama_xgboost_model.json`` and ``label_encoder_classes.npy``
        to the current working directory.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load dataset
    print("Loading dataset...")
    texts, labels = load_arabic_dataset(base_path)

    # Preprocess texts
    # NOTE(review): preprocess_arabic_text is not defined in the visible part
    # of this notebook; it must be defined before this function is called.
    print("Preprocessing texts...")
    processed_texts = [preprocess_arabic_text(text) for text in texts]

    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # Load LLaMA model and tokenizer
    print("Loading LLaMA model...")
    MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The embedding helper asks the tokenizer to pad, which fails when no pad
    # token is defined; fall back to EOS in that case.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Loading a 7B-parameter model in the default float32 needs roughly 28 GB
    # for the weights alone; half precision on GPU halves that. CPU stays in
    # float32.
    model_dtype = torch.float16 if device.type == "cuda" else torch.float32
    model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=model_dtype).to(device)
    model.eval()  # inference only

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = []
    # Texts are embedded one at a time; the chunk size only controls how
    # often progress is printed.
    batch_size = 32

    for i in range(0, len(processed_texts), batch_size):
        batch_texts = processed_texts[i:i + batch_size]
        batch_embeddings = [get_llama_embedding(text, tokenizer, model, device)
                            for text in batch_texts]
        embeddings.extend(batch_embeddings)
        print(f"Processed {i + len(batch_texts)}/{len(processed_texts)} texts")

    # Use a float32 feature matrix regardless of the model's precision.
    X = np.asarray(embeddings, dtype=np.float32)
    y = encoded_labels

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train XGBoost classifier
    # (`use_label_encoder` is deprecated and no longer passed; labels are
    # already integer-encoded above.)
    print("Training XGBoost classifier...")
    xgb_model = xgb.XGBClassifier(
        objective="multi:softmax",
        num_class=len(label_encoder.classes_),
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        eval_metric='mlogloss'
    )

    xgb_model.fit(X_train, y_train)

    # Evaluate model
    print("Evaluating model...")
    y_pred = xgb_model.predict(X_test)

    # Print classification report
    report = classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
        digits=4
    )
    print("\nClassification Report:")
    print(report)

    # Save models and label encoder
    print("Saving models...")
    xgb_model.save_model("llama_xgboost_model.json")
    np.save("label_encoder_classes.npy", label_encoder.classes_)

    return xgb_model, label_encoder, tokenizer, model

In [19]:
# Cell 6: Prediction Function
def predict_new_text(text, xgb_model, tokenizer, llama_model, label_encoder, device):
    """Classify one text and report the classifier's confidence.

    The text is preprocessed, embedded with the LLaMA model, and the embedding
    is passed to the XGBoost classifier.

    Args:
        text: Raw input text.
        xgb_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        tokenizer: Tokenizer passed through to ``get_llama_embedding``.
        llama_model: Embedding model passed through to ``get_llama_embedding``.
        label_encoder: Fitted encoder used to map the predicted id back to
            its class name.
        device: Device passed through to ``get_llama_embedding``.

    Returns:
        ``(predicted_class, confidence)`` where ``confidence`` is the largest
        value in the classifier's probability output.
    """
    cleaned = preprocess_arabic_text(text)

    # The embedding comes back as a flat vector; the classifier wants a
    # single-row 2-D array.
    features = get_llama_embedding(cleaned, tokenizer, llama_model, device).reshape(1, -1)

    class_ids = xgb_model.predict(features)
    class_probs = xgb_model.predict_proba(features)

    label = label_encoder.inverse_transform(class_ids)[0]
    return label, np.max(class_probs)

In [20]:
# Cell 7: Run Training
if __name__ == "__main__":
    # Dataset root (one sub-directory per class). Set the ARABIC_CORPUS_DIR
    # environment variable to point at the corpus on another machine; the
    # fallback is the location this notebook was originally run against.
    base_path = os.environ.get(
        "ARABIC_CORPUS_DIR",
        "C:\\Users\\PC USER\\OneDrive\\Bureau\\corpusFinal",
    )

    # Train the model
    xgb_model, label_encoder, tokenizer, llama_model = train_model(base_path)

    # Example prediction: same device-selection rule as train_model, so the
    # inputs land on the device the model was moved to.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Placeholder input: replace with a real Arabic sample.
    sample_text = "Your Arabic text here"
    predicted_class, confidence = predict_new_text(
        sample_text,
        xgb_model,
        tokenizer,
        llama_model,
        label_encoder,
        device
    )
    print(f"Predicted class: {predicted_class}")
    print(f"Confidence: {confidence:.4f}")

Using device: cuda
Loading dataset...
Starting to load dataset...

Processing class: الغرفة الإجتماعية
Processed 100/32325 files
Processed 200/32325 files
Processed 300/32325 files
Processed 400/32325 files
Processed 500/32325 files
Processed 600/32325 files
Processed 700/32325 files
Processed 800/32325 files
Processed 900/32325 files
Processed 1000/32325 files
Processed 1100/32325 files
Processed 1200/32325 files
Processed 1300/32325 files
Processed 1400/32325 files
Processed 1500/32325 files
Processed 1600/32325 files
Processed 1700/32325 files
Processed 1800/32325 files
Processed 1900/32325 files
Processed 2000/32325 files
Processed 2100/32325 files
Processed 2200/32325 files
Processed 2300/32325 files
Processed 2400/32325 files
Processed 2500/32325 files
Processed 2600/32325 files
Processed 2700/32325 files
Processed 2800/32325 files
Processed 2900/32325 files
Processed 3000/32325 files
Processed 3100/32325 files
Processed 3200/32325 files
Processed 3300/32325 files
Processed 3400/

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

: 