# Chat with BLIP

In this notebook, we'll illustrate usage of BLIP, a state-of-the-art vision-language model by Salesforce.

HuggingFace docs: https://huggingface.co/docs/transformers/main/en/model_doc/blip_2.

## Set-up environment

Note that it's advised to run this notebook on a GPU environment, high RAM.

In [1]:
!pip install transformers peft accelerate bitsandbytes





In [9]:
!pip install kagglehub


Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.12


## preprocessing

In [7]:
!pip install rouge-score nltk scikit-learn matplotlib seaborn

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m40.1 MB/s[0m  [33m0:00:00[0m
[?25hDownloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Building wheels for collected packages: rouge-score
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://githu

In [3]:

import pandas as pd
import numpy as np
from collections import Counter
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class ArabicCaptionEvaluator:
    """
    Comprehensive evaluation suite for Arabic image captioning with enhanced preprocessing and multi-reference support
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
        self.smoothing = SmoothingFunction().method1

        # Arabic diacritics (Tashkeel) for removal
        self.arabic_diacritics = re.compile(r'[\u064B-\u065F\u0670\u0640]')

        # Arabic punctuation marks
        self.arabic_punctuation = re.compile(r'[؟،؛٪٫٬‰؍]')

        # General punctuation
        self.general_punctuation = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~]')

        # Multiple spaces
        self.multiple_spaces = re.compile(r'\s+')

        # Arabic letter normalization patterns
        self.alef_patterns = re.compile(r'[أإآا]')
        self.yeh_patterns = re.compile(r'[يى]')
        self.teh_patterns = re.compile(r'[ةه]')

    def normalize_arabic_text(self, text: str) -> str:
        """
        Advanced Arabic text normalization
        """
        if pd.isna(text) or text == '':
            return ''

        text = str(text).strip()

        # Remove Arabic diacritics (Tashkeel)
        text = self.arabic_diacritics.sub('', text)

        # Normalize Arabic letters
        text = self.alef_patterns.sub('ا', text)  # Normalize all Alef variants to basic Alef
        text = self.yeh_patterns.sub('ي', text)   # Normalize Yeh variants
        text = self.teh_patterns.sub('ة', text)   # Normalize Teh Marbuta

        # Remove punctuation (both Arabic and general)
        text = self.arabic_punctuation.sub(' ', text)
        text = self.general_punctuation.sub(' ', text)

        # Remove numbers and English characters
        text = re.sub(r'[0-9a-zA-Z]', ' ', text)

        # Remove extra whitespace
        text = self.multiple_spaces.sub(' ', text)

        # Remove leading/trailing whitespace
        text = text.strip()

        return text

    def advanced_tokenize_arabic(self, text: str) -> List[str]:
        """
        Advanced Arabic tokenization with normalization
        """
        normalized_text = self.normalize_arabic_text(text)

        if not normalized_text:
            return []

        # Split by whitespace
        tokens = normalized_text.split()

        # Filter out very short tokens (less than 2 characters)
        tokens = [token for token in tokens if len(token) >= 2]

        return tokens

    def clean_for_rouge(self, text: str) -> str:
        """
        Clean text specifically for ROUGE evaluation
        """
        if pd.isna(text) or text == '':
            return ''

        text = str(text).strip()

        # Remove diacritics
        text = self.arabic_diacritics.sub('', text)

        # Normalize Arabic letters
        text = self.alef_patterns.sub('ا', text)
        text = self.yeh_patterns.sub('ي', text)
        text = self.teh_patterns.sub('ة', text)

        # Keep some punctuation for ROUGE but clean excessive ones
        text = re.sub(r'[0-9a-zA-Z]', ' ', text)
        text = self.multiple_spaces.sub(' ', text)

        return text.strip()

    def calculate_multi_reference_bleu(self, references: List[str], candidate: str) -> Dict[str, float]:
        """
        Calculate BLEU scores against multiple references (takes maximum score)
        """
        if not references or not candidate:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        cand_tokens = self.advanced_tokenize_arabic(candidate)
        if not cand_tokens:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        # Prepare multiple references
        reference_list = []
        for ref in references:
            if pd.notna(ref) and ref.strip():
                ref_tokens = self.advanced_tokenize_arabic(ref)
                if ref_tokens:
                    reference_list.append(ref_tokens)

        if not reference_list:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        try:
            # BLEU automatically handles multiple references by taking the best match
            bleu1 = sentence_bleu(reference_list, cand_tokens, weights=(1, 0, 0, 0), smoothing_function=self.smoothing)
            bleu2 = sentence_bleu(reference_list, cand_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=self.smoothing)
            bleu3 = sentence_bleu(reference_list, cand_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=self.smoothing)
            bleu4 = sentence_bleu(reference_list, cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=self.smoothing)
        except:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        return {
            'bleu1': round(bleu1, 4),
            'bleu2': round(bleu2, 4),
            'bleu3': round(bleu3, 4),
            'bleu4': round(bleu4, 4)
        }

    def calculate_multi_reference_rouge(self, references: List[str], candidate: str) -> Dict[str, float]:
        """
        Calculate ROUGE scores against multiple references (takes maximum score)
        """
        if not references or not candidate:
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        cand_clean = self.clean_for_rouge(candidate)
        if not cand_clean:
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        best_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        for ref in references:
            if pd.notna(ref) and ref.strip():
                ref_clean = self.clean_for_rouge(ref)
                if ref_clean:
                    try:
                        scores = self.rouge_scorer.score(ref_clean, cand_clean)
                        # Take maximum score across references
                        best_scores['rouge1'] = max(best_scores['rouge1'], scores['rouge1'].fmeasure)
                        best_scores['rouge2'] = max(best_scores['rouge2'], scores['rouge2'].fmeasure)
                        best_scores['rougeL'] = max(best_scores['rougeL'], scores['rougeL'].fmeasure)
                    except:
                        continue

        return {k: round(v, 4) for k, v in best_scores.items()}

    def calculate_bleu_score(self, reference: str, candidate: str) -> Dict[str, float]:
        """
        Calculate BLEU scores with advanced tokenization (single reference)
        """
        ref_tokens = self.advanced_tokenize_arabic(reference)
        cand_tokens = self.advanced_tokenize_arabic(candidate)

        if not ref_tokens or not cand_tokens:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        # BLEU expects list of references
        reference_list = [ref_tokens]

        try:
            bleu1 = sentence_bleu(reference_list, cand_tokens, weights=(1, 0, 0, 0), smoothing_function=self.smoothing)
            bleu2 = sentence_bleu(reference_list, cand_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=self.smoothing)
            bleu3 = sentence_bleu(reference_list, cand_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=self.smoothing)
            bleu4 = sentence_bleu(reference_list, cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=self.smoothing)
        except:
            return {'bleu1': 0.0, 'bleu2': 0.0, 'bleu3': 0.0, 'bleu4': 0.0}

        return {
            'bleu1': round(bleu1, 4),
            'bleu2': round(bleu2, 4),
            'bleu3': round(bleu3, 4),
            'bleu4': round(bleu4, 4)
        }

    def calculate_rouge_scores(self, reference: str, candidate: str) -> Dict[str, float]:
        """
        Calculate ROUGE scores with Arabic-specific cleaning (single reference)
        """
        ref_clean = self.clean_for_rouge(reference)
        cand_clean = self.clean_for_rouge(candidate)

        if not ref_clean or not cand_clean:
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        try:
            scores = self.rouge_scorer.score(ref_clean, cand_clean)
            return {
                'rouge1': round(scores['rouge1'].fmeasure, 4),
                'rouge2': round(scores['rouge2'].fmeasure, 4),
                'rougeL': round(scores['rougeL'].fmeasure, 4)
            }
        except:
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

    def calculate_cosine_similarity(self, reference: str, candidate: str) -> float:
        """
        Calculate cosine similarity with normalized text
        """
        ref_clean = self.normalize_arabic_text(reference)
        cand_clean = self.normalize_arabic_text(candidate)

        if not ref_clean or not cand_clean:
            return 0.0

        try:
            # Use character n-grams (1-4) for better Arabic similarity
            vectorizer = TfidfVectorizer(
                analyzer='char',
                ngram_range=(1, 4),
                lowercase=False,
                strip_accents=None
            )
            vectors = vectorizer.fit_transform([ref_clean, cand_clean])
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            return round(similarity, 4)
        except:
            return 0.0

    def calculate_word_similarity(self, reference: str, candidate: str) -> float:
        """
        Calculate word-level cosine similarity
        """
        ref_tokens = self.advanced_tokenize_arabic(reference)
        cand_tokens = self.advanced_tokenize_arabic(candidate)

        if not ref_tokens or not cand_tokens:
            return 0.0

        try:
            # Join tokens back for TF-IDF
            ref_text = ' '.join(ref_tokens)
            cand_text = ' '.join(cand_tokens)

            vectorizer = TfidfVectorizer(
                analyzer='word',
                lowercase=False,
                strip_accents=None
            )
            vectors = vectorizer.fit_transform([ref_text, cand_text])
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            return round(similarity, 4)
        except:
            return 0.0

    def calculate_jaccard_similarity(self, reference: str, candidate: str) -> float:
        """
        Calculate Jaccard similarity coefficient
        """
        ref_tokens = set(self.advanced_tokenize_arabic(reference))
        cand_tokens = set(self.advanced_tokenize_arabic(candidate))

        if not ref_tokens or not cand_tokens:
            return 0.0

        intersection = len(ref_tokens.intersection(cand_tokens))
        union = len(ref_tokens.union(cand_tokens))

        if union == 0:
            return 0.0

        jaccard_sim = intersection / union
        return round(jaccard_sim, 4)

    def calculate_lin_similarity(self, reference: str, candidate: str) -> float:
        """
        Calculate Lin similarity (Dice coefficient) with advanced tokenization
        """
        ref_tokens = set(self.advanced_tokenize_arabic(reference))
        cand_tokens = set(self.advanced_tokenize_arabic(candidate))

        if not ref_tokens or not cand_tokens:
            return 0.0

        intersection = len(ref_tokens.intersection(cand_tokens))
        total = len(ref_tokens) + len(cand_tokens)

        if total == 0:
            return 0.0

        lin_sim = (2.0 * intersection) / total
        return round(lin_sim, 4)

    def calculate_semantic_similarity(self, reference: str, candidate: str) -> float:
        """
        Enhanced semantic similarity using weighted word overlap
        """
        ref_tokens = self.advanced_tokenize_arabic(reference)
        cand_tokens = self.advanced_tokenize_arabic(candidate)

        if not ref_tokens or not cand_tokens:
            return 0.0

        # Count word frequencies
        ref_counter = Counter(ref_tokens)
        cand_counter = Counter(cand_tokens)

        # Calculate weighted overlap
        common_words = set(ref_tokens).intersection(set(cand_tokens))

        if not common_words:
            return 0.0

        # Calculate precision and recall with frequency weighting
        overlap_score = 0
        total_ref_freq = sum(ref_counter.values())
        total_cand_freq = sum(cand_counter.values())

        for word in common_words:
            # Weight by frequency
            ref_weight = ref_counter[word] / total_ref_freq
            cand_weight = cand_counter[word] / total_cand_freq
            overlap_score += min(ref_weight, cand_weight)

        # Normalize by average document length
        avg_length = (len(ref_tokens) + len(cand_tokens)) / 2
        normalized_score = overlap_score * avg_length

        return round(min(normalized_score, 1.0), 4)

    def evaluate_single_pair(self, reference: str, candidate: str) -> Dict[str, float]:
        """
        Evaluate a single reference-candidate pair with all metrics
        """
        # BLEU scores
        bleu_scores = self.calculate_bleu_score(reference, candidate)

        # ROUGE scores
        rouge_scores = self.calculate_rouge_scores(reference, candidate)

        # Similarity metrics
        char_cosine_sim = self.calculate_cosine_similarity(reference, candidate)
        word_cosine_sim = self.calculate_word_similarity(reference, candidate)
        jaccard_sim = self.calculate_jaccard_similarity(reference, candidate)
        lin_sim = self.calculate_lin_similarity(reference, candidate)
        semantic_sim = self.calculate_semantic_similarity(reference, candidate)

        # Combine all metrics
        results = {
            **bleu_scores,
            **rouge_scores,
            'char_cosine_similarity': char_cosine_sim,
            'word_cosine_similarity': word_cosine_sim,
            'jaccard_similarity': jaccard_sim,
            'lin_similarity': lin_sim,
            'semantic_similarity': semantic_sim
        }

        return results

    def evaluate_multi_reference_pair(self, references: List[str], candidate: str) -> Dict[str, float]:
        """
        Evaluate a candidate against multiple references (takes best scores)
        """
        # Multi-reference BLEU scores
        bleu_scores = self.calculate_multi_reference_bleu(references, candidate)

        # Multi-reference ROUGE scores  
        rouge_scores = self.calculate_multi_reference_rouge(references, candidate)

        # For similarity metrics, take maximum across all references
        similarities = {
            'char_cosine_similarity': 0.0,
            'word_cosine_similarity': 0.0,
            'jaccard_similarity': 0.0,
            'lin_similarity': 0.0,
            'semantic_similarity': 0.0
        }

        for ref in references:
            if pd.notna(ref) and ref.strip():
                char_cosine_sim = self.calculate_cosine_similarity(ref, candidate)
                word_cosine_sim = self.calculate_word_similarity(ref, candidate)
                jaccard_sim = self.calculate_jaccard_similarity(ref, candidate)
                lin_sim = self.calculate_lin_similarity(ref, candidate)
                semantic_sim = self.calculate_semantic_similarity(ref, candidate)

                # Take maximum
                similarities['char_cosine_similarity'] = max(similarities['char_cosine_similarity'], char_cosine_sim)
                similarities['word_cosine_similarity'] = max(similarities['word_cosine_similarity'], word_cosine_sim)
                similarities['jaccard_similarity'] = max(similarities['jaccard_similarity'], jaccard_sim)
                similarities['lin_similarity'] = max(similarities['lin_similarity'], lin_sim)
                similarities['semantic_similarity'] = max(similarities['semantic_similarity'], semantic_sim)

        # Combine all metrics
        results = {
            **bleu_scores,
            **rouge_scores,
            **similarities
        }

        return results

    def evaluate_dataset(self, df: pd.DataFrame, ref_col: str = 'Description',
                        cand_col: str = 'arabic_caption') -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Evaluate entire dataset and return both metrics and detailed results
        """
        print(f"Evaluating {len(df)} caption pairs with enhanced preprocessing...")

        results = []

        # Reset index to ensure sequential numbering
        df_reset = df.reset_index(drop=True)

        for idx, row in df_reset.iterrows():
            reference = row[ref_col]
            candidate = row[cand_col]

            scores = self.evaluate_single_pair(reference, candidate)
            scores['original_index'] = idx
            results.append(scores)

            if (idx + 1) % 50 == 0:
                print(f"Processed {idx + 1}/{len(df)} pairs...")

        # Create metrics DataFrame
        results_df = pd.DataFrame(results)

        # Create detailed results DataFrame by merging original data with metrics
        detailed_results_df = df_reset.copy()

        # Add all evaluation metrics to the detailed DataFrame
        for metric in results_df.columns:
            if metric != 'original_index':
                detailed_results_df[metric] = results_df[metric].values

        # Add some additional computed columns for analysis
        detailed_results_df['reference_length'] = detailed_results_df[ref_col].apply(
            lambda x: len(self.advanced_tokenize_arabic(x)) if pd.notna(x) else 0
        )
        detailed_results_df['candidate_length'] = detailed_results_df[cand_col].apply(
            lambda x: len(self.advanced_tokenize_arabic(x)) if pd.notna(x) else 0
        )
        detailed_results_df['length_ratio'] = detailed_results_df.apply(
            lambda row: row['candidate_length'] / row['reference_length']
            if row['reference_length'] > 0 else 0, axis=1
        )

        # Add normalized texts for inspection
        detailed_results_df['reference_normalized'] = detailed_results_df[ref_col].apply(
            self.normalize_arabic_text
        )
        detailed_results_df['candidate_normalized'] = detailed_results_df[cand_col].apply(
            self.normalize_arabic_text
        )

        return results_df, detailed_results_df

    def generate_summary_report(self, results_df: pd.DataFrame) -> Dict[str, float]:
        """
        Generate comprehensive summary statistics
        """
        metrics = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'rouge1', 'rouge2', 'rougeL',
                  'char_cosine_similarity', 'word_cosine_similarity', 'jaccard_similarity',
                  'lin_similarity', 'semantic_similarity']

        summary = {}
        for metric in metrics:
            summary[f'{metric}_mean'] = round(results_df[metric].mean(), 4)
            summary[f'{metric}_std'] = round(results_df[metric].std(), 4)
            summary[f'{metric}_median'] = round(results_df[metric].median(), 4)
            summary[f'{metric}_min'] = round(results_df[metric].min(), 4)
            summary[f'{metric}_max'] = round(results_df[metric].max(), 4)

        return summary

    def plot_evaluation_results(self, results_df: pd.DataFrame, save_path: str = None):
        """
        Create comprehensive visualization of evaluation results
        """
        metrics = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'rouge1', 'rouge2', 'rougeL',
                  'char_cosine_similarity', 'word_cosine_similarity', 'jaccard_similarity',
                  'lin_similarity', 'semantic_similarity']

        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        axes = axes.flatten()

        # 1. Box plot of all metrics
        ax1 = axes[0]
        results_df[metrics].boxplot(ax=ax1)
        ax1.set_title('Distribution of All Evaluation Metrics', fontsize=12, fontweight='bold')
        ax1.set_ylabel('Score')
        ax1.tick_params(axis='x', rotation=45)

        # 2. BLEU scores comparison
        ax2 = axes[1]
        bleu_metrics = ['bleu1', 'bleu2', 'bleu3', 'bleu4']
        bleu_means = [results_df[metric].mean() for metric in bleu_metrics]
        bars = ax2.bar(bleu_metrics, bleu_means, color=['lightblue', 'skyblue', 'deepskyblue', 'steelblue'])
        ax2.set_title('BLEU Scores Comparison', fontsize=12, fontweight='bold')
        ax2.set_ylabel('Average Score')
        for bar, value in zip(bars, bleu_means):
            ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=9)

        # 3. ROUGE scores comparison
        ax3 = axes[2]
        rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
        rouge_means = [results_df[metric].mean() for metric in rouge_metrics]
        bars = ax3.bar(rouge_metrics, rouge_means, color=['lightcoral', 'coral', 'tomato'])
        ax3.set_title('ROUGE Scores Comparison', fontsize=12, fontweight='bold')
        ax3.set_ylabel('Average Score')
        for bar, value in zip(bars, rouge_means):
            ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=9)

        # 4. Similarity metrics comparison
        ax4 = axes[3]
        sim_metrics = ['char_cosine_similarity', 'word_cosine_similarity', 'jaccard_similarity',
                      'lin_similarity', 'semantic_similarity']
        sim_means = [results_df[metric].mean() for metric in sim_metrics]
        sim_labels = ['Char Cosine', 'Word Cosine', 'Jaccard', 'Lin', 'Semantic']
        bars = ax4.bar(sim_labels, sim_means, color=['lightgreen', 'green', 'forestgreen', 'darkgreen', 'olive'])
        ax4.set_title('Similarity Metrics Comparison', fontsize=12, fontweight='bold')
        ax4.set_ylabel('Average Score')
        ax4.tick_params(axis='x', rotation=45)
        for bar, value in zip(bars, sim_means):
            ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=8)

        # 5. Correlation heatmap
        ax5 = axes[4]
        correlation_matrix = results_df[metrics].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax5,
                   square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
        ax5.set_title('Metric Correlation Matrix', fontsize=12, fontweight='bold')

        # 6. Distribution of best performing metric
        ax6 = axes[5]
        best_metric = 'bleu4'  # or choose based on highest mean
        ax6.hist(results_df[best_metric], bins=25, alpha=0.7, color='gold', edgecolor='black')
        ax6.set_title(f'Distribution of {best_metric.upper()} Scores', fontsize=12, fontweight='bold')
        ax6.set_xlabel(f'{best_metric.upper()} Score')
        ax6.set_ylabel('Frequency')
        ax6.axvline(results_df[best_metric].mean(), color='red', linestyle='--',
                   label=f'Mean: {results_df[best_metric].mean():.3f}')
        ax6.legend()

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        plt.show()

# Enhanced main evaluation function
def evaluate_arabic_captions(df: pd.DataFrame, ref_col: str = 'Description',
                           cand_col: str = 'arabic_caption', save_results: bool = True):
    """
    Main function to evaluate Arabic captions with enhanced preprocessing

    Args:
        df: DataFrame with reference and candidate columns
        ref_col: Name of reference (ground truth) column
        cand_col: Name of candidate (generated) column
        save_results: Whether to save results to files

    Returns:
        Dictionary with results dataframe, detailed results, and summary
    """

    print("🚀 Starting Enhanced Arabic Caption Evaluation...")
    print(f"Dataset size: {len(df)} samples")
    print(f"Reference column: {ref_col}")
    print(f"Candidate column: {cand_col}")

    # Initialize evaluator
    evaluator = ArabicCaptionEvaluator()

    # Run evaluation - now returns both metrics and detailed results
    results_df, detailed_results_df = evaluator.evaluate_dataset(df, ref_col, cand_col)

    # Generate summary
    summary = evaluator.generate_summary_report(results_df)

    # Print comprehensive summary
    print("\n📊 ENHANCED EVALUATION SUMMARY")
    print("=" * 60)

    metrics_display = {
        'BLEU-1': 'bleu1_mean',
        'BLEU-2': 'bleu2_mean',
        'BLEU-3': 'bleu3_mean',
        'BLEU-4': 'bleu4_mean',
        'ROUGE-1': 'rouge1_mean',
        'ROUGE-2': 'rouge2_mean',
        'ROUGE-L': 'rougeL_mean',
        'Char Cosine Sim': 'char_cosine_similarity_mean',
        'Word Cosine Sim': 'word_cosine_similarity_mean',
        'Jaccard Sim': 'jaccard_similarity_mean',
        'Lin Similarity': 'lin_similarity_mean',
        'Semantic Sim': 'semantic_similarity_mean'
    }

    for display_name, metric_key in metrics_display.items():
        mean_val = summary[metric_key]
        std_val = summary[metric_key.replace('_mean', '_std')]
        max_val = summary[metric_key.replace('_mean', '_max')]
        print(f"{display_name:18}: {mean_val:.4f} (±{std_val:.4f}) [Max: {max_val:.4f}]")

    # Print additional statistics
    print(f"\nDataset Statistics:")
    print(f"Average reference length: {detailed_results_df['reference_length'].mean():.2f} words")
    print(f"Average candidate length: {detailed_results_df['candidate_length'].mean():.2f} words")
    print(f"Average length ratio: {detailed_results_df['length_ratio'].mean():.2f}")

    # Create visualizations
    print("\n📈 Generating enhanced visualizations...")
    evaluator.plot_evaluation_results(results_df)

    # Save results if requested
    if save_results:
        # Save detailed results (original data + all metrics)
        detailed_results_df.to_csv('detailed_arabic_caption_evaluation_results.csv', index=False, encoding='utf-8')

        # Save just the metrics
        results_df.to_csv('metrics_only_evaluation_results.csv', index=False)

        # Save summary to JSON
        import json
        with open('enhanced_evaluation_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # Create a summary Excel file with multiple sheets
        try:
            with pd.ExcelWriter('comprehensive_evaluation_results.xlsx', engine='openpyxl') as writer:
                # Sheet 1: Detailed results
                detailed_results_df.to_excel(writer, sheet_name='Detailed_Results', index=False)

                # Sheet 2: Metrics only
                results_df.to_excel(writer, sheet_name='Metrics_Only', index=False)

                # Sheet 3: Summary statistics
                summary_df = pd.DataFrame([
                    {'Metric': k.replace('_mean', ''), 'Mean': v}
                    for k, v in summary.items() if k.endswith('_mean')
                ])
                summary_df['Std'] = [summary[k.replace('Mean', 'Std').replace('_mean', '_std')] for k in summary_df['Metric'] + '_mean']
                summary_df['Min'] = [summary[k.replace('Mean', 'Min').replace('_mean', '_min')] for k in summary_df['Metric'] + '_mean']
                summary_df['Max'] = [summary[k.replace('Mean', 'Max').replace('_mean', '_max')] for k in summary_df['Metric'] + '_mean']
                summary_df.to_excel(writer, sheet_name='Summary_Statistics', index=False)

                print("💾 Comprehensive Excel file created: comprehensive_evaluation_results.xlsx")
        except ImportError:
            print("⚠️  openpyxl not available. Excel file not created.")

        print("💾 Results saved to:")
        print("  - detailed_arabic_caption_evaluation_results.csv (Original data + all metrics)")
        print("  - metrics_only_evaluation_results.csv (Metrics only)")
        print("  - enhanced_evaluation_summary.json (Summary statistics)")

    return {
        'results_df': results_df,
        'detailed_results_df': detailed_results_df,
        'summary': summary,
        'evaluator': evaluator
    }

# New function for multi-reference evaluation
def evaluate_multi_reference_captions(df: pd.DataFrame, 
                                     ref_cols: List[str] = ['orig_caption', 'gen1', 'gen2'],
                                     cand_col: str = 'Generated_Caption', 
                                     save_results: bool = True):
    """
    Evaluate captions against multiple references simultaneously
    
    Args:
        df: DataFrame with multiple reference columns and candidate column
        ref_cols: List of reference column names
        cand_col: Name of candidate (generated) column
        save_results: Whether to save results to files
    
    Returns:
        Dictionary with results dataframe, detailed results, and summary
    """
    
    print("🚀 Starting Multi-Reference Arabic Caption Evaluation...")
    print(f"Dataset size: {len(df)} samples")
    print(f"Reference columns: {ref_cols}")
    print(f"Candidate column: {cand_col}")
    
    # Initialize evaluator
    evaluator = ArabicCaptionEvaluator()
    
    results = []
    df_reset = df.reset_index(drop=True)
    
    for idx, row in df_reset.iterrows():
        # Collect all valid references for this sample
        references = []
        for ref_col in ref_cols:
            if ref_col in row and pd.notna(row[ref_col]) and str(row[ref_col]).strip():
                references.append(str(row[ref_col]).strip())
        
        candidate = str(row[cand_col]).strip() if pd.notna(row[cand_col]) else ""
        
        if references and candidate:
            # Evaluate against multiple references
            scores = evaluator.evaluate_multi_reference_pair(references, candidate)
            scores['original_index'] = idx
            scores['num_references'] = len(references)
            results.append(scores)
        
        if (idx + 1) % 50 == 0:
            print(f"Processed {idx + 1}/{len(df)} pairs...")
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Generate summary
    summary = evaluator.generate_summary_report(results_df)
    
    # Create detailed results
    detailed_results_df = df_reset.copy()
    for metric in results_df.columns:
        if metric not in ['original_index']:
            detailed_results_df[metric] = pd.NA
            for _, result_row in results_df.iterrows():
                idx = result_row['original_index']
                detailed_results_df.loc[idx, metric] = result_row[metric]
    
    # Print summary
    print("\n📊 MULTI-REFERENCE EVALUATION SUMMARY")
    print("=" * 60)
    
    metrics_display = {
        'BLEU-1': 'bleu1_mean',
        'BLEU-2': 'bleu2_mean', 
        'BLEU-3': 'bleu3_mean',
        'BLEU-4': 'bleu4_mean',
        'ROUGE-1': 'rouge1_mean',
        'ROUGE-2': 'rouge2_mean',
        'ROUGE-L': 'rougeL_mean',
        'Char Cosine Sim': 'char_cosine_similarity_mean',
        'Word Cosine Sim': 'word_cosine_similarity_mean',
        'Jaccard Sim': 'jaccard_similarity_mean',
        'Lin Similarity': 'lin_similarity_mean',
        'Semantic Sim': 'semantic_similarity_mean'
    }
    
    for display_name, metric_key in metrics_display.items():
        mean_val = summary[metric_key]
        std_val = summary[metric_key.replace('_mean', '_std')]
        max_val = summary[metric_key.replace('_mean', '_max')]
        print(f"{display_name:18}: {mean_val:.4f} (±{std_val:.4f}) [Max: {max_val:.4f}]")
    
    # Print reference statistics
    avg_refs = results_df['num_references'].mean()
    print(f"\nMulti-Reference Statistics:")
    print(f"Average references per sample: {avg_refs:.2f}")
    print(f"Reference count distribution: {results_df['num_references'].value_counts().sort_index().to_dict()}")
    
    # Create visualizations
    print("\n📈 Generating multi-reference visualizations...")
    evaluator.plot_evaluation_results(results_df)
    
    # Save results if requested
    if save_results:
        detailed_results_df.to_csv('multi_reference_detailed_evaluation.csv', index=False, encoding='utf-8')
        results_df.to_csv('multi_reference_metrics_only.csv', index=False)
        
        import json
        with open('multi_reference_evaluation_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        
        print("💾 Multi-reference results saved!")
    
    return {
        'results_df': results_df,
        'detailed_results_df': detailed_results_df,
        'summary': summary,
        'evaluator': evaluator
    }

# Example usage:
if __name__ == "__main__":
    print("Enhanced Multi-Reference Arabic Caption Evaluation Script Ready!")
    print("\nKey features:")
    print("✅ Multi-reference BLEU and ROUGE evaluation")  
    print("✅ Advanced Arabic text normalization")
    print("✅ Maximum score selection across references")
    print("✅ Comprehensive similarity metrics")
    print("✅ Single and multi-reference evaluation modes")
    print("\nTo use this script:")
    print("1. For single reference: evaluate_arabic_captions(df)")
    print("2. For multi-reference: evaluate_multi_reference_captions(df)")

Enhanced Multi-Reference Arabic Caption Evaluation Script Ready!

Key features:
✅ Multi-reference BLEU and ROUGE evaluation
✅ Advanced Arabic text normalization
✅ Maximum score selection across references
✅ Comprehensive similarity metrics
✅ Single and multi-reference evaluation modes

To use this script:
1. For single reference: evaluate_arabic_captions(df)
2. For multi-reference: evaluate_multi_reference_captions(df)


In [4]:
normalized = ArabicCaptionEvaluator()
expanded_df['caption'] = expanded_df['caption'].apply(normalized.normalize_arabic_text)

In [5]:
expanded_df.replace('.jpg', '', regex=True, inplace=True)
expanded_df.head()

Unnamed: 0,ID,caption
0,S.I.PH01.01.001,الشيخ سعد الدين العلمي بمعية وفد غزة في المسجد...
1,S.I.PH01.01.001,تظةر الصورة الشيخ سعد الدين العلمي برفقة وفد غ...
2,S.I.PH01.01.001,تقدم ةذة الصورة الشيخ سعد الدين العلمي ومعة وف...
3,S.I.PH01.01.002,الشيخ سعد الدين العلمي متحدثا في استقبال رؤساء...
4,S.I.PH01.01.002,يتحدث الشيخ سعد الدين العلمي خلال استقبالة في ...


In [6]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split

class SingleCaptionDataset(Dataset):
    def __init__(self, data, processor, max_length=250):
        self.data = data
        self.processor = processor
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = item['image_path']
        caption = item['caption']
        
        # Load and process image
        image = Image.open(image_path).convert('RGB')
        
        # Process image
        pixel_values = self.processor(images=image, return_tensors="pt")['pixel_values'].squeeze(0)
        
        # Process text
        text_encoding = self.processor.tokenizer(
            caption,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        input_ids = text_encoding['input_ids'].squeeze(0)
        attention_mask = text_encoding['attention_mask'].squeeze(0)
        
        # For training, labels are the same as input_ids
        labels = input_ids.clone()
        
        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def load_single_caption_dataset(images_dir):
    """Load dataset for single caption training, using one caption per image instance."""
    data = []
    
    # Get all image files
    image_extensions = ('.jpg',)
    image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(image_extensions)]
    print(f"Found {len(image_files)} images in {images_dir}")

    # Load the expanded captions (each image has 3 rows with different captions)
    captions_df = expanded_df
    print(f"Loaded {len(captions_df)} captions from expanded_captions.csv")

    # For each row (caption instance), find the corresponding image file and add as a separate sample
    for _, row in captions_df.iterrows():
        image_id = str(row['ID'])
        caption = row['caption']
        image_file = None
        for ext in image_extensions:
            potential_file = f"{image_id}{ext}"
            if potential_file in image_files:
                image_file = potential_file
                break
        if image_file:
            image_path = os.path.join(images_dir, image_file)
            if os.path.exists(image_path):
                data.append({
                    'image_path': image_path,
                    'caption': caption,
                    'image_id': image_id
                })
        else:
            print(f"Warning: Image file {image_id} not found in {images_dir}")

    print(f"Loaded {len(data)} image-caption pairs (one per caption instance)")
    return data


In [7]:

# Load model for CAPTIONING (not VQA)
from transformers import BlipProcessor, BlipForConditionalGeneration
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import torch

processor = BlipProcessor.from_pretrained("omarsabri8756/blip-Arabic-flickr-8k")
model = BlipForConditionalGeneration.from_pretrained(
    "omarsabri8756/blip-Arabic-flickr-8k",
    torch_dtype=torch.float16,
    device_map="auto"
)

# LoRA configuration for captioning
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "self.query",
        "self.key",
        "self.value",
        "output.dense",
        "self_attn.qkv",
        "self_attn.projection",
        "mlp.fc1",
        "mlp.fc2",
    ],
)

model = get_peft_model(model, config)
print(model)
model.print_trainable_parameters()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


PeftModel(
  (base_model): LoraModel(
    (model): BlipForConditionalGeneration(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-23): 24 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=3072, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=16, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=16, out_features=3072, bias=False)
               

In [28]:

# Load dataset
import pandas as pd
data = load_single_caption_dataset("merged_images")
print(f"Loaded {len(data)} image-caption pairs for training.")


Found 2717 images in merged_images
Loaded 7335 captions from expanded_captions.csv


Loaded 7335 image-caption pairs (one per caption instance)
Loaded 7335 image-caption pairs for training.


In [30]:
print(f"Loaded {len(data)} captioning samples.")
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)


Loaded 7335 captioning samples.


In [31]:

train_dataset = SingleCaptionDataset(train_data, processor)
val_dataset = SingleCaptionDataset(val_data, processor)


In [32]:
print (f"Training dataset size: {len(train_dataset)}")

Training dataset size: 6601


In [33]:

# Training arguments
training_args = TrainingArguments(
    output_dir="./blip-captioning-lora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    logging_steps=25,
    eval_steps=100,
    save_steps=200,
    save_strategy="steps",
    eval_strategy="steps",  # Changed from evaluation_strategy to eval_strategy
    #metric_for_best_model="eval_loss",
    report_to="none",
    fp16=True,
    dataloader_pin_memory=True,
    save_total_limit=3,
    remove_unused_columns=False,
)

def data_collator(features):
    batch = {}
    
    # Stack pixel values (images)
    if 'pixel_values' in features[0]:
        batch['pixel_values'] = torch.stack([f['pixel_values'] for f in features])
    
    # Stack text inputs
    if 'input_ids' in features[0]:
        batch['input_ids'] = torch.stack([f['input_ids'] for f in features])
    
    if 'attention_mask' in features[0]:
        batch['attention_mask'] = torch.stack([f['attention_mask'] for f in features])
    
    if 'labels' in features[0]:
        batch['labels'] = torch.stack([f['labels'] for f in features])
    
    return batch

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)


In [34]:
import torch

# Check GPU availability and usage
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.current_device()}")
    print(f"GPU name: {torch.cuda.get_device_name()}")
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    print(f"GPU memory total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("CUDA not available, using CPU")

# Check model device
print(f"Model device: {next(model.parameters()).device}")

CUDA available: True
Number of GPUs: 1
Current GPU: 0
GPU name: NVIDIA L4
GPU memory allocated: 0.92 GB
GPU memory reserved: 0.97 GB
GPU memory total: 21.96 GB
Model device: cuda:0


In [None]:
trainer.train()

# Save the final model
model.save_pretrained("./blip-captioning-lora-final-augmented")
print("Training complete and model saved to ./blip-captioning-lora-final-augmented")

Step,Training Loss,Validation Loss
100,0.5911,0.524631


# Evaluation

In [None]:
from peft import PeftModel, PeftConfig
from transformers import BlipForConditionalGeneration

# Load config
peft_config = PeftConfig.from_pretrained("blip-captioning-lora-final-augmented")

# Load base model
base_model = BlipForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path)

# Load LoRA adapter weights
model = PeftModel.from_pretrained(base_model, "blip-captioning-lora-final-augmented")
processor = BlipProcessor.from_pretrained("omarsabri8756/blip-Arabic-flickr-8k")

# Set to eval if needed
model.eval()


In [1]:
import os

local_images_dir = r"Test_images"
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

local_image_files = [f for f in os.listdir(local_images_dir) if f.lower().endswith(image_extensions)]
print(f"Found {len(local_image_files)} images in {local_images_dir}")

Found 75 images in Test_images


In [None]:
df = pd.read_csv("/teamspace/studios/this_studio/captions_clean_test.csv")
names = df['File Name'].tolist()


In [None]:
names_final = df["File Name"].astype(str) + ".jpg"

names_final = names_final.to_list()

In [None]:
from PIL import Image
import pandas as pd
from transformers import BlipProcessor
import torch

# Use the processor and model already loaded in previous cells

captions = []
for filename in names_final:
    image_path = os.path.join(local_images_dir, filename)
    image = Image.open(image_path).convert('RGB')
    inputs = processor(images=image, return_tensors="pt")
    # Move inputs to the same device as the model (GPU if available)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=100)
        caption = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    captions.append({'Filename': filename, 'caption': caption})
    print(f"Finished: {filename}")


Finished: ISH.PH03.03.016.jpg
Finished: ISH.PH03.02.059.jpg
Finished: ISH.PH03.02.043.jpg
Finished: ISH.PH01.12.072.jpg
Finished: ISH.PH01.24.027.jpg
Finished: ISH.PH01.24.030.jpg
Finished: ISH.PH01.13.099.jpg
Finished: ISH.PH03.03.052.jpg
Finished: ISH.PH01.24B.040.jpg
Finished: ISH.PH01.24.062.01.jpg
Finished: ISH.PH01.24.117.jpg
Finished: ISH.PH03.03.041.jpg
Finished: ISH.PH03.04.020.jpg
Finished: ISH.PH03.05.013.jpg
Finished: ISH.PH01.13.125.jpg
Finished: ISH.PH03.05B.084.jpg
Finished: ISH.PH01.24.112.jpg
Finished: ISH.PH01.24.109.jpg
Finished: ISH.PH01.24.067.jpg
Finished: ISH.PH03.05B.065.jpg
Finished: ISH.PH03.03.034.jpg
Finished: ISH.PH01.13.116.jpg
Finished: ISH.PH01.13.119.jpg
Finished: ISH.PH01.24.044.jpg
Finished: ISH.PH01.24.057.jpg
Finished: ISH.PH01.13.129.jpg
Finished: ISH.PH01.13.120.jpg
Finished: ISH.PH03.05B.002.jpg
Finished: ISH.PH03.05B.037.jpg
Finished: ISH.PH03.04.003.jpg
Finished: ISH.PH03.02.087.jpg
Finished: ISH.PH01.24.125.jpg
Finished: ISH.PH01.24B.003.jpg
F

In [None]:

captions_df = pd.DataFrame(captions)
captions_df.head(25)


Unnamed: 0,Filename,caption
0,ISH.PH03.03.016.jpg,صورة لةيلاريون كوتشي مع عامة معروضة في احد الم...
1,ISH.PH03.02.059.jpg,صورة للجناح الفلسطيني في معرض دمشق الدولي بحضو...
2,ISH.PH03.02.043.jpg,صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
3,ISH.PH01.12.072.jpg,صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
4,ISH.PH01.24.027.jpg,صورة لتدريب جنود جيش التحرير الفلسطيني في احد ...
5,ISH.PH01.24.030.jpg,صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
6,ISH.PH01.13.099.jpg,صورة لتدريب مقاتلين من قوات الثورة الفلسطينية ...
7,ISH.PH03.03.052.jpg,صورة لجانب من الجناح الفلسطيني في معرض دمشق ال...
8,ISH.PH01.24B.040.jpg,صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
9,ISH.PH01.24.062.01.jpg,صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...


In [None]:
captions_df.to_csv('generated_captions_Test.csv', index=False, encoding='utf-8-sig')
print(captions_df.head())

              Filename                                            caption
0  ISH.PH03.03.016.jpg  صورة لةيلاريون كوتشي مع عامة معروضة في احد الم...
1  ISH.PH03.02.059.jpg  صورة للجناح الفلسطيني في معرض دمشق الدولي بحضو...
2  ISH.PH03.02.043.jpg  صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
3  ISH.PH01.12.072.jpg  صورة لاحتفال عسكري لقوات الثورة الفلسطينية في ...
4  ISH.PH01.24.027.jpg  صورة لتدريب جنود جيش التحرير الفلسطيني في احد ...
