<a href="https://colab.research.google.com/github/abidlifiras/llm-qcm-demo/blob/master/Dataset_kpis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset KPI Analysis Notebook

In [1]:
# Install required libraries
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=23c8fbbf09dcccd610007ad908d4434050b60643e857570f6200008549221e4e
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
!git clone https://github.com/abidlifiras/llm-qcm-demo.git

Cloning into 'llm-qcm-demo'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 25 (delta 5), reused 21 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 690.85 KiB | 5.31 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [2]:
# Import dependencies
import pandas as pd
from langdetect import detect
from collections import Counter
import matplotlib.pyplot as plt

In [4]:
# Read the train / dev / test splits from the cloned repository into DataFrames
df_train, df_dev, df_test = (
    pd.read_json("llm-qcm-demo/dataset/train.json"),
    pd.read_json("llm-qcm-demo/dataset/dev.json"),
    pd.read_json("llm-qcm-demo/dataset/test.json"),
)

In [5]:
# Map each split name to its DataFrame so later cells can iterate over all splits
datasets = dict(train=df_train, dev=df_dev, test=df_test)

In [6]:
# Function to detect language safely
def detect_language_safe(text):
    """Return the language code detected for ``text``, or "unknown" on failure.

    Args:
        text: The value to classify. Anything that is not a non-blank string
            (None, NaN, numbers, "", whitespace) is reported as "unknown"
            without calling the detector.

    Returns:
        The label returned by ``detect(text)``, or the string "unknown" when
        the input is unusable or detection raises an error.

    NOTE(review): the langdetect README describes detection as non-deterministic
    unless ``DetectorFactory.seed`` is set; consider seeding once before the KPI
    pass if the language counts must be reproducible.
    """
    # Nothing to detect in missing or blank values; skip the detector entirely.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate; otherwise interrupting
        # the kernel during a long .apply() pass would be swallowed silently.
        return "unknown"

In [7]:
# Start with empty KPI containers: a list of overview rows and a dict of details
kpi_summary, kpi_details = [], {}


In [8]:
# Extract KPIs for each dataset
# Reset the accumulators in this cell so that re-running it rebuilds the results
# instead of appending duplicate rows to an already-populated kpi_summary.
kpi_summary = []
kpi_details = {}

for name, df in datasets.items():
    # Size and subject coverage of the split
    total_questions = len(df)
    unique_subjects = df['subject_name'].nunique()
    subjects_distribution = df['subject_name'].value_counts().to_dict()
    # Not every split has a 'type' column; fall back to an empty distribution
    types_distribution = df['type'].value_counts().to_dict() if 'type' in df.columns else {}

    # Detect the language of every question and tally the resulting labels
    detected_langs = df['question'].apply(detect_language_safe)
    lang_distribution = dict(Counter(detected_langs))

    # One overview row per split (becomes a row of the summary DataFrame)
    kpi_summary.append({
        'Dataset': name,
        'Total Questions': total_questions,
        'Unique Subjects': unique_subjects,
        'Language Distribution': lang_distribution
    })

    # Full per-split breakdowns, keyed by split name
    kpi_details[name] = {
        'Subjects Distribution': subjects_distribution,
        'Types Distribution': types_distribution,
        'Language Distribution': lang_distribution
    }

In [9]:
# Turn the list of per-split overview rows into a table and render it
summary_df = pd.DataFrame(data=kpi_summary)
print("\n✅ Dataset Overview Summary:")
display(summary_df)

# 🔍 Optional: print the full distributions recorded for each split
for dataset in kpi_details:
    details = kpi_details[dataset]
    print(f"\n📘 Dataset: {dataset.upper()}")
    print("Subjects Distribution:", details['Subjects Distribution'])
    print("Types Distribution:", details['Types Distribution'])
    print("Language Distribution:", details['Language Distribution'])


✅ Dataset Overview Summary:


Unnamed: 0,Dataset,Total Questions,Unique Subjects,Language Distribution
0,train,2171,1,"{'fr': 2129, 'ca': 11, 'it': 12, 'en': 2, 'es'..."
1,dev,312,1,{'fr': 312}
2,test,622,1,{'fr': 622}



📘 Dataset: TRAIN
Subjects Distribution: {'pharmacie': 2171}
Types Distribution: {'multiple': 1576, 'simple': 595}
Language Distribution: {'fr': 2129, 'ca': 11, 'it': 12, 'en': 2, 'es': 9, 'pt': 2, 'et': 2, 'cs': 1, 'hu': 1, 'tl': 2}

📘 Dataset: DEV
Subjects Distribution: {'pharmacie': 312}
Types Distribution: {'simple': 164, 'multiple': 148}
Language Distribution: {'fr': 312}

📘 Dataset: TEST
Subjects Distribution: {'pharmacie': 622}
Types Distribution: {}
Language Distribution: {'fr': 622}


## 📌 Conclusion

This analysis helps us understand the composition of our datasets,
including the number of examples, subject variety, and dominant language.
This is useful for preparing fine-tuning strategies.