# Data Processing for Model Fine-Tuning

In [1]:
from pathlib import Path
from chatbmw.data.cleaner import process_jsonl_file

# Stage 1: run the cleaner over one raw JSONL dump.
INPUT_FILE_NAME_2000 = '../datasets/raw_data/bmw_articles_20251230_162547_2000articles.jsonl'
input_path = Path(INPUT_FILE_NAME_2000)

# Cleaned files get their own directory; creating it is a no-op when it
# already exists, so this cell can be re-run safely.
output_dir = Path('../datasets/clean_data')
output_dir.mkdir(parents=True, exist_ok=True)

# Derive the output name from the input's stem (file name without its
# directory or extension), giving "<stem>_clean.jsonl" inside output_dir.
OUTPUT_FILE_NAME_2000 = output_dir.joinpath(input_path.stem + "_clean.jsonl")

# The input is passed as a string and the output as a Path. The return value
# is kept in `count`, which the following cell reports as the number of
# processed articles.
count = process_jsonl_file(INPUT_FILE_NAME_2000, OUTPUT_FILE_NAME_2000)

In [2]:
# Summarise the cleaning step: how many articles were processed and where the
# cleaned file was written. One print call, one line per message.
print(
    f"✓ Processed {count} articles",
    f"Saved to: {OUTPUT_FILE_NAME_2000}",
    sep="\n",
)

✓ Processed 2000 articles
Saved to: ../datasets/clean_data/bmw_articles_20251230_162547_2000articles_clean.jsonl


In [3]:
from chatbmw.data.procesor import process_dataset_to_chat_splits

# Stage 2: turn the cleaned articles into chat-style train/val/test splits.
INPUT_FILE_NAME_2000_CLEAN = '../datasets/clean_data/bmw_articles_20251230_162547_2000articles_clean.jsonl'
OUTPUT_DIR = '../datasets/chat_data_2000'
EXCLUDE_TYPES = ["Fact & Figures"]  # Types to exclude from training

# All options for the split call in one place. They are forwarded unchanged
# as keyword arguments, in this order.
chat_split_kwargs = {
    "input_path": INPUT_FILE_NAME_2000_CLEAN,
    "output_dir": OUTPUT_DIR,
    "exclude_types": EXCLUDE_TYPES,
    "tasks": ["summarization", "tag_extraction", "type_classification", "title_generation"],
    "test_size": 0.1,  # presumably a fraction of the filtered articles - confirm in the helper
    "val_size": 0.1,   # presumably a fraction of the filtered articles - confirm in the helper
    "seed": 42,
}

# `stats` is the mapping of counts and settings that the next cell prints.
stats = process_dataset_to_chat_splits(**chat_split_kwargs)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1964 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1964 [00:00<?, ? examples/s]

In [4]:
# Human-readable label for how the split was made. When the stats mapping has
# no 'stratified_split' entry, the label defaults to "stratified".
if stats.get('stratified_split', True):
    split_type = "stratified"
else:
    split_type = "random (fallback)"

# --- Article-level overview -------------------------------------------------
print(f"  Original articles: {stats['original_articles']}")
print(f"  After filtering: {stats['filtered_articles']} (excluded {stats['excluded_count']})")
print(f"  Article types: {stats['unique_types']}")
print(f"  Split method: {split_type}")

# --- Articles per split (leading "\n" leaves a blank line before the header) -
print("\n  Article splits:")
print(f"    Train: {stats['train_articles']} articles")
print(f"    Validation: {stats['val_articles']} articles")
print(f"    Test: {stats['test_articles']} articles")

# --- Chat conversations per split, plus the overall total --------------------
print("\n  Chat conversations:")
print(f"    Train: {stats['train_conversations']} conversations")
print(f"    Validation: {stats['val_conversations']} conversations")
print(f"    Test: {stats['test_conversations']} conversations")
print(f"    Total: {stats['total_conversations']} conversations")

# --- Where the splits were written ------------------------------------------
print(f"\n  Output saved to: {stats['output_dir']}")

  Original articles: 2000
  After filtering: 1964 (excluded 36)
  Article types: ['Press Kit', 'Press Release', 'Speech']
  Split method: stratified

  Article splits:
    Train: 1570 articles
    Validation: 197 articles
    Test: 197 articles

  Chat conversations:
    Train: 6264 conversations
    Validation: 787 conversations
    Test: 787 conversations
    Total: 7838 conversations

  Output saved to: ../datasets/chat_data_2000
