In [1]:
!pip install requests img2pdf pillow pandas



In [2]:
import os
import json
import requests
import img2pdf
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import pandas as pd

# Image modes Pillow's JPEG encoder can write directly; anything else
# (e.g. RGBA, P, LA) must be converted first or ``save`` raises OSError.
_JPEG_SAFE_MODES = ("L", "RGB", "CMYK")


def _download_image_as_jpeg(img_url, timeout):
    """Download ``img_url`` and return its content re-encoded as JPEG bytes.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
        PIL.UnidentifiedImageError: if the payload is not a recognisable image.
        OSError / ValueError: if Pillow cannot decode or re-encode the image.
    """
    response = requests.get(img_url, timeout=timeout)
    # Fail early on 4xx/5xx so an HTML error page is not handed to Pillow.
    response.raise_for_status()

    jpeg_buffer = BytesIO()
    with Image.open(BytesIO(response.content)) as source:
        if source.mode in _JPEG_SAFE_MODES:
            source.save(jpeg_buffer, format='JPEG')
        else:
            # JPEG has no alpha/palette support; flatten to RGB first.
            source.convert('RGB').save(jpeg_buffer, format='JPEG')
    return jpeg_buffer.getvalue()


def process_qa_jsonl(file_path, output_directory, eval_dir, timeout=30):
    """Build one PDF per deck and a question/answer CSV from a JSONL file.

    Each non-blank line of ``file_path`` is parsed as a JSON object from which
    the keys ``deck_name``, ``image_urls``, ``question`` and ``answer`` are
    read. The images listed in ``image_urls`` are downloaded, re-encoded as
    JPEG and combined into ``<output_directory>/<deck_name>.pdf``. All
    question/answer pairs are written to
    ``<eval_dir>/questions_answers.csv``.

    Images that cannot be downloaded or decoded are skipped with a printed
    message; a deck with no usable images produces no PDF.

    Args:
        file_path: Path to the input JSONL file.
        output_directory: Directory for the generated PDFs (created if missing).
        eval_dir: Directory for the CSV file (created if missing).
        timeout: Seconds to wait for each image request before giving up.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    os.makedirs(output_directory, exist_ok=True)
    # The CSV is written here at the end; create it up front so a long run
    # does not fail on the very last step.
    os.makedirs(eval_dir, exist_ok=True)
    questions_answers = []

    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            deck_name = data['deck_name']
            image_urls = data['image_urls']
            question = data['question']
            answer = data['answer']

            questions_answers.append({'question': question, 'answer': answer})

            output_path = os.path.join(output_directory, f"{deck_name}.pdf")

            # Fetch image data from URLs, skipping any that fail.
            image_data = []
            for img_url in image_urls:
                try:
                    image_data.append(_download_image_as_jpeg(img_url, timeout))
                except requests.RequestException as exc:
                    print(f"Skipping image that failed to download: {img_url} ({exc})")
                except (UnidentifiedImageError, ValueError):
                    print(f"Skipping unidentified image: {img_url}")
                except OSError as exc:
                    # Truncated/corrupt data or an encode failure inside Pillow.
                    print(f"Skipping unreadable image: {img_url} ({exc})")

            # Convert the list of image data to a single PDF file
            if image_data:
                pdf_data = img2pdf.convert(image_data)

                # Distinct name so the JSONL handle being iterated is not shadowed.
                with open(output_path, "wb") as pdf_file:
                    pdf_file.write(pdf_data)
            else:
                print(f"Skipping PDF creation for {deck_name} due to no valid images.")

    # Create a DataFrame and save it as a CSV file
    df = pd.DataFrame(questions_answers)
    df.to_csv(os.path.join(eval_dir, 'questions_answers.csv'), index=False)

# Input JSONL file, PDF output folder, and folder for the evaluation CSV.
qa_jsonl_path, output_directory, eval_dir = 'qa.jsonl', 'data', 'eval_data'

# Run the conversion: PDFs are written under `output_directory`, the
# question/answer CSV under `eval_dir`.
process_qa_jsonl(
    file_path=qa_jsonl_path,
    output_directory=output_directory,
    eval_dir=eval_dir,
)


Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/95/organizing-for-agile-bi-1-1024.jpg
Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/95/organizing-for-agile-bi-2-1024.jpg
Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/95/organizing-for-agile-bi-3-1024.jpg
Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/95/organizing-for-agile-bi-4-1024.jpg
Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/95/organizing-for-agile-bi-5-1024.jpg
Skipping unidentified image: https://image.slidesharecdn.com/0903organizingforbiandbigdatainthe21stcentury-clean-140922112024-phpapp02/