# In [None]:
import os
import random
import csv
from weasyprint import HTML
from pdf2image import convert_from_path
from PIL import Image, ImageFilter, ImageDraw, ImageFont

# Supported page formats; each entry carries the CSS page size as
# "<width> <height>".
page_styles = dict(
    A4=dict(size='210mm 297mm'),
    Letter=dict(size='216mm 279mm'),
)

# Input and output locations, given relative to the working directory.
text_dir = 'articles/text'              # source articles, one .txt per page
font_dir = 'fonts'                      # candidate .ttf/.otf files
output_dir = 'synthetic_arabic_pages'   # rendered PNGs and metadata.csv

# Create the output directory up front so later writes cannot fail on a
# missing folder; an already existing directory is left untouched.
os.makedirs(output_dir, exist_ok=True)

# HTML template
# Filled in with str.format(); the placeholders are font_name, font_path,
# page_size, background, columns, gradient and content. Literal CSS braces
# are doubled ("{{" / "}}") so that str.format() leaves them alone.
# The page size is declared in an @page rule: `size` is a page descriptor
# and has no effect when written as a property of `body`.
html_template = """
<!DOCTYPE html>
<html lang="ar" dir="rtl">
<head>
    <meta charset="UTF-8">
    <style>
        @page {{
            size: {page_size};
        }}
        @font-face {{
            font-family: '{font_name}';
            src: url('{font_path}');
        }}
        body {{
            font-family: '{font_name}', sans-serif;
            margin: 0;
            padding: 20px;
            background: {background};
        }}
        .content {{
            column-count: {columns};
            column-gap: 20px;
            background: linear-gradient({gradient});
            padding: 20px;
        }}
    </style>
</head>
<body>
    <div class="content">
        {content}
    </div>
</body>
</html>
"""


# Function to add noise
def add_noise(image):
    """Speckle an image with random black/white pixels, then blur it.

    Between 500 and 1000 single-pixel dots are drawn onto ``image`` itself
    (the drawing happens on the object that was passed in), after which a
    Gaussian blur with a radius drawn uniformly from [0, 2] is applied.

    Args:
        image: A PIL image. The RGB fill tuples used here presumably
            require an RGB-mode image -- confirm against the caller.

    Returns:
        The blurred image produced by ``image.filter``.
    """
    speckle_colors = [(0, 0, 0), (255, 255, 255)]
    canvas = ImageDraw.Draw(image)
    max_x = image.size[0] - 1
    max_y = image.size[1] - 1

    remaining = random.randint(500, 1000)
    while remaining > 0:
        px = random.randint(0, max_x)
        py = random.randint(0, max_y)
        canvas.point((px, py), fill=random.choice(speckle_colors))
        remaining -= 1

    blur_radius = random.uniform(0, 2)
    return image.filter(ImageFilter.GaussianBlur(blur_radius))

# Function to create HTML content
def create_html_content(text, font_path, font_name, page_size, columns, background, gradient):
    """Fill the page template with one article and its styling choices.

    Args:
        text: Plain text of the article. It is HTML-escaped before being
            inserted, so characters such as ``<`` and ``&`` are rendered
            literally instead of being parsed as markup.
        font_path: Value placed inside the ``@font-face`` ``url(...)``.
        font_name: Font family name used in the CSS.
        page_size: CSS page size, e.g. ``'210mm 297mm'``.
        columns: Number of text columns.
        background: CSS colour for the page background.
        gradient: Argument list for the CSS ``linear-gradient()`` applied
            to the content box.

    Returns:
        The complete HTML document as a string.
    """
    # Local import: only this function needs it.
    from html import escape

    # Escape first, then turn newlines into <br>, so the inserted line
    # breaks are the only markup that ends up in the content.
    content = escape(text, quote=False).replace('\n', '<br>')
    return html_template.format(
        font_path=font_path,
        font_name=font_name,
        page_size=page_size,
        columns=columns,
        background=background,
        gradient=gradient,
        content=content,
    )

# Function to convert HTML to image
def html_to_image(html_content, output_image_path):
    """Render HTML to a single degraded PNG page.

    The HTML is first written to an intermediate PDF next to the output
    image, rasterised with pdf2image, and the PDF is then deleted. When the
    document fits on one page, that page is blurred, speckled with noise,
    sheared by a small random amount and saved as PNG.

    Args:
        html_content: Complete HTML document as a string.
        output_image_path: Destination path of the PNG file.

    Returns:
        True if the document spans more than one page (no image is
        written in that case), False otherwise.
    """
    # Swap only the final extension. str.replace() would rewrite every
    # '.png' in the path and, for a path without '.png', would make the PDF
    # path equal to the image path.
    pdf_path = os.path.splitext(output_image_path)[0] + '.pdf'

    try:
        # Convert HTML to PDF. A base URL is required for relative URLs in
        # the document (such as the @font-face source) to be resolved; they
        # are resolved against the current working directory.
        HTML(string=html_content, base_url=os.getcwd()).write_pdf(pdf_path)

        # Convert PDF to a list of PIL images, one per page
        images = convert_from_path(pdf_path)
    finally:
        # Remove the intermediate PDF even if rendering or conversion failed
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    if len(images) > 1:
        return True

    for image in images:
        # Apply blur effect
        image = image.filter(ImageFilter.GaussianBlur(random.uniform(0, 2)))

        # Add speckle noise (add_noise applies a second blur of its own)
        image = add_noise(image)

        # Add skew: with coefficients (a, b, c, d, e, f) each output pixel
        # (x, y) is read from (a*x + b*y + c, d*x + e*y + f), so this shears
        # along both axes by the same factor.
        skew = random.uniform(-0.1, 0.1)
        image = image.transform(
            image.size,
            Image.AFFINE,
            (1, skew, 0, skew, 1, 0)
        )

        # Save image
        image.save(output_image_path, 'PNG')

    return False


# Read fonts (sorted so the candidate list does not depend on directory
# order; the extension check ignores case)
fonts = [
    os.path.join(font_dir, f)
    for f in sorted(os.listdir(font_dir))
    if f.lower().endswith(('.ttf', '.otf'))
]
if not fonts:
    # Fail early with a clear message instead of an IndexError from
    # random.choice() further down.
    raise FileNotFoundError(f"No .ttf/.otf fonts found in '{font_dir}'")

# Prepare metadata CSV
metadata_path = os.path.join(output_dir, 'metadata.csv')

# Images that already exist are skipped below, so the metadata file is
# appended to rather than truncated: rows written by earlier runs must
# survive. The header is only written when the file is new or empty.
needs_header = (
    not os.path.exists(metadata_path)
    or os.path.getsize(metadata_path) == 0
)

with open(metadata_path, mode='a', newline='', encoding='utf-8') as metadata_file:
    writer = csv.writer(metadata_file)
    if needs_header:
        writer.writerow(['Image Name', 'Text Content'])

    # Process each text file
    for text_file in sorted(os.listdir(text_dir)):
        if not text_file.endswith('.txt'):
            continue

        # Define output image path
        image_name = f"{os.path.splitext(text_file)[0]}.png"
        image_path = os.path.join(output_dir, image_name)

        if os.path.exists(image_path):
            print(f"Image {image_path} already exists")
            continue

        try:
            text_path = os.path.join(text_dir, text_file)
            with open(text_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Randomly select font and page style
            font_path = random.choice(fonts)
            font_name = os.path.splitext(os.path.basename(font_path))[0]
            style = random.choice(list(page_styles.values()))
            # Kept local so the shared page_styles entries are not mutated
            columns = random.choice([1])

            # Random background and gradient
            background = random.choice(['#ffffff', '#f0f0f0', '#e0e0e0'])
            gradient = f'to bottom, {background}, #d0d0d0'

            # Create HTML content
            html_content = create_html_content(
                text_content, font_path, font_name, style['size'], columns, background, gradient
            )

            # Generate image
            has_more_than_one_page = html_to_image(html_content, image_path)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest
            print(f"Error processing {text_file}: {e}")
            continue

        if has_more_than_one_page:
            print("Document has more than one page, skipping...")
            continue

        # Write metadata
        writer.writerow([image_name, text_content])

        print(f"Generated Image: {image_path}")

print(f"Metadata saved to: {metadata_path}")