## Generování syntetických dat

In [1]:
import pandas as pd
import random
from faker import Faker
import numpy as np
import calendar
from datetime import date

In [None]:
# Initialise Faker with the Czech locale so generated values look local
faker = Faker("cs_CZ")

# Registry of per-company identifiers, keyed by company name.
# 'FinDoc AI' is the company from whose point of view invoices are generated.
firm_info = {}
all_companies = [
    'IT vývoj s.r.o.', 'IT podpora a.s.', 'IT cloud s.r.o.',
    'Plyn Servis a.s.', 'Elektřina s.r.o.', 'Voda a Služby s.r.o.',
    'Personál Services s.r.o.', 'Logistika a.s.', 'Autopark s.r.o.',
    'PHM distribuce s.r.o.', 'Právo & Partners s.r.o.', 'Poradenství s.r.o.',
    'Nájem a správa s.r.o.', 'CloudEra a.s.', 'Komunikační Platformy s.r.o.',
    'Nová Generace IT s.r.o.', 'Bankovnictví a.s.', 'Finanční Group s.r.o.',
    'Technologie s.r.o.', 'AI StartUp s.r.o.', 'Virtuální Kancelář s.r.o.',
    'FinDoc AI'
]

for company in all_companies:
    firm_info[company] = {
        # A Czech IČO has exactly 8 digits (not 9). It is kept as an int
        # without leading zeros so the digit count survives a CSV round-trip.
        'ico': faker.random_int(10000000, 99999999),
        # NOTE(review): for legal entities the DIČ is normally 'CZ' + IČO;
        # it is left independent here to keep the existing data shape — confirm.
        'dic': f'CZ{faker.random_int(100000000, 999999999)}',
        'account': faker.iban()
    }

# Transaction categories.
# Top level: transaction type ('Výdaje' = expenses, 'Příjmy' = income).
# Second level: category name -> list of counterparty companies that
# generate_invoices may pick for that category.
categories = {
    'Výdaje': {
        'IT': ['IT vývoj s.r.o.', 'IT podpora a.s.', 'IT cloud s.r.o.'],
        'Energie': ['Plyn Servis a.s.', 'Elektřina s.r.o.', 'Voda a Služby s.r.o.'],
        'Mzdy': ['Personál Services s.r.o.'],
        'Doprava': ['Logistika a.s.', 'Autopark s.r.o.', 'PHM distribuce s.r.o.'],
        'Služby': ['Právo & Partners s.r.o.', 'Poradenství s.r.o.', 'Nájem a správa s.r.o.']
    },
    'Příjmy': {
        'SaaS': ['CloudEra a.s.', 'Komunikační Platformy s.r.o.', 'Nová Generace IT s.r.o.'],
        'Konzultace': ['Bankovnictví a.s.', 'Finanční Group s.r.o.'],
        'Licence': ['Technologie s.r.o.', 'AI StartUp s.r.o.', 'Virtuální Kancelář s.r.o.']
    }
}

def generate_invoices(n=2040):
    """Generate ``n`` synthetic invoices and return them as a DataFrame.

    Each invoice is either issued by 'FinDoc AI' (income, chosen with
    probability 0.6) or received by it (expense). With probability 0.09 a
    row is additionally passed through ``add_anomaly``.

    Args:
        n: Number of invoices to generate.

    Returns:
        pandas.DataFrame with one row per invoice.
    """
    records = []

    for sequence_number in range(1, n + 1):
        # Direction of the transaction, then category and counterparty
        is_income = random.random() < 0.6
        category_type = 'Příjmy' if is_income else 'Výdaje'
        category_pool = categories[category_type]
        category = random.choice(list(category_pool.keys()))
        company = random.choice(category_pool[category])

        # 'FinDoc AI' is the supplier on income invoices, the customer otherwise
        if is_income:
            supplier_name, customer_name = 'FinDoc AI', company
        else:
            supplier_name, customer_name = company, 'FinDoc AI'

        # Issue date drawn from calendar year 2024
        issued_on = pd.to_datetime(
            faker.date_between(start_date=date(2024, 1, 1), end_date=date(2024, 12, 31))
        )

        supplier = firm_info[supplier_name]
        customer = firm_info[customer_name]

        # Amount, variable symbol and free-text items
        total_amount = round(random.uniform(500, 500000), 2)
        variable_symbol = faker.random_number(digits=10)
        item_count = random.randint(1, 5)
        items = '; '.join(faker.bs() for _ in range(item_count))

        # Due 7-30 days after issue; paid either on the due date or 1-90 days late
        due_date = issued_on + pd.Timedelta(days=random.randint(7, 30))
        late_by = random.randint(1, 90)
        delay = random.choice([0, late_by])
        payment_date = due_date + pd.Timedelta(days=delay)

        # Flag invoices issued on the last calendar day of their month
        _, days_in_month = calendar.monthrange(issued_on.year, issued_on.month)

        invoice = {
            'invoice_id': f"24{sequence_number:07d}",
            'supplier_name': supplier_name,
            'supplier_ico': supplier['ico'],
            'supplier_dic': supplier['dic'],
            'supplier_account': supplier['account'],
            'customer_name': customer_name,
            'customer_ico': customer['ico'],
            'customer_dic': customer['dic'],
            'category': category,
            'invoice_date': issued_on,
            'due_date': due_date,
            'total_amount': total_amount,
            'currency': 'CZK',
            'transaction_type': 'Příjmy' if supplier_name == 'FinDoc AI' else 'Výdaje',
            'variable_symbol': variable_symbol,
            'note': f"Faktura za: {category}",
            'items': items,
            'payment_status': 'Delayed' if delay > 0 else 'Paid',
            'delay_days': delay,
            'payment_date': payment_date,
            'is_month_end': issued_on.day == days_in_month,
            'is_anomaly': False,
            'anomaly_type': None
        }

        # Inject an anomaly into a small share of the invoices
        if random.random() < 0.09:
            invoice = add_anomaly(invoice, is_income)

        records.append(invoice)

    return pd.DataFrame(records)

def add_anomaly(invoice, is_income):
    """Turn ``invoice`` into one of four anomaly variants and return it.

    The dict is modified in place: ``is_anomaly`` is set to True,
    ``anomaly_type`` to the chosen variant, and the fields that make up the
    anomaly (items, total, due date or note) are overwritten. Whenever the
    due date is replaced, ``payment_date`` is recomputed from ``delay_days``
    so that the two stay consistent.

    Args:
        invoice: Invoice dict as built by ``generate_invoices``.
        is_income: True for an income invoice, False for an expense; selects
            the category group used for the 'Unusual Service' variant.

    Returns:
        The same ``invoice`` dict.
    """
    anomaly_type = random.choices(
        ['Items Total Mismatch + Unusual Due Date', 'High Value + Short Due Date',
         'Unusual Number of Items', 'Unusual Service for Customer'],
        weights=[0.35, 0.35, 0.2, 0.1]
    )[0]

    invoice['is_anomaly'] = True
    invoice['anomaly_type'] = anomaly_type
    due_date_changed = False

    if anomaly_type == 'Items Total Mismatch + Unusual Due Date':
        # Keep the amounts as numbers instead of parsing them back out of the
        # rendered text; rounding first matches what the "%.2f" text shows.
        amounts = [round(random.uniform(100, 10000), 2)
                   for _ in range(random.randint(1, 5))]
        invoice['items'] = '; '.join(f"{faker.bs()} ({amount:.2f} CZK)"
                                     for amount in amounts)
        # The stated total is deliberately 10 % below the sum of the items
        invoice['total_amount'] = round(sum(amounts) * 0.9, 2)
        invoice['due_date'] = invoice['invoice_date'] + pd.DateOffset(days=random.choice([1, 2, 60, 90]))
        due_date_changed = True

    elif anomaly_type == 'High Value + Short Due Date':
        invoice['total_amount'] = round(random.uniform(400000, 1000000), 2)
        invoice['due_date'] = invoice['invoice_date'] + pd.DateOffset(days=random.randint(1, 3))
        due_date_changed = True

    elif anomaly_type == 'Unusual Number of Items':
        invoice['items'] = '; '.join([faker.bs() for _ in range(random.randint(15, 20))])

    elif anomaly_type == 'Unusual Service for Customer':
        # Prefer the structured field; fall back to the note text for dicts
        # that carry no 'category' key.
        current_category = invoice.get('category')
        if current_category is None:
            current_category = invoice['note'].split(': ')[1]
        group = 'Příjmy' if is_income else 'Výdaje'
        new_category = random.choice([c for c in categories[group] if c != current_category])
        # Only the note changes; 'category' keeps the original value
        invoice['note'] = f"Faktura za: {new_category}"

    # payment_date was derived from the old due date; re-derive it so that
    # payment_date - due_date == delay_days still holds.
    if due_date_changed and invoice.get('delay_days') is not None:
        invoice['payment_date'] = invoice['due_date'] + pd.Timedelta(days=int(invoice['delay_days']))

    return invoice

# Generate the data: a 50,000-row training set and a 2,040-row project set,
# each written to its own CSV file (without the DataFrame index).
synthetic_data = generate_invoices(50000)
synthetic_data.to_csv("synthetic_train_data.csv", index=False)

synthetic_project_data = generate_invoices(2040)
synthetic_project_data.to_csv("synthetic_project_data.csv", index=False)

In [2]:
# Load the project dataset from its CSV file for the PDF export below
df_projekt = pd.read_csv("synthetic_project_data.csv")

## Uložení náhodných faktur do PDF - nutno nastavit počet faktur

In [None]:
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import re

# Register TrueType fonts so that Czech characters can be drawn in the PDFs.
# NOTE(review): these are absolute system font paths (they look macOS-specific)
# — adjust them when running on another platform.
pdfmetrics.registerFont(TTFont('Arial', '/System/Library/Fonts/Supplemental/Arial.ttf'))
pdfmetrics.registerFont(TTFont('Arial-Bold', '/System/Library/Fonts/Supplemental/Arial Bold.ttf'))

def parse_item_and_amount(item):
    """Split an invoice item into its description and its amount.

    An amount is recognised only as a numeric value followed by ``CZK``
    inside parentheses, e.g. ``"Consulting (1 234,50 CZK)"``. Whitespace
    inside the number (including non-breaking spaces) is ignored and a
    decimal comma is accepted. Other parenthesised text is left untouched
    in the description.

    Args:
        item: Raw item text.

    Returns:
        ``(description, amount)`` where ``amount`` is a float, or ``None``
        when the item carries no parsable amount.
    """
    # Requiring digits inside the group keeps float() from ever seeing
    # non-numeric text such as "(n/a CZK)" or an earlier "(phase 2)".
    match = re.search(r'\(\s*([-+]?\d[\d\s]*(?:[.,]\d+)?)\s*CZK\)', item)
    if match is None:
        return item.strip(), None

    amount_str = re.sub(r'\s', '', match.group(1)).replace(',', '.')
    amount = float(amount_str)

    # Cut out only the amount group and rejoin the text around it with a
    # single space so neighbouring words are not glued together.
    before = item[:match.start()].rstrip()
    after = item[match.end():].lstrip()
    desc = f"{before} {after}".strip()
    return desc, amount

def distribute_amounts(n, total):
    """Split ``total`` into ``n`` random amounts that add up to ``total``.

    Random weights decide the proportions. The first ``n - 1`` parts are
    rounded to two decimals and the last part takes whatever remains, so
    rounding error cannot make the parts drift away from ``total``.

    Args:
        n: Number of parts; zero or negative yields an empty list.
        total: Amount to distribute.

    Returns:
        List of ``n`` amounts rounded to two decimals.
    """
    if n <= 0:
        return []
    weights = [random.uniform(100, 5000) for _ in range(n)]
    weight_sum = sum(weights)
    parts = [round(weight * total / weight_sum, 2) for weight in weights[:-1]]
    # The last part absorbs the accumulated rounding residual
    parts.append(round(total - sum(parts), 2))
    return parts

def _draw_section(c, y_position, heading, lines):
    """Draw a bold heading followed by regular lines; return the y for the next section."""
    c.setFont("Arial-Bold", 12)
    c.drawString(50, y_position, heading)
    c.setFont("Arial", 10)
    for line in lines:
        y_position -= 15
        c.drawString(50, y_position, line)
    return y_position - 40


def generate_pdf_invoices(df, num_invoices=200):
    """Render a random sample of invoices from ``df`` as PDF files.

    One file named ``faktura_<invoice_id>.pdf`` is written per sampled row,
    using a relative path. The fonts 'Arial' and 'Arial-Bold' must already
    be registered with reportlab.

    Items without an explicit amount get one generated: if no item on the
    invoice has an amount, the invoice total is split across the items;
    otherwise each such item receives a random amount.

    Args:
        df: DataFrame of invoices with the columns produced by
            ``generate_invoices``.
        num_invoices: Number of invoices to render; capped at ``len(df)``.
    """
    # df.sample raises ValueError when asked for more rows than df contains
    sample_invoices = df.sample(n=min(num_invoices, len(df)))

    for _, invoice in sample_invoices.iterrows():
        file_name = f"faktura_{invoice['invoice_id']}.pdf"
        c = canvas.Canvas(file_name, pagesize=A4)
        y_position = 800  # starting Y position near the top of the page

        # Header
        c.setFont("Arial-Bold", 16)
        c.drawString(50, y_position, "FAKTURA - DAŇOVÝ DOKLAD")
        y_position -= 30

        # Invoice number
        c.setFont("Arial-Bold", 12)
        c.drawString(50, y_position, f"Číslo faktury: {invoice['invoice_id']}")
        y_position -= 40

        # Supplier
        y_position = _draw_section(c, y_position, "Dodavatel:", [
            invoice['supplier_name'],
            f"IČO: {invoice['supplier_ico']}",
            f"DIČ: {invoice['supplier_dic']}",
            f"Č. účtu: {invoice['supplier_account']}",
        ])

        # Customer
        y_position = _draw_section(c, y_position, "Odběratel:", [
            invoice['customer_name'],
            f"IČO: {invoice['customer_ico']}",
            f"DIČ: {invoice['customer_dic']}",
        ])

        # Billing details
        y_position = _draw_section(c, y_position, "Detaily fakturace:", [
            f"Variabilní symbol: {invoice['variable_symbol']}",
            f"Datum vystavení: {pd.to_datetime(invoice['invoice_date']).strftime('%d.%m.%Y')}",
            f"Datum splatnosti: {pd.to_datetime(invoice['due_date']).strftime('%d.%m.%Y')}",
        ])

        # Note line above the item list
        c.setFont("Arial", 10)
        c.drawString(50, y_position, f"{invoice['note']}")
        y_position -= 20

        # Parse the items and separate those that already carry an amount
        parsed_items = [parse_item_and_amount(item) for item in invoice['items'].split('; ')]
        items_with_amount = [(desc, amt) for desc, amt in parsed_items if amt is not None]
        items_without_amount = [desc for desc, amt in parsed_items if amt is None]

        # Start empty for every invoice: when all items are priced nothing is
        # generated, and the name must neither be unbound nor carry values
        # over from the previous invoice.
        generated_amounts = []
        if items_without_amount:
            if items_with_amount:
                generated_amounts = [random.uniform(100, 5000) for _ in items_without_amount]
            else:
                generated_amounts = distribute_amounts(len(items_without_amount), invoice['total_amount'])

        # Priced items first, then the ones with generated amounts
        rows = [*items_with_amount, *zip(items_without_amount, generated_amounts)]
        for desc, amount in rows:
            c.drawString(50, y_position, desc)
            c.drawString(400, y_position, f"{amount:.2f} CZK")
            y_position -= 20

        # Grand total
        c.setFont("Arial-Bold", 12)
        c.drawString(50, y_position-40, f"Celkem: {invoice['total_amount']:.2f} CZK")

        c.save()

# Run the PDF export on the loaded project data (default number of invoices)
generate_pdf_invoices(df_projekt)
