In [3]:
import pytesseract
from PIL import Image
import pandas as pd
import openpyxl
from io import BytesIO

def extract_text_from_image(image_bytes):
    """
    Run Tesseract OCR over an in-memory image and return the recognized text.

    Args:
        image_bytes: Encoded image file contents; wrapped in a BytesIO so
            PIL can open it like a file.

    Returns:
        The string produced by pytesseract.image_to_string, with leading
        and trailing whitespace removed.
    """
    buffer = BytesIO(image_bytes)
    recognized = pytesseract.image_to_string(Image.open(buffer))
    return recognized.strip()

def process_reports_xlsx(xlsx_path):
    """
    Process data from the 'reports.xlsx' file and extract required fields.

    The active worksheet is read starting at the second row (the first row
    is skipped as a header). For every row, the value in the second column
    is handed to extract_text_from_image and the resulting text is used to
    populate one output record. Rows whose second column is empty or absent
    are skipped, since there is nothing to run OCR on.

    Args:
        xlsx_path: Path of the workbook, passed to openpyxl.load_workbook.

    Returns:
        A list of 7-tuples:
        (pattern_string, comment, pattern_category, pattern_type,
        where_in_website, deceptive, website_page).
    """
    data = []
    wb = openpyxl.load_workbook(xlsx_path, data_only=True)
    worksheet = wb.active
    for row in worksheet.iter_rows(min_row=2, values_only=True):
        # With values_only=True each row is a tuple of plain cell values,
        # not Cell objects, so the value is read directly (no `.value`).
        image_bytes = row[1] if len(row) > 1 else None
        if image_bytes is None:
            # Empty cell or a sheet with fewer than two columns: no image data.
            continue
        # NOTE(review): presumably this column holds encoded image bytes;
        # confirm how images are actually stored in the workbook.
        text = extract_text_from_image(image_bytes)
        # Process the extracted text and generate the required fields
        # Here, you can implement your logic to extract pattern string, category, type, website location, etc.
        pattern_string = text  # Placeholder, replace with your logic
        comment = "Sample comment"  # Placeholder, replace with your logic
        pattern_category = "Sample category"  # Placeholder, replace with your logic
        pattern_type = "Sample type"  # Placeholder, replace with your logic
        where_in_website = "Sample location"  # Placeholder, replace with your logic
        deceptive = "Yes" if "deceptive" in text.lower() else "No"  # Example: Check if the text contains 'deceptive'
        website_page = "Sample page"  # Placeholder, replace with your logic
        data.append((pattern_string, comment, pattern_category, pattern_type, where_in_website, deceptive, website_page))
    return data

def save_to_csv(data, csv_path):
    """
    Write the extracted records to a CSV file with a fixed header row.

    Args:
        data: Sequence of 7-item records, one per output row, in the same
            order as the column headers below.
        csv_path: Destination path handed to DataFrame.to_csv.
    """
    headers = [
        "Pattern String",
        "Comment",
        "Pattern Category",
        "Pattern Type",
        "Where in website?",
        "Deceptive?",
        "Website Page",
    ]
    frame = pd.DataFrame(data, columns=headers)
    # index=False keeps the DataFrame's row index out of the file.
    frame.to_csv(csv_path, index=False)

if __name__ == "__main__":
    # Script entry point: read the workbook, then write the CSV.
    # Both paths are relative, so they resolve against the current working directory.
    reports_xlsx_path = "reports.xlsx"
    csv_output_path = "extracted_data.csv"

    # Build one output record per data row of the workbook's active sheet.
    extracted_data = process_reports_xlsx(reports_xlsx_path)

    # Write the records (plus the header row) to the CSV file.
    save_to_csv(extracted_data, csv_output_path)

    # Printed unconditionally once both steps return, even when no records were extracted.
    print("Data extraction and CSV creation completed.")


Data extraction and CSV creation completed.
