In [1]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from tqdm import tqdm
from pathlib import Path
from io import BytesIO
from PIL import Image as PILImage  # Correct import for PIL Image

# ReportLab imports
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import Paragraph, Spacer, Image, PageBreak, SimpleDocTemplate
import datetime

# Running figure number used when captioning figures in the report.
# A `global` statement at module level has no effect, so the name is simply
# assigned here; functions that rebind it must declare `global figure_counter`
# inside their own body.
figure_counter = 1

# Methods
def createTextObject(text, style, centered=False):
    """
    Build a ReportLab Paragraph from ``text``.

    When ``centered`` is truthy the paragraph uses a derived style named
    "Centered" whose parent is ``style`` and whose alignment is 1 (centre);
    otherwise ``style`` is used as given.
    """
    paragraph_style = (
        ParagraphStyle(name="Centered", parent=style, alignment=1)  # 1 = TA_CENTER
        if centered
        else style
    )
    return Paragraph(text, paragraph_style)

def generate_violin_plot(df, column, group_by=None):
    """
    Render a violin plot of ``df[column]`` and return it as a PNG buffer.

    With a truthy ``group_by`` one violin is drawn per value of that column;
    otherwise a single violin is drawn for the whole column.

    Note: a later cell of this notebook defines another function with the
    same name, which replaces this one once that cell has run.
    """
    plt.figure(figsize=(6, 4))
    if group_by:
        violin_kwargs = {"data": df, "x": group_by, "y": column}
    else:
        violin_kwargs = {"data": df[column]}
    sns.violinplot(**violin_kwargs)
    plt.tight_layout()

    # Serialise the current figure into memory and rewind for the reader.
    png_buffer = BytesIO()
    plt.savefig(png_buffer, format="PNG")
    plt.close()
    png_buffer.seek(0)
    return png_buffer

from io import BytesIO
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def generate_boxplot_with_stripplot(df, column, group_by=None):
    """
    Draw a boxplot of ``column`` overlaid with every individual data point.

    The box is drawn without fliers. Each point is then plotted on top with a
    strip plot: points farther than 1.5 * IQR from the quartiles are red, the
    remaining points are black. With ``group_by`` the quartiles are computed
    separately for every group.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        column (str): The column to plot.
        group_by (str, optional): The column to group by. If falsy, the whole
            column is plotted as a single box.

    Returns:
        BytesIO: A buffer, rewound to position 0, holding the figure as PNG.
    """

    def split_by_iqr(values):
        """Return ``(inliers, outliers)`` of ``values`` under the 1.5 * IQR rule."""
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        low = first_quartile - 1.5 * spread
        high = third_quartile + 1.5 * spread
        inside = values[(values >= low) & (values <= high)]
        outside = values[(values < low) | (values > high)]
        return inside, outside

    def overlay_points(position, values):
        """Strip-plot ``values`` at ``position``: inliers black, outliers red."""
        inside, outside = split_by_iqr(values)
        sns.stripplot(x=[position] * len(inside), y=inside, color="black", alpha=0.5, jitter=True)
        sns.stripplot(x=[position] * len(outside), y=outside, color="red", alpha=0.5, jitter=True)

    plt.figure(figsize=(8, 6))

    if group_by:
        # One box per group, then the points of each group on top of its box.
        sns.boxplot(data=df, x=group_by, y=column, hue=group_by, palette="Set2", legend=False, showfliers=False)
        for group_label in df[group_by].unique():
            overlay_points(group_label, df[df[group_by] == group_label][column])
        plt.title(f"Boxplot with Stripplot of {column} grouped by {group_by}")
    else:
        # Single box; all points share the constant x value 0.
        sns.boxplot(data=df, y=column, color="lightblue", showfliers=False)
        overlay_points(0, df[column])
        plt.title(f"Boxplot with Stripplot of {column}")
        plt.xlabel("")  # No x-axis label needed when not grouped

    plt.ylabel(column)
    plt.tight_layout()

    # Serialise the figure into memory and rewind so callers can read it.
    png_buffer = BytesIO()
    plt.savefig(png_buffer, format="PNG", bbox_inches="tight")
    plt.close()
    png_buffer.seek(0)
    return png_buffer



# Constants
# Maps each metric code to the file name of the protocol image that
# illustrates it. (Module-level `global` statements are no-ops and were
# dropped; these are ordinary module-level names.)
metric_image = {
    # Head measurements
    "A1": "Head_A1.png",
    "A2": "Head_A2.png",
    "A3": "Head_A3.png",
    "A4": "Head_A4.png",
    "A5": "Head_A5.png",
    # Pronotum measurements
    "B1": "Pronotum_B1.png",
    "B2": "Pronotum_B2.png",
    "B3": "Pronotum_B3.png",
    "B4": "Pronotum_B4.png",
    "B5": "Pronotum_B5.png",
    "C1": "Lateral_C1.png",
    # Mesosternal process measurements
    "D1": "Mesosternal_process_D1.png",
    "D2": "Mesosternal_process_D2.png",
    "D3": "Mesosternal_process_D3.png",
    "D4": "Mesosternal_process_D4.png",
    # Prosternal process measurements
    "E1": "Prosternal_process_E1.png",
    "E2": "Prosternal_process_E2.png",
    # Ventral plate measurements
    "F1": "Ventral.png",
    "F2": "Ventral.png",
    "F3": "Ventral.png",
    "F4": "Ventral.png",
    "F5": "Ventral.png",
    # Derived ratios: W2-W4 are head ratios (A*/A3), W5-W7 are pronotum
    # ratios (B4/B*), matching how feature_engineering computes them.
    "W2": "Head.png",
    "W3": "Head.png",
    "W4": "Head.png",
    "W5": "Pronotum.png",  # was "Head.png", but W5 = B4/B1 is a pronotum ratio
    "W6": "Pronotum.png",
    "W7": "Pronotum.png",
    "W9": "Mesosternal_process_D4.png",
    "W10": "Mesosternal_process_D4.png",
    "W13": "Prosternal_process.png",
}

# Location of images
# NOTE(review): absolute, machine-specific path; consider making it configurable.
protocol_image_location = Path(r"F:\BIOMETRY_PNG\20250110\Protocolo")

# --- Saving and Loading the Dictionary ---

def save_metric_image(dictionary, filepath):
    """
    Write ``dictionary`` to ``filepath`` as UTF-8 JSON indented by four spaces
    and print a confirmation line.

    Args:
        dictionary (dict): The dictionary to save.
        filepath (str or Path): Destination file; it is created or overwritten.
    """
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(dictionary, indent=4))
    print(f"Dictionary saved to {filepath}")

def load_metric_image(filepath):
    """
    Read a JSON document from ``filepath`` (UTF-8), print a confirmation line
    and return the decoded object.

    Args:
        filepath (str or Path): The path to the JSON file.

    Returns:
        dict: The loaded dictionary.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    loaded_dict = json.loads(raw_text)
    print(f"Dictionary loaded from {filepath}")
    return loaded_dict

# Round-trip check for the metric -> image mapping.
# The JSON file is written relative to the current working directory.
metric_image_file = Path("metric_image.json")

# Persist the in-memory mapping (creates or overwrites the file).
save_metric_image(metric_image, metric_image_file)

# Read the file straight back into a second dictionary.
metric_image_loaded = load_metric_image(metric_image_file)

# Sanity check: what was read back must equal what was written.
assert metric_image == metric_image_loaded, "The loaded dictionary does not match the original!"


Dictionary saved to metric_image.json
Dictionary loaded from metric_image.json


In [5]:
def feature_engineering(measurement_df):
    """
    Add the derived ratio metrics (W2 ... W13) to ``measurement_df``.

    The function prints a warning for every expected column that is absent,
    prints ``measurement_df.describe()``, and then writes the ratio columns
    listed below. The columns are added to ``measurement_df`` itself (in
    place) and that same object is returned.

    Args:
        measurement_df (pd.DataFrame): Raw measurements with the columns
            A1-A5, B1-B5, C1, D1-D4, E1-E2, F1-F5 and ``code``.

    Returns:
        pd.DataFrame: ``measurement_df`` with the W* columns added.

    Raises:
        KeyError: If a column needed for one of the ratios is missing.
    """
    # "A5" is listed because W4 = A5/A3 needs it; without it the missing-column
    # warning was never printed before the KeyError.
    required_columns = [
        "A1", "A2", "A3", "A4", "A5",
        "B1", "B2", "B3", "B4", "B5",
        "C1",
        "D1", "D2", "D3", "D4",
        "E1", "E2",
        "F1", "F2", "F3", "F4", "F5",
        "code",
    ]

    # Warn (but do not stop) for every expected column that is absent.
    for column in required_columns:
        if column not in measurement_df.columns:
            print(f"Warning: Missing column {column} in the DataFrame.")

    # Descriptive statistics (count, mean, std, min, quartiles, max).
    print("Summary Statistics:")
    print(measurement_df.describe())

    # (new column, numerator, denominator), in the order they are added.
    ratio_definitions = [
        # Head, relative to the canthus-to-canthus width A3
        ("W2", "A1", "A3"),
        ("W3", "A4", "A3"),
        ("W4", "A5", "A3"),
        # Pronotum length B4 relative to its front / middle / hind widths
        ("W5", "B4", "B1"),
        ("W6", "B4", "B2"),
        ("W7", "B4", "B3"),
        # Mesosternal process length D2 relative to D1 and D3
        ("W9", "D2", "D1"),
        ("W10", "D2", "D3"),
        # Prosternal plate: top width over bottom width
        ("W13", "E1", "E2"),
    ]
    for new_column, numerator, denominator in ratio_definitions:
        measurement_df[new_column] = measurement_df[numerator] / measurement_df[denominator]

    return measurement_df

# Load the tab-separated measurement summary (decimal comma) produced earlier.
date = datetime.date.today()
# To load today's file instead of the pinned one, use:
#file_path = f'summary just png files {date}.csv'
file_path = "summary just png files 2025-02-16.csv"
measurement_df = pd.read_csv(file_path, sep='\t', decimal=',', header=0)

# Quick look at the first rows to confirm the file parsed as expected.
print(measurement_df.head())

# Add the derived ratio columns (W*) to the raw measurements.
measurement_df_2 = feature_engineering(measurement_df)

# Same data with "code" moved to the first column.
columns = [col for col in measurement_df_2.columns if col != "code"]
metrics_df = measurement_df_2[["code"] + columns].copy()

# Export the metrics as a tab-separated file using a decimal point.
filename = "metrics.csv"
metrics_df.to_csv(filename, index=False, sep="\t", decimal=".")
print(f"New metrics saved to {filename}")

        A1       A2       A3       A4       A5       B1       B2       B3  \
0  4120.58  3511.41  4674.61  1430.02  1958.09  5525.11  7967.86  8652.41   
1  4117.20  3529.93  4475.22  1462.50  1922.14  5352.33  8099.20  8615.37   
2  3971.18  3584.00  4563.49  1402.85  1924.31  5360.70  8185.66  8495.90   
3  3627.85  3538.61  4585.22  1475.45  2028.42  5328.44  8139.54  8178.22   
4      NaN      NaN      NaN      NaN      NaN  5326.82  7841.55  8184.22   

        B4      B5  ...       D3      D4      E1      E2       F1       F2  \
0  5101.49  155.26  ...  2132.02  680.01  579.64  713.80  1385.64  1399.18   
1  5225.63  148.98  ...  2278.68  650.62  507.14  848.75      NaN  1265.19   
2  5436.72  149.16  ...  2275.50  828.78  420.73  860.66  1424.02  1374.55   
3  5299.74  148.93  ...  2159.79  628.63  476.10  824.69  1367.70  1427.33   
4  5172.77  150.12  ...  2186.98  586.34  444.66  640.77  1391.86  1327.51   

        F3       F4       F5           code  
0  1475.44  1857.42  1

In [6]:
import pandas as pd

def detect_outliers(df):
    """
    Find outliers in every numeric column of ``df`` (the 'code' column is
    never analysed, even if it is numeric).

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column.

    Returns:
        pd.DataFrame: One row per outlier with the columns ``code`` (value of
        the 'code' column for that row), ``column``, ``outlier_value``,
        ``median``, ``lower_whisker`` and ``upper_whisker``. The columns are
        present even when no outlier is found, so callers can always select
        them.

    Raises:
        KeyError: If an outlier is found and ``df`` has no 'code' column.
    """
    result_columns = ['code', 'column', 'outlier_value', 'median', 'lower_whisker', 'upper_whisker']

    numeric_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'code']

    outlier_records = []
    for col in numeric_cols:
        values = df[col]
        median = values.median()
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr

        # NaN compares False on both sides, so missing values are never flagged.
        is_outlier = (values < lower_whisker) | (values > upper_whisker)
        if not is_outlier.any():
            continue

        for code, value in zip(df.loc[is_outlier, 'code'], values[is_outlier]):
            outlier_records.append({
                'code': code,
                'column': col,
                'outlier_value': value,
                'median': median,
                'lower_whisker': lower_whisker,
                'upper_whisker': upper_whisker,
            })

    # Passing `columns` keeps the schema stable for an empty record list.
    return pd.DataFrame(outlier_records, columns=result_columns)

detect_outliers(metrics_df)

Unnamed: 0,code,column,outlier_value,median,lower_whisker,upper_whisker
0,CICIMAUCR0232,A1,3492.88,4117.2,3520.86,4739.42
1,CICIMAUCR0204,A5,14.0,1696.3,750.17,2795.37
2,CICIMAUCR0252,C1,164.38,151.105,139.7325,163.3725
3,CICIMAUCR0233,D2,884.71,671.405,456.335,882.875
4,CICIMAUCR0215,D3,1554.09,2282.12,1619.62625,2881.93625
5,CICIMAUCR0242,D3,1495.3,2282.12,1619.62625,2881.93625
6,CICIMAUCR0261,F5,2485.28,1639.36,929.075,2346.715
7,CICIMAUCR0232,W2,0.729411,0.880972,0.746008,1.000297
8,CICIMAUCR0204,W4,0.00303,0.376002,0.167276,0.586846
9,CICIMAUCR0248,W6,0.542044,0.654028,0.602293,0.707183


In [None]:
# add species data

import sys
import os

# Add the directory containing datapath_selector.py to the system path
library_path = r"C:\Users\esteb\escarabajos\libraries"
sys.path.append(library_path)

# Now you can import datapath_selector.py as a module
import datapath_selector
import spectraltools
from datapath_selector import get_paths
from collection_tools import *
from datetime import datetime
collections_list = get_collections_list()
collections_dict = get_collections_dict()

def get_species_for_code(code):
    """Return the result of ``species_lookup`` for ``code``, using the
    "CICIMAUCR1" entry of ``collections_dict`` and the full ``collections_list``."""
    reference_collection = collections_dict["CICIMAUCR1"]
    return reference_collection.species_lookup(code=code, collection_list=collections_list)


# Annotate every measured specimen with species, location and sex.
# All specimen codes present in the measurement table:
code_list = measurement_df_2["code"].tolist()
# Concatenating with an empty frame along axis=1 leaves the columns of
# measurement_df_2 and gives result_df its own object to write into.
result_df = pd.DataFrame([])
result_df = pd.concat([measurement_df_2, result_df], axis=1)

#print(f"{result_df=}")

for code in code_list:
    # presumably returns a DataFrame holding at least the columns listed in
    # columns_of_interest for this specimen — TODO confirm against collection_tools
    info_df = get_specimen_info(code)  # Fetch information for the given code
    columns_of_interest = ["code", "species", "location_code", "sex_code"]
    new_columns_df = info_df[columns_of_interest]

    # Copy the looked-up values into the row(s) of result_df with this code
    if result_df.empty:
        # Only reached when the measurement table itself is empty
        result_df = new_columns_df
    else:
        # Write each looked-up value into every row whose "code" matches.
        # `.values[0]` raises IndexError if info_df has no row for this code.
        for column in columns_of_interest:
            if column != "code":  # Avoid trying to overwrite the "code" column itself
                result_df.loc[result_df["code"] == code, column] = new_columns_df.loc[new_columns_df["code"] == code, column].values[0]


#measurement_df_2["species"] = measurement_df_2["code"].apply(get_species_for_code)
print(f"{result_df=}")

# information_df is a second name for the same object as result_df (no copy)
information_df = result_df

# Metrics that are averaged per species below (and reused by later cells).
# NOTE(review): "A5" is not in this list although A5 is measured and described
# elsewhere in the notebook — confirm whether the omission is intentional.
metrics_under_consideration = ["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4", "B5", "C1", "D1", "D2", "D3", "D4", "E1", "E2", "F1", "F2", "F3", "F4", "F5", "W2", "W3", "W4", "W5", "W6", "W7", "W9", "W10", "W13"]

# Per-species mean of every metric listed above
aggregated = result_df.groupby("species")[metrics_under_consideration].mean()

# Optionally, if you want to see the result:
#print(aggregated)


In [None]:
from reportlab.platypus import Image, Spacer, Paragraph
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
### Third test: Plots on demand
#!pip install reportlab
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER
import matplotlib.pyplot as plt
import numpy as np


In [None]:
import scipy.stats as stats
import pandas as pd

# Compare the two species metric by metric.
# From here on measurement_df_2 refers to the species-annotated table.
measurement_df_2 = result_df
kalinini_data = measurement_df_2[measurement_df_2["species"] == "kalinini"]
resplendens_data = measurement_df_2[measurement_df_2["species"] == "resplendens"]

# List of metrics to test
metrics = metrics_under_consideration

# Test outcome per metric: {metric: {"levene_test", "test_type",
# "t_stat" or "u_stat", "p_value", "interpretation"}}.
# (A module-level `global` statement is a no-op, so none is used here.)
t_test_results = {}

# Normality outcome per species.
# NOTE(review): these two keys are overwritten on every iteration, so after
# the loop they hold the boolean outcome for the LAST metric only.
normality_info_dict = {}

for metric in metrics:
    # Values of this metric for each species, without missing entries
    kalinini_values = kalinini_data[metric].dropna()
    resplendens_values = resplendens_data[metric].dropna()

    # Shapiro-Wilk: p-value > 0.05 is treated as "normally distributed"
    kalinini_normal = stats.shapiro(kalinini_values)[1] > 0.05
    resplendens_normal = stats.shapiro(resplendens_values)[1] > 0.05

    # Save info for the report
    normality_info_dict["kalinini"] = kalinini_normal
    normality_info_dict["resplendens"] = resplendens_normal

    if kalinini_normal and resplendens_normal:
        # Levene's test decides between Student's and Welch's t-test
        levene_test = stats.levene(kalinini_values, resplendens_values)
        print(levene_test)
        levene_p_value = levene_test.pvalue
        if levene_p_value > 0.05:
            # Equal variances: Student's t-test
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values)
            test_type = "Student's t-test"
        else:
            # Unequal variances: Welch's correction
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values, equal_var=False)
            test_type = "Welch's t-test"
        statistic_key, statistic = "t_stat", t_stat
    else:
        # Normality rejected for at least one species: Mann-Whitney U test.
        # Levene's test is not run on this path, so record None rather than
        # reading `levene_test`, which is undefined (first iteration) or left
        # over from a previous metric.
        levene_p_value = None
        u_stat, p_value = stats.mannwhitneyu(kalinini_values, resplendens_values)
        test_type = "Mann-Whitney U test"
        statistic_key, statistic = "u_stat", u_stat

    interpretation = "significant difference" if p_value < 0.05 else "no significant difference"

    t_test_results[metric] = {
        "levene_test": levene_p_value,
        "test_type": test_type,
        statistic_key: statistic,
        "p_value": p_value,
        "interpretation": interpretation,
    }



# Human-readable description of every metric, keyed by metric code.
# The report-building functions look entries up with `.get(...)` and print the
# text under each metric heading.
# The `global` statement below sits at module level, where it has no effect.
# NOTE(review): the D1 text refers to "the horizontal line used to measure D1"
# (self-referential) and the W4 text mentions "eyes" while A5 is described as
# a clipeus length — confirm the intended wording with the protocol.
global metric_description
metric_description = { "A1": "Vertical length of the head: measured from the center of the clipeus down to the middle of the back of the head.",
                      "A2": "Horizontal length between the left and right sutures",
                      "A3": "Horizontal length between the left and right eye’s canthus",
                      "A4": "Vertical ortogonal length of the clipeus measured from the front down to A2 line",
                      "A5": "Vertical ortogonal length of the clipeus measured from the front down to A3 line",
                      "B1": "Horizontal length between the pronotum’s frontal angles",
                      "B2": "Horizontal length between the pronotum’s middle angles",
                      "B3": "Horizontal length between the pronotum’s hind angles",
                      "B4": "Vertical length of the pronotum’s measured from the middle point of its front down to the middlepoint of its rear",
                      "B5": "Angle of its side measured between the tangent lines to its straightest sections in the front and back, as seen from the top ",
                      "C1": "Angle of its side measured between the tangent lines to its straightest sections in the front and back as seen by the side",
                      "D1": "Mesosternal process’ horizontal length measured from the secant point of the tangents of its sides with the horizontal line used to measure D1. ",
                      "D2": "Mesosternal process’ vertical length measured from the tip of the mesosternal process down to the line that joins the two lowest curves at the sides of the mesosternal process base",
                      "D3": "Horizontal width of the dark middle line measured from its two lower ends",
                      "D4": "Vertical length from the tip of the mesosternal process down to the lowest point of the black patch in the middle of the mesosternal process",
                      "E1": "Horizontal top width of the prosternal plate ",
                      "E2": "Horizontal bottom width of the prosternal plate ",
                      "F1": "Vertical length of the foremost ventral plate",
                      "F2": "Vertical length of the second foremost ventral plate",
                      "F3": "Vertical length of the third foremost ventral plate",
                      "F4": "Vertical length of the fourth foremost ventral plate ",
                      "F5": "Vertical length of the fifth foremost ventral plate",
                      # Derived ratios (W*), computed from the measurements above
                      "W2": "A1/A3 Measure of the vertical length of beetle's head relative to its canthuses' distance width",
                      "W3": "A4/A3 Measure of the vertical length of beetle's clipeum relative to its canthuses' distance width",
                      "W4": "A5/A3 Measure of the vertical length of beetle's eyes relative to its canthuses' distance width",
                      "W5": "Measure of the vertical length of the pronotum relative to its front width",
                      "W6": "Measure of the vertical length of the pronotum relative to its middle width",
                      "W7": "Measure of the vertical length of the pronotum relative to its back width",
                      "W9": "Measure of the vertical length of the mesosternal process relative to its back width. D2/D1",
                      "W10": "Measure of the vertical length of the mesosternal process relative to its middle width. D2/D3",
                      "W13": "Measure of how square the prosternal plate is. Front width back width ratio E1/E2",
                     }

# Initialised empty here; not filled in within this cell.
statistical_analysis_text = ""




In [None]:

#constants
# Date printed on the report's front page.
# The first cell binds `datetime` to the module, while a later cell rebinds it
# to the class (`from datetime import datetime`). Going through a private
# module alias gives the same value (today's local date) whichever of the two
# the name currently refers to.
import datetime as _datetime_module
current_date = _datetime_module.date.today()

#methods
def create_paragraph(text):
    """
    Return ``text`` as a ReportLab Paragraph in the "CenteredTitle" style:
    the sample stylesheet's "Title" style at font size 14, centred.
    """
    base_title_style = getSampleStyleSheet()["Title"]
    centered_title_style = ParagraphStyle(
        name='CenteredTitle',
        parent=base_title_style,
        fontSize=14,
        alignment=1,  # 0 for left, 1 for center, 2 for right
    )
    return Paragraph(text, centered_title_style)
    

In [None]:
def front_page():
    """
    Build the flowables of the report's title page.

    Returns a list with, in order: the report title, a 24-pt spacer, the
    author line, a 6-pt spacer, the institution line, three centred lines
    (year, university, ``current_date``) and a page break.
    """
    title_style = getSampleStyleSheet()["Title"]
    # Centred 14-pt variant of the title style for the author/institution lines
    centered_title_style = ParagraphStyle(
        name='CenteredTitle',
        parent=title_style,
        fontSize=14,
        alignment=1,  # 0 for left, 1 for center, 2 for right
    )

    return [
        Paragraph("Biometry report", title_style),
        Spacer(1, 24),
        Paragraph(
            "Dra. Marcela Hernández, Dr. Esteban Bermúdez Ureña, Angel Aguirre & Esteban Soto.",
            centered_title_style,
        ),
        Spacer(1, 6),
        Paragraph(
            "Centro de Investigación en Ciencia e Ingeniería de los Materiales",
            centered_title_style,
        ),
        create_paragraph("2025"),
        create_paragraph("University of Costa Rica"),
        create_paragraph(f"{current_date}"),
        PageBreak(),
    ]

In [None]:
def introduction_section(df):
    """
    Build the "Introduction" section of the report.

    ``df`` must have "species" and "code" columns; the number of non-null
    codes per species ("kalinini", "resplendens") is quoted in the text.

    Returns a list of flowables: heading, two body paragraphs (each item
    followed by a 12-pt spacer) and a closing page break.
    """
    styles = getSampleStyleSheet()
    body_style = styles["Normal"]

    # Specimen count per species
    kalinini_count = df.loc[df["species"] == "kalinini", "code"].count()
    resplendens_count = df.loc[df["species"] == "resplendens", "code"].count()

    motivation_text = f"""Zubov et al. (2019) describe a new species of Chrysina. 
            In its comparative analysis and remark it is stated that the new species is 
            very simmilar to C. resplendens and only few morphological differences can be
            noted. This work intends to perform a quantitative analysis of these differences
            using a sample of {kalinini_count} C. kalinini specimens and 
            {resplendens_count} C. resplendens specimens. 
            """
    scope_text = """ The measurements described in the article are specified more
            precisely and alternative metrics are analyzed.
            """

    return [
        Paragraph("Introduction", styles["Heading2"]),
        Spacer(1, 12),
        Paragraph(motivation_text, body_style),
        Spacer(1, 12),
        Paragraph(scope_text, body_style),
        Spacer(1, 12),
        PageBreak(),
    ]

introduction_section(measurement_df_2 )

In [None]:
def methodology(df):
    """
    Build the "Methodology" section of the report.

    ``df`` must have the columns "species", "code", "sex_code" and
    "location_code". The text quotes the locations per species and, for each
    of the species "kalinini" and "resplendens", how many specimens are male
    ("M"), female ("F") or of unknown sex (everything else).

    Returns a list of flowables: heading and body paragraphs (each followed
    by a 12-pt spacer) and a closing page break.
    """

    def count_specimens(species, sex=None):
        """Number of non-null codes for ``species``, optionally for one sex code."""
        selection = df["species"] == species
        if sex is not None:
            selection = selection & (df["sex_code"] == sex)
        return df[selection]["code"].count()

    styles = getSampleStyleSheet()
    body_style = styles["Normal"]

    # Locations
    # NOTE(review): this Series is interpolated into the text using its
    # default repr; consider formatting it explicitly.
    unique_locations_by_species = df.groupby("species")["location_code"].unique()

    # Sex distribution per species; "unknown" is whatever is neither M nor F.
    number_of_kalinini_m_specimens = count_specimens("kalinini", "M")
    number_of_kalinini_f_specimens = count_specimens("kalinini", "F")
    number_of_kalinini_u_specimens = (
        count_specimens("kalinini") - number_of_kalinini_m_specimens - number_of_kalinini_f_specimens
    )

    number_of_resplendens_m_specimens = count_specimens("resplendens", "M")
    number_of_resplendens_f_specimens = count_specimens("resplendens", "F")
    # Previously the kalinini "unknown" count was recomputed here and printed
    # for resplendens as well.
    number_of_resplendens_u_specimens = (
        count_specimens("resplendens") - number_of_resplendens_m_specimens - number_of_resplendens_f_specimens
    )

    return [
        Paragraph("Methodology", styles["Heading2"]), Spacer(1, 12),

        Paragraph(f""" Chrysina samples were retrieved from the following locations: {unique_locations_by_species}
            """, body_style), Spacer(1, 12),
        Paragraph(f"""
            Sex distribution is the following:
            - C. kalinini: {number_of_kalinini_m_specimens} males, {number_of_kalinini_f_specimens} females , {number_of_kalinini_u_specimens} unknown
            - C. resplendens: {number_of_resplendens_m_specimens} males, {number_of_resplendens_f_specimens} females, {number_of_resplendens_u_specimens} unknown
            """, body_style), Spacer(1, 12),
        Paragraph(""" Using an estereoscope its head, clipeum, mesosternal process, prosternal process and ventral plates were measured. 
            """, body_style), Spacer(1, 12),
        Paragraph("""A OCR software was used to retrieve the measurements and to add contextual information about collection location, sex,
            genus and species.
            """, body_style), Spacer(1, 12),
        Paragraph(""" Zubov's et al morphological differences were calculated using the metrics taken with the estereoscope.
            """, body_style), Spacer(1, 12),
        Paragraph(""" Afterwards, measurements were separated by species and a normality test was performed over the data. If the data was normal 
            and the variances between the populations are equal, a Student's T test was performed, if not a non-parametric Mann-Whitney U test was applied instead. If the p value was smaller than 0.05,
            a statistical significant difference between the two species is determined.
            """, body_style), Spacer(1, 12),
        PageBreak(),
    ]



In [None]:
def generate_violin_plot(df, column, group_by):
    """
    Draw a violin plot (with quartile lines) of ``df[column]`` split by
    ``df[group_by]`` and return the figure as a rewound PNG BytesIO buffer.

    The y-axis is labelled as a length in micrometres unless the column name
    starts with "W".
    """
    plt.figure(figsize=(6, 4))
    group_values, metric_values = df[group_by], df[column]
    sns.violinplot(x=group_values, y=metric_values, inner="quartile")

    column_is_w_metric = column.startswith("W")
    if not column_is_w_metric:
        plt.ylabel("length [$\\mu$m]")
    plt.title(f"Violin Plot for {column}")

    png_buffer = BytesIO()
    plt.savefig(png_buffer, format="png")
    plt.close()
    png_buffer.seek(0)
    return png_buffer

def get_metric_image(image_path):
    """
    Return a 200x200 ReportLab Image for ``image_path``, JPEG-compressed when possible.

    A compressed copy named ``<file name>_c.jpg`` is written to the current
    working directory with JPEG quality 70 and used for the returned Image.
    Images with an alpha channel or a palette cannot be encoded as JPEG
    directly, so they are first flattened onto a white background. If the
    compressed copy cannot be written, the original file is used instead.

    Raises:
        OSError: If ``image_path`` cannot be opened as an image.
    """
    basename = Path(image_path).name
    compressed_image_path = f"{basename}_c.jpg"

    # The context manager closes the source file handle once we are done.
    with PILImage.open(image_path) as original_image:
        try:
            if original_image.mode in ("RGBA", "LA", "P", "PA"):
                # JPEG has no transparency or palette: composite over white.
                rgba_image = original_image.convert("RGBA")
                jpeg_ready = PILImage.new("RGB", rgba_image.size, (255, 255, 255))
                jpeg_ready.paste(rgba_image, mask=rgba_image.split()[-1])
            else:
                jpeg_ready = original_image
            jpeg_ready.save(compressed_image_path, "JPEG", quality=70)
        except (OSError, ValueError):
            # Best effort: fall back to the uncompressed original. (The old
            # fallback also wrote PNG data under the .jpg name, which nothing
            # read; that write is gone.)
            return Image(image_path, width=200, height=200)

    return Image(compressed_image_path, width=200, height=200)
        
def dataset_description(df, group_by=None):
    """
    Build the "Dataset Description" section: a normality summary followed by
    one captioned plot per numeric (float64/int64) column of ``df``.

    Args:
        df (pd.DataFrame): Data to describe.
        group_by (str, optional): Column used to group each plot. Defaults to
            "species" when not given.

    Returns:
        list: ReportLab flowables, ending with a page break.

    Side effects:
        Advances the module-level ``figure_counter`` by one per plot.
    """
    # Without this declaration `figure_counter += 1` makes the name local and
    # reading it for the caption raises UnboundLocalError.
    global figure_counter

    styles = getSampleStyleSheet()
    plot_group = group_by if group_by is not None else "species"

    # float() keeps the ".2f" format valid whether the stored values are
    # floats or (numpy) booleans.
    # NOTE(review): the statistics cell currently stores the boolean outcome
    # of the last tested metric here, not p-values — the sentence below will
    # then read 1.00 / 0.00.
    kalinini_normality = float(normality_info_dict["kalinini"])
    resplendens_normality = float(normality_info_dict["resplendens"])  # was read from the "kalinini" key

    description = []
    description.append(Paragraph("Dataset Description", styles["Heading2"]))
    description.append(Spacer(1, 12))

    description.append(Paragraph("Normality test", styles["Heading3"]))
    description.append(Spacer(1, 12))

    description.append(Paragraph(f""" Shapiro-Wilk p-values for C. kalinini population are {kalinini_normality:.2f},
                                    and for C. resplendens population are {resplendens_normality:.2f}""", styles["Normal"]))
    description.append(Spacer(1, 12))
    description.append(Paragraph("Violin plots for each metric:", styles["Heading3"]))
    description.append(Spacer(1, 12))

    # Same caption style for every figure, so build it once.
    caption_style = ParagraphStyle(name="CenteredCaption", parent=styles["BodyText"], alignment=1)

    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        description.append(Paragraph(f"Metric {column}", styles["Heading3"]))
        description.append(Paragraph(f"{metric_description.get(column, 'No description available.')}", styles["Normal"]))
        description.append(Spacer(1, 12))

        # NOTE(review): the figure is a boxplot with strip plot although the
        # heading and caption still say "Violin Plot".
        plot_buffer = generate_boxplot_with_stripplot(df, column, group_by=plot_group)
        description.append(Image(plot_buffer, width=400*0.7, height=300*0.7))

        caption_text = f"Figure {figure_counter}: Violin Plot for {column}"
        figure_counter += 1
        description.append(Paragraph(caption_text, caption_style))
        description.append(Spacer(1, 12))

    description.append(PageBreak())
    return description

def statistical_analysis(df, figure_counter=figure_counter):
    """Build the "Statistical Analysis" section: one sub-section per tested metric.

    For every entry of the module-level ``t_test_results`` the section holds
    the metric heading and description, a box-and-strip plot grouped by
    species, the test results, and, for metrics that do not start with "W"
    and have an entry in ``metric_image``, the protocol image of the metric.

    Parameters:
        df (pd.DataFrame): Measurements to plot; must contain a ``species``
            column and one column per metric in ``t_test_results``.
        figure_counter (int): Number used for the first caption. The default is
            the value the module-level counter had when this function was
            defined. The count is advanced locally only; the caller's counter
            is not updated.

    Returns:
        list: ReportLab flowables for the section.
    """
    styles = getSampleStyleSheet()
    # alignment=1 is TA_CENTER; one style object serves every caption.
    caption_style = ParagraphStyle(name="CenteredCaption", parent=styles["BodyText"], alignment=1)
    stat_analysis = [Paragraph("Statistical Analysis", styles["Heading2"]), Spacer(1, 12)]

    for metric, result in t_test_results.items():
        stat_analysis.append(Paragraph(f"Metric: {metric}", styles["Heading3"]))
        stat_analysis.append(Paragraph(f"{metric_description.get(metric, 'No description available.')}", styles["Normal"]))

        # Box plot with every observation overlaid as a point.
        plot_buffer = generate_boxplot_with_stripplot(df, metric, group_by="species")
        stat_analysis.append(Image(plot_buffer, width=400, height=300))

        # The caption names the plot type that is actually drawn.
        caption_text = f"Figure {figure_counter}: Box Plot for {metric}"
        figure_counter += 1
        stat_analysis.append(Paragraph(caption_text, caption_style))
        stat_analysis.append(Spacer(1, 12))

        # Test results; the statistic is stored under 't_stat' or 'u_stat'
        # depending on which test was run.
        stat_analysis.append(Paragraph(f"Test Type: {result['test_type']}", styles["Normal"]))
        stat_analysis.append(Paragraph(f"Test Statistic: {result.get('t_stat', result.get('u_stat'))}", styles["Normal"]))
        stat_analysis.append(Paragraph(f"P-value: {result['p_value']}", styles["Normal"]))
        stat_analysis.append(Paragraph(f"Interpretation: {result['interpretation']}", styles["Normal"]))
        stat_analysis.append(Spacer(1, 12))
        # Break after the results so the protocol image, when there is one,
        # starts on a new page.
        stat_analysis.append(PageBreak())

        # Include image only if metric does not start with 'W'
        if not metric.startswith("W") and metric in metric_image:
            image_path = Path(protocol_image_location) / metric_image[metric]
            stat_analysis.append(get_metric_image(image_path))
            caption_text = f"Figure {figure_counter}: Metric {metric}"
            figure_counter += 1
            stat_analysis.append(Paragraph(caption_text, caption_style))

        # NOTE(review): this spacer and page break are appended for every
        # metric, including those without an image, which then carry two
        # PageBreak flowables in a row. Layout kept unchanged — confirm intent.
        stat_analysis.append(Spacer(1, 12))
        stat_analysis.append(PageBreak())

    return stat_analysis

In [None]:
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
import pandas as pd


def dataframe_to_pdf_table(df):
    """Render selected metric columns of ``df`` as two ReportLab tables.

    The ten metric columns are split into two tables of five so that each one
    fits the page width. Column widths are derived from the longest text in
    each column, header included.

    Parameters:
        df (pd.DataFrame): Frame containing the columns A1, W2, W3, W4, W5,
            W6, W7, W9, W10 and W13. A missing column raises ``KeyError``.
            The frame's index is not written to the tables.

    Returns:
        list: Two ``Table`` flowables with Jupyter-like styling, in column order.
    """
    column_groups = (
        ["A1", "W2", "W3", "W4", "W5"],
        ["W6", "W7", "W9", "W10", "W13"],
    )
    # Width allotted per character, in points (same factor as before).
    points_per_char = 5

    # Table style without vertical black lines.
    style_commands = [
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row background
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),  # Header row text color
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # Center align all cells
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),  # Header font
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),  # Body font
        ('BOTTOMPADDING', (0, 0), (-1, 0), 6),  # Padding for header
        ('BACKGROUND', (0, 1), (-1, -1), colors.whitesmoke),  # Light gray background for body
        ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),  # Line below header
        ('LINEBELOW', (0, -1), (-1, -1), 1, colors.black),  # Line below last row
        ('BOX', (0, 0), (-1, -1), 1, colors.black),  # Outer border
    ]

    # Select both groups up front so a missing column fails before any table is built.
    parts = [df[columns] for columns in column_groups]

    tables = []
    for part in parts:
        # Round numeric columns to 3 decimal places.
        # NOTE(review): DataFrame.round leaves non-numeric (object) columns
        # untouched, so values in mixed string/number columns are not shortened.
        part = part.round(3)

        # Header row followed by the body rows.
        data = [part.columns.to_list()] + part.values.tolist()

        # Width follows the longest text in the column. The header is counted
        # too, so a long column name is not squeezed and a frame without rows
        # no longer makes max() fail on an empty sequence.
        col_widths = [
            max([len(str(col))] + [len(str(val)) for val in part[col]]) * points_per_char
            for col in part.columns
        ]

        table = Table(data, colWidths=col_widths)
        table.setStyle(TableStyle(style_commands))
        tables.append(table)

    return tables


In [None]:
def conclusion(df):
    """Build the closing section that compares the results with Zubov et al.

    Each claim is rendered as a heading, the quoted claim in italics and the
    finding of this study in normal text. The claims are followed by a page
    break, the "Comparative table" heading, the tables produced by
    ``dataframe_to_pdf_table(df)`` and a final page break.

    Parameters:
        df (pd.DataFrame): Test results, passed unchanged to
            ``dataframe_to_pdf_table``.

    Returns:
        list: ReportLab flowables for the section.

    Notes:
        The figure numbers quoted in the findings are hard-coded text; they are
        not derived from the figure counter used elsewhere in the report.
        Paragraph strips the extra whitespace that the indented triple-quoted
        strings carry, so their source layout does not affect the output.
    """
    styles = getSampleStyleSheet()

    # (heading, quoted claim, finding). The quoted text had extraction
    # artefacts ("t han", "short er", "resplendens ,") that are repaired here.
    claim_texts = [
        (
            "Claim 1",
            """
            The new species is very close to C. resplendens and has only few morphological
            differences from it. Clypeus of C. kalinini sp.n. is slightly longer than in C. resplendens
            """,
            """ Head's vertical length- canthuses-width ratio, W2,
            does not show a statistically significant difference. Whereas,
            A2, the distance between sutures; and A3, distances between canthuses, are both
            statistically significant. These can be used as alternatives to W2.
            """,
        ),
        (
            "Claim 2",
            """
            Pronotum in C. kalinini sp.n. is slightly longer in relation to its width than in C. resplendens, its sides have smaller
            angles, whereas in C. resplendens the sides of pronotum are rounded.
            """,
            """
            None of the pronotum's vertical length - horizontal width ratios (Metrics W5,W6,W7) showed a significant difference between species.
            """,
        ),
        (
            "Claim 3",
            """
            Mesosternal process shiny, shorter and wider than in C. resplendens, where the process is long and narrow and its
            apical half is greenish golden (Fig. 6 8).
            """,
            """
            There is a significant difference between the absolute vertical length values
            between the two species (Metric D2, figure 23).
            There are no significant differences in their widths (Metrics D1 and D3).
            There is a significant difference between species on its vertical distance between
            the tip of its mesosternal process and the lower point of the dark curve in the
            middle of it.
            """,
        ),
        (
            "Claim 4",
            """
            Prosternal plate of
            C. kalinini sp.n. is rounded triangular and flat, in C. resplendens it is square and has a clear dent
            """,
            """
            Albeit there is a difference between the absolute values of the foremost width of the
            prosternal process (Metric E1, figure 29), there is no significant difference in how square
            the prosternal plate is for each one of the species when the ratio of lengths is accounted for
            (Metric W13, ratio between E1 and E2. Figure 51)
            """,
        ),
    ]

    content = [Paragraph("Comparison with Zubov et al. claims", styles["Heading3"]), Spacer(1, 12)]

    for heading, quoted_claim, finding in claim_texts:
        # A new Spacer is created for every gap so no flowable instance is
        # placed in the story twice.
        content += [
            Paragraph(heading, styles["Heading4"]), Spacer(1, 12),
            Paragraph(quoted_claim, styles["Italic"]), Spacer(1, 12),
            Paragraph(finding, styles["Normal"]), Spacer(1, 12),
        ]
    content.append(PageBreak())

    # Summary table of tests
    content += [Paragraph("Comparative table", styles["Heading3"]), Spacer(1, 12)]
    content += dataframe_to_pdf_table(df)
    content.append(PageBreak())

    return content
    
    
   

In [None]:
def create_bibliography():
    """Return the flowables of the "References" section.

    The list holds the section heading, a spacer, the single cited work and a
    trailing spacer.
    """
    styles = getSampleStyleSheet()

    zubov_2019 = """1. Zubov, A.S.; Ivshin, N.V.; Titarenko, A. Y.; Andrianov, B.V.  (2019). Description of a new species of
                    Chrysina Kirby, 1828 (Coleoptera:Scarabaeidae: Rutelinae) from resplendens group, based on morphological characters
                    and mtDNA COX I molecular marker. Acta Biologica Sibirica, 5(1), 71–76."""

    heading = Paragraph("References", styles["Heading1"])
    first_entry = Paragraph(zubov_2019, styles["Normal"])
    # Template for further entries:
    # Paragraph("2. Author B, et al. (Year). Title of the paper. Journal Name, Volume(Issue), Page Numbers.", styles["Normal"])

    return [heading, Spacer(1, 12), first_entry, Spacer(1, 24)]

In [None]:
# Tabulate the test results: one column per metric.
# NOTE(review): presumably one row per field of each metric's result dict — confirm.
t_test_results_df = pd.DataFrame(t_test_results)

print(t_test_results_df.columns)
# Preview the same ten metric columns that dataframe_to_pdf_table renders.
t_test_results_df.loc[:, ["A1","W2","W3","W4","W5","W6","W7","W9","W10","W13"]]

In [None]:

# Directory the PDF is written to; an empty string means the current working directory.
report_location = ""


# Create PDF report
def create_report():
    """Assemble every report section and write the PDF to ``report_location``.

    The measurements are restricted to the two compared species (kalinini and
    resplendens) before the statistical section is built. The file is named
    ``Biometry report <date>.pdf``.

    Reads the module-level ``measurement_df_2``, ``information_df``,
    ``t_test_results_df``, ``figure_counter`` and ``report_location``.
    Returns nothing; the output path is printed.
    """
    # Filter to only kalinini and resplendens.
    is_compared_species = measurement_df_2["species"].isin(["kalinini", "resplendens"])
    measurement_df = measurement_df_2[is_compared_species]

    elements = []
    elements += front_page()
    elements += introduction_section(information_df)
    elements += methodology(information_df)
    # Disabled sections, kept for reference:
    # elements += dataset_description(measurement_df_2, group_by="species")
    elements += statistical_analysis(df = measurement_df, figure_counter = figure_counter)
    # elements += results_and_discussion(measurement_df_2)
    elements += conclusion(df = t_test_results_df)
    elements += create_bibliography()

    # TODO: the table of contents is not generated yet.

    # Get current date. ``current_date`` is not assigned in this cell, so the
    # function used to depend on a variable left in the kernel by another
    # cell. Reuse it when it exists and fall back to today's date otherwise,
    # so a fresh "Restart & Run All" does not stop with a NameError.
    try:
        date_label = current_date
    except NameError:
        date_label = datetime.date.today().isoformat()

    location = os.path.join(report_location, f"Biometry report {date_label}.pdf" )
    doc = SimpleDocTemplate(location, pagesize=letter)
    doc.build(elements)
    print(f"The report was saved at {location} ")
create_report()