In [1]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from tqdm import tqdm
from pathlib import Path
from io import BytesIO
from PIL import Image as PILImage  # Correct import for PIL Image

# ReportLab imports
#from reportlab.lib.pagesizes import letter
#from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
#from reportlab.platypus import Paragraph, Spacer, Image, PageBreak, SimpleDocTemplate
import datetime

# Counter
global figure_counter 
figure_counter = 1

# Methods
def createTextObject(text, style, centered=False):
    """
    Creates a Paragraph with optional centered alignment.
    """
    if centered:
        style = ParagraphStyle(name="Centered", parent=style, alignment=1)  # 1 = TA_CENTER
    return Paragraph(text, style)

def generate_violin_plot(df, column, group_by=None):
    """
    Generates a violin plot for a given column of a DataFrame.
    """
    plt.figure(figsize=(6, 4))
    if group_by:
        sns.violinplot(data=df, x=group_by, y=column)
    else:
        sns.violinplot(data=df[column])
    plt.tight_layout()
    
    buffer = BytesIO()
    plt.savefig(buffer, format="PNG")
    plt.close()
    buffer.seek(0)
    return buffer

from io import BytesIO
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def generate_boxplot_with_stripplot(df, column, group_by="species", output_filename = None):
    """
    Generates a boxplot with stripplot (points for each data point) where outliers
    (those falling outside 1.5 times the IQR) are shown as red points and the rest as black.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        column (str): The column to plot.
        group_by (str, optional): The column to group by. If None, no grouping is applied.

    Returns:
        BytesIO: A buffer containing the plot image in PNG format.
    """
    plt.figure(figsize=(8, 6))

    #check if ratio

    is_ratio = "÷" in column
    
    
    if group_by:
        # Draw the boxplot without fliers
        sns.boxplot(data=df, x=group_by, y=column, hue=group_by, palette="Set2", legend=False, showfliers=False)
        
        # For each group, compute the IQR-based bounds and plot points accordingly
        groups = df[group_by].unique()
        for grp in groups:
            # Select data for this group
            grp_data = df[df[group_by] == grp][column]
            Q1 = grp_data.quantile(0.25)
            Q3 = grp_data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Classify the points as non-outliers and outliers
            non_outliers = grp_data[(grp_data >= lower_bound) & (grp_data <= upper_bound)]
            outliers = grp_data[(grp_data < lower_bound) | (grp_data > upper_bound)]
            
            # Plot non-outliers (black)
            sns.stripplot(x=[grp] * len(non_outliers), y=non_outliers, color="black", alpha=0.5, jitter=True)
            # Plot outliers (red)
            sns.stripplot(x=[grp] * len(outliers), y=outliers, color="red", alpha=0.5, jitter=True)
        
        plt.xlabel("")  # No x-axis label needed when not grouped
        plt.title(f"Boxplot with Stripplot of {column} grouped by {group_by}")
    else:
        # Draw the boxplot without fliers for the whole column
        sns.boxplot(data=df, y=column, color="lightblue", showfliers=False)
        
        # Compute bounds for the overall data
        data = df[column]
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Classify the points
        non_outliers = data[(data >= lower_bound) & (data <= upper_bound)]
        outliers = data[(data < lower_bound) | (data > upper_bound)]
        
        # When no grouping, use a constant x value (e.g., 0) for all points
        sns.stripplot(x=[0] * len(non_outliers), y=non_outliers, color="black", alpha=0.5, jitter=True)
        sns.stripplot(x=[0] * len(outliers), y=outliers, color="red", alpha=0.5, jitter=True)
        plt.title(f"Boxplot with Stripplot of {column}")
        plt.xlabel("")  # No x-axis label needed when not grouped

    
    if not is_ratio:
        plt.ylabel(f"{column} [$\mu$m]")
    else:
        plt.ylabel(column)
    plt.tight_layout()
    
    # Save the plot into a BytesIO buffer in PNG format
    buffer = BytesIO()
    print(f"Saving as {output_filename}")
    plt.savefig(output_filename, format="png")
    plt.savefig(buffer, format="PNG", bbox_inches="tight")
    plt.close()
    buffer.seek(0)
    return buffer



# Constants
global metric_image
metric_image = {
    "A1": "Head_A1.png",
    "A2": "Head_A2.png",
    "A3": "Head_A3.png",
    "A4": "Head_A4.png",
    "A5": "Head_A5.png",
    "B1": "Pronotum_B1.png",
    "B2": "Pronotum_B2.png",
    "B3": "Pronotum_B3.png",
    "B4": "Pronotum_B4.png",
    "B5": "Pronotum_B5.png",
    "C1": "Lateral_C1.png",
    "D1": "Mesosternal_process_D1.png",
    "D2": "Mesosternal_process_D2.png",
    "D3": "Mesosternal_process_D3.png",
    "D4": "Mesosternal_process_D4.png",
    "E1": "Prosternal_process_E1.png",
    "E2": "Prosternal_process_E2.png",
    "F1": "Ventral.png",
    "F2": "Ventral.png",
    "F3": "Ventral.png",
    "F4": "Ventral.png",
    "F5": "Ventral.png",
    "A1÷A3": "Head.png",
    "A4÷A3": "Head.png",
    "A5÷A3": "Head.png",
    "B4÷B1": "Pronotum.png",
    "B4÷B2": "Pronotum.png",
    "B4÷B3": "Pronotum.png",
    "D2÷D1": "Mesosternal_process_D4.png",
    "D2÷D3": "Mesosternal_process_D4.png",
    "E1÷E2": "Prosternal_process.png",
}

                     
# Location of images
global protocol_image_location
protocol_image_location = Path(r"C:\Users\esteb\escarabajos\biometry\report_output\images\protocol")

# --- Saving and Loading the Dictionary ---

def save_dictionary_to_json(dictionary, filepath):
    """
    Saves the metric_image dictionary to a JSON file.
    
    Args:
        dictionary (dict): The dictionary to save.
        filepath (str or Path): The path to the file where the dictionary will be saved.
    """
    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(dictionary, file, indent=4)
    print(f"Dictionary saved to {filepath}")

def load_json_to_dictionary(filepath):
    """
    Loads the metric_image dictionary from a JSON file.
    
    Args:
        filepath (str or Path): The path to the JSON file.
    
    Returns:
        dict: The loaded dictionary.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        loaded_dict = json.load(file)
    print(f"Dictionary loaded from {filepath}")
    return loaded_dict

# Define the path for the JSON file
metric_image_file = Path("metric_image.json")

# Save the metric_image dictionary
#save_dictionary_to_json(metric_image, metric_image_file)

# Later, load the dictionary from the file
metric_image_loaded = load_json_to_dictionary(metric_image_file)

# Optional: Verify that both dictionaries are the same
#assert metric_image == metric_image_loaded, "The loaded dictionary does not match the original!"


  plt.ylabel(f"{column} [$\mu$m]")


Dictionary loaded from metric_image.json


In [2]:
relative_metrics = []
absolute_metrics = ["A1", "A2", "A3", "A4", "A5", "B1", "B2", "B3", "B4", "B5", "C1", "D1", "D2", "D3", "D4", "E1", "E2", "F1", "F2", "F3", "F4", "F5"]
required_columns = absolute_metrics + ["code"]

def feature_engineering(measurement_df):
    # Ensure the required columns are present in the DataFrame
    # Check if all required columns are available in the DataFrame
    for column in required_columns:
        if column not in measurement_df.columns:
            print(f"Warning: Missing column {column} in the DataFrame.")
    
    # --- Statistics: Descriptive statistics for the measurements ---
    print("Summary Statistics:")
    print(measurement_df.describe())  # Basic statistics (mean, std, min, 25%, 50%, 75%, max)
    
    # --- Absolute Metrics ---
    # Example: Metric for A1 (could be specific characteristics or formulae for these metrics)
    #measurement_df["W1"] = measurement_df["A1"]  # Example, assuming A1 is a numeric column

    # --- Relative Metrics ---
   
    measurement_df["A1÷A3"] = measurement_df["A1"] / measurement_df["A3"]
    measurement_df["A4÷A3"] = measurement_df["A4"] / measurement_df["A3"]
    measurement_df["A5÷A3"] = measurement_df["A5"] / measurement_df["A3"]

    # --- Pronoto: Kalinini has a longer pronoto compared to its width ---
    # Example: Calculating relative metric B4/B1, B4/B2, B4/B3
    measurement_df["B4÷B1"] = measurement_df["B4"] / measurement_df["B1"]
    measurement_df["B4÷B2"] = measurement_df["B4"] / measurement_df["B2"]
    measurement_df["B4÷B3"] = measurement_df["B4"] / measurement_df["B3"]

    # --- Mesosternal Process: Example of a metric based on "más brillante" (more shiny) ---
    # Let's assume we have a measure for brightness or some characteristic that corresponds to this.
    # Using D2 as an example for being "more short" (assumed metric), you could add a new metric:
    
    # --- More Width (D1/D2, D3/D2) ---
    measurement_df["D2÷D1"] = measurement_df["D2"] / measurement_df["D1"]
    measurement_df["D3÷D1"] = measurement_df["D3"] / measurement_df["D1"]
    measurement_df["D4÷D2"] = measurement_df["D4"] / measurement_df["D2"]
    
    # --- Relative Metric: E1/E2 ---
    measurement_df["E1÷E2"] = measurement_df["E1"] / measurement_df["E2"]

    relative_metrics = ["A1÷A3", "A4÷A3", "A5÷A3", "B4÷B1", "B4÷B2", "B4÷B3", "D2÷D1", "D3÷D1", "D4÷D2", "E1÷E2"]
    # --- Return the updated DataFrame ---
    return measurement_df, relative_metrics

# Read the CSV file into a DataFrame
#date = datetime.date.today()
#file_path = f'summary just png files {date}.csv'  # Path to your saved file
file_path = f"summary just png files 2025-02-16.csv"
measurement_df = pd.read_csv(file_path, sep='\t', decimal='.', header=0)


# Print the first few rows to verify it worked
#print(measurement_df.head())

# Example usage:
# Assuming `measurement_df` contains the extracted measurements from OCR.
# You can now apply the feature engineering function to the DataFrame.
measurement_df_2, relative_metrics = feature_engineering(measurement_df)
print(relative_metrics)

metrics_df = pd.DataFrame([])
metrics_df["code"] = measurement_df_2["code"]
columns = [col for col in measurement_df_2.columns if col != "code"]
for col in columns:
    metrics_df[col] = measurement_df_2[col]
# Print the final DataFrame to check the new features
#print(measurement_df_2)

#save the df into a file
filename = "metrics.csv"

# Save the DataFrame to a CSV file
metrics_df.to_csv(filename, index=False, sep = "\t", decimal = "." )
print(f"New metrics saved to {filename}")

Summary Statistics:
                A1           A2           A3           A4           A5  \
count    37.000000    37.000000    37.000000    37.000000    37.000000   
mean   4109.505135  3697.212703  4712.424595  1812.224324  1779.767838   
std     250.101576   177.305029   181.444406   255.782814   294.472808   
min    3492.880000  3451.580000  4300.610000  1379.670000  1392.690000   
25%    3977.820000  3538.610000  4569.680000  1599.460000  1517.120000   
50%    4117.200000  3668.760000  4728.880000  1885.550000  1696.300000   
75%    4282.460000  3814.100000  4858.170000  2039.240000  2028.420000   
max    4594.630000  4090.030000  5058.530000  2229.410000  2363.890000   

                B1           B2           B3           B4          B5  ...  \
count    42.000000    42.000000    42.000000    42.000000   42.000000  ...   
mean   5553.055952  8276.477143  8737.027381  5405.136429  150.545000  ...   
std     248.590956   508.676618   537.294658   406.046257    3.647167  ...   
m

In [3]:
import pandas as pd

def detect_outliers(df):
    """
    Scans the DataFrame for outliers in all numeric columns (excluding the 'code' column).
    Returns a DataFrame with rows that have outliers, including:
      - the value of the 'code' column,
      - the column name where the outlier occurred,
      - the outlier value,
      - the median, lower whisker, and upper whisker for that column.
    """
    outlier_records = []
    
    # Identify numeric columns.
    # Exclude the 'code' column (assuming it is not to be analyzed as numeric data).
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    if 'code' in numeric_cols:
        numeric_cols.remove('code')
    
    # Process each numeric column
    for col in numeric_cols:
        # Compute statistics for the column.
        median = df[col].median()
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_whisker = Q1 - 1.5 * IQR
        upper_whisker = Q3 + 1.5 * IQR
        
        # Identify outlier rows for this column.
        # You can do this with a vectorized boolean condition:
        is_outlier = (df[col] < lower_whisker) | (df[col] > upper_whisker)
        
        # For every row that is an outlier, record the required details.
        for idx, row in df[is_outlier].iterrows():
            outlier_records.append({
                'code': row['code'],       # the value from the "code" column
                'column': col,             # the column where the outlier was found
                'outlier_value': row[col],
                'median': median,
                'lower_whisker': lower_whisker,
                'upper_whisker': upper_whisker
            })
    
    # Convert the list of records into a DataFrame to return.
    return pd.DataFrame(outlier_records)

detect_outliers(metrics_df)

Unnamed: 0,code,column,outlier_value,median,lower_whisker,upper_whisker
0,CICIMAUCR0232,A1,3492.88,4117.2,3520.86,4739.42
1,CICIMAUCR0252,C1,164.38,151.14,140.22,162.78
2,CICIMAUCR0233,D2,884.71,671.405,456.335,882.875
3,CICIMAUCR0215,D3,1554.09,2282.12,1619.62625,2881.93625
4,CICIMAUCR0242,D3,1495.3,2282.12,1619.62625,2881.93625
5,CICIMAUCR0261,F5,2485.28,1639.36,929.075,2346.715
6,CICIMAUCR0232,W2,0.729411,0.880972,0.746008,1.000297
7,CICIMAUCR0204,W4,0.00303,0.376002,0.167276,0.586846
8,CICIMAUCR0248,W6,0.542044,0.654028,0.602293,0.707183
9,CICIMAUCR0248,W7,0.512999,0.623922,0.566053,0.674038


In [4]:
# add species data

import sys
import os

# Add the directory containing datapath_selector.py to the system path
library_path = r"C:\Users\esteb\escarabajos\libraries"
sys.path.append(library_path)

# Now you can import datapath_selector.py as a module
import datapath_selector
import spectraltools
from datapath_selector import get_paths
from collection_tools import *
from datetime import datetime
collections_list = get_collections_list()
collections_dict = get_collections_dict()

# Define a function to apply species_lookup to each code
def get_species_for_code(code):
    # Use the species_lookup method from the relevant collection in collections_dict
    return collections_dict["CICIMAUCR1"].species_lookup(code=code, collection_list=collections_list)


# Apply the function to the 'code' column of your DataFrame
#convert codes into list
code_list = measurement_df_2["code"].tolist()
result_df = pd.DataFrame([])
result_df = pd.concat([measurement_df_2, result_df], axis=1)

#print(f"{result_df=}")

for code in code_list:
    info_df = get_specimen_info(code)  # Fetch information for the given code
    columns_of_interest = ["code", "species", "location_code", "sex_code"]
    new_columns_df = info_df[columns_of_interest]

    # Merge the new columns into result_df by "code"
    if result_df.empty:
        # If result_df is empty, initialize it with the first new_columns_df
        result_df = new_columns_df
    else:
        # Update or add information for the specific "code"
        for column in columns_of_interest:
            if column != "code":  # Avoid trying to overwrite the "code" column itself
                result_df.loc[result_df["code"] == code, column] = new_columns_df.loc[new_columns_df["code"] == code, column].values[0]

    
#measurement_df_2["species"] = measurement_df_2["code"].apply(get_species_for_code)
print(f"{result_df=}")

#define information_df
information_df = result_df

# First, group the entire dataframe by 'species', and then calculate the mean of 'A1'
metrics_under_consideration = absolute_metrics + relative_metrics
    
aggregated_mean = result_df.groupby("species")[metrics_under_consideration].mean()
aggregated_std = result_df.groupby("species")[metrics_under_consideration].std()

#aggregated_mean.columns = aggregated_mean.columns.str.replace('_', '-')
#aggregated_std.columns = aggregated_std.columns.str.replace('_', '-')
# Optionally, if you want to see the result:
#print(aggregated_mean)


  result_df.loc[result_df["code"] == code, column] = new_columns_df.loc[new_columns_df["code"] == code, column].values[0]


result_df=             code       A1       A2       A3       A4       A5       B1  \
0   CICIMAUCR0001  4120.58  3511.41  4674.61  1430.02  1958.09  5525.11   
1   CICIMAUCR0002  4117.20  3529.93  4475.22  1462.50  1922.14  5352.33   
2   CICIMAUCR0003  3971.18  3584.00  4563.49  1402.85  1924.31  5360.70   
3   CICIMAUCR0004  3627.85  3538.61  4585.22  1475.45  2028.42  5328.44   
4   CICIMAUCR0006      NaN      NaN      NaN      NaN      NaN  5326.82   
5   CICIMAUCR0008      NaN      NaN      NaN      NaN      NaN  5471.34   
6   CICIMAUCR0009      NaN      NaN      NaN      NaN      NaN  5989.42   
7   CICIMAUCR0097  4006.44  3685.78  4546.28  2076.73  1707.97  5380.23   
8   CICIMAUCR0105  4172.69  3806.58  4804.80  2101.69  1587.15  5606.02   
9   CICIMAUCR0113      NaN      NaN      NaN      NaN      NaN  5623.08   
10  CICIMAUCR0116      NaN      NaN      NaN      NaN      NaN  5434.79   
11  CICIMAUCR0201  4292.35  3605.92  4569.68  2042.05  1535.29  5393.30   
12  CICIMAUCR02

In [5]:
from reportlab.platypus import Image, Spacer, Paragraph
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
### Third test: Plots on demand
#!pip install reportlab
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER
import matplotlib.pyplot as plt
import numpy as np


In [6]:
relative_metrics

['A1÷A3',
 'A4÷A3',
 'A5÷A3',
 'B4÷B1',
 'B4÷B2',
 'B4÷B3',
 'D2÷D1',
 'D3÷D1',
 'D4÷D2',
 'E1÷E2']

In [7]:
import scipy.stats as stats
import pandas as pd

# Assuming 'measurement_df_2' is your dataframe
# Split the data by species
measurement_df_2 = result_df
kalinini_data = measurement_df_2[measurement_df_2["species"] == "kalinini"]
resplendens_data = measurement_df_2[measurement_df_2["species"] == "resplendens"]

# List of metrics to test (assuming your dataframe contains these columns)
metrics = metrics_under_consideration

# Dictionary to store test results
global t_test_results
t_test_results = {}

#Normality
normality_info_dict = {
}

#statistical info dict
statistical_info_dict = {
}
for metric in metrics:
    # Extract the data for each species' metric
    kalinini_values = kalinini_data[metric].dropna()  # Remove missing values
    resplendens_values = resplendens_data[metric].dropna()  # Remove missing values
    
    # Check normality (Shapiro-Wilk test) for both species
    kalinini_shapiro = stats.shapiro(kalinini_values)
    resplendens_shapiro = stats.shapiro(resplendens_values)

    #get statistic and p value
    SW_statistic_kalinini = kalinini_shapiro[0]
    SW_statistic_resplendens = resplendens_shapiro[0]

    SW_pval_kalinini = kalinini_shapiro[1]
    SW_pval_resplendens = resplendens_shapiro[1]
    
    kalinini_normal = SW_pval_kalinini > 0.05  # p-value > 0.05 => normal
    resplendens_normal = SW_pval_resplendens > 0.05  # p-value > 0.05 => normal

    #save info for report
    N_kalinini = kalinini_values.count()
    N_resplendens = resplendens_values.count()

    normality_info_dict[(metric, "N_kalinini")] = N_kalinini
    normality_info_dict[(metric, "N_resplendens")] = N_resplendens
    
    normality_info_dict[(metric, "kalinini")] = kalinini_normal
    normality_info_dict[(metric, "resplendens")] = resplendens_normal

    statistical_info_dict[(metric, "SW_kalinini")] = SW_statistic_kalinini
    statistical_info_dict[(metric, "SW_resplendens")] = SW_statistic_resplendens
    statistical_info_dict[(metric, "SW_pvalue_kalinini")] = SW_pval_kalinini
    statistical_info_dict[(metric, "SW_pvalue_resplendens")] = SW_pval_resplendens
    
    if kalinini_normal and resplendens_normal:
        # Perform Levene's test for homogeneity of variance
        levene_test = stats.levene(kalinini_values, resplendens_values)
        levene_statistic = levene_test.statistic
        levene_pvalue = levene_test.pvalue
        
        print(levene_test)
        #save results
        statistical_info_dict[(metric, "levene_statistic")] = levene_statistic
        statistical_info_dict[(metric, "levene_pvalue")]  = levene_pvalue
        
        # Perform Student's t-test if variances are equal (Levene's test p > 0.05)
        if levene_test.pvalue > 0.05:
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values)
            test_type = "Student's t-test"

            #save info
            statistical_info_dict[(metric, "test_type")]  = test_type
            statistical_info_dict[(metric, "student_t_stat")]  = t_stat
            statistical_info_dict[(metric, "student_t_pvalue")]  = p_value
            
        else:
            # If variances are unequal, use Welch's t-test (Welch correction)
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values, equal_var=False)
            test_type = "Welch's t-test"
            
            #save info
            statistical_info_dict[(metric, "test_type")]  = test_type
            statistical_info_dict[(metric, "welch_t_stat")]  = t_stat
            statistical_info_dict[(metric, "welch_t_pvalue")]  = p_value
        # Interpretation
        interpretation = "significant difference" if p_value < 0.05 else "no significant difference"
        
        t_test_results[metric] = {
            "levene_test": levene_test.pvalue,
            "test_type": test_type,
            "t_stat": t_stat,
            "p_value": p_value,
            "interpretation": interpretation
        }

        statistical_info_dict[(metric, "interpretation")]  = interpretation
    else:
        # If normality fails, use the Mann-Whitney U test
        u_stat, p_value = stats.mannwhitneyu(kalinini_values, resplendens_values)
        test_type = "Mann-Whitney U test"
        
        # Interpretation
        interpretation = "significant difference" if p_value < 0.05 else "no significant difference"
        
        t_test_results[metric] = {
            "levene_test": levene_test.pvalue,
            "test_type": test_type,
            "u_stat": u_stat,
            "p_value": p_value,
            "interpretation": interpretation
        } 

        #save info
        statistical_info_dict[(metric, "test_type")]  = test_type
        statistical_info_dict[(metric, "Mann_Whitney_u_stat")]  = u_stat
        statistical_info_dict[(metric, "Mann_Whitney_pvalue")]  = p_value
        statistical_info_dict[(metric, "interpretation")]  = interpretation

# Print results
global metric_description
metric_description = { "A1": "Vertical length of the head: measured from the center of the clipeus down to the middle of the back of the head.",
                      "A2": "Horizontal length between the left and right sutures",
                      "A3": "Horizontal length between the left and right eye’s canthus",
                      "A4": "Vertical ortogonal length of the clipeus measured from the front down to A2 line",
                      "A5": "Perpendicular vertical length of the clypeus, measured from its front edge to the $A2$ line, representing the clypeus height.",
                      "B1": "Horizontal length between the pronotum’s frontal angles",
                      "B2": "Horizontal length between the pronotum’s middle angles",
                      "B3": "Horizontal length between the pronotum’s hind angles",
                      "B4": "Vertical length of the pronotum’s measured from the middle point of its front down to the middlepoint of its rear",
                      "B5": "Angle of its side measured between the tangent lines to its straightest sections in the front and back, as seen from the top ",
                      "C1": "Angle of its side measured between the tangent lines to its straightest sections in the front and back as seen by the side",
                      "D1": "Mesosternal process’ horizontal length measured from the secant point of the tangents of its sides with the horizontal line used to measure D1. ",
                      "D2": "Mesosternal process’ vertical length measured from the tip of the mesosternal process down to the line that joins the two lowest curves at the sides of the mesosternal process base",
                      "D3": "Horizontal width of the dark middle line measured from its two lower ends",
                      "D4": "Vertical length from the tip of the mesosternal process down to the lowest point of the black patch in the middle of the mesosternal process",
                      "E1": "Horizontal top width of the prosternal plate ",
                      "E2": "Horizontal bottom width of the prosternal plate ",
                      "F1": "Vertical length of the foremost ventral plate",
                      "F2": "Vertical length of the second foremost ventral plate",
                      "F3": "Vertical length of the third foremost ventral plate",
                      "F4": "Vertical length of the fourth foremost ventral plate ",
                      "F5": "Vertical length of the fifth foremost ventral plate",
                      "A1÷A3": "A1/A3 Measure of the vertical length of beetle's head relative to its canthuses' distance width",
                      "A4÷A3": "A4/A3 Measure of the vertical length of beetle's clipeum relative to its canthuses' distance width",
                      "A5÷A3": "A5/A3 Measure of the vertical length of beetle's eyes relative to its canthuses' distance width",
                      "B4÷B1": "Measure of the vertical length of the pronotum relative to its front width. B4/B1",
                      "B4÷B2": "Measure of the vertical length of the pronotum relative to its middle width. B4/B2",
                      "B4÷B3": "Measure of the vertical length of the pronotum relative to its back width. B4/B3",
                      "D3÷D1": "Measure of the total vertical length of the mesosternal process relative to its back horizontal width. D3/D1",
                      "D2÷D1": "Measure of the middle horizontal length of the mesosternal process relative to its back horizontal width. D2/D1",
                      "D4÷D2": "Measure of the vertical length of the mesosternal process down to the middle dark stripe  relative to its middle width. D4/D2",
                      "E1÷E2": "Measure of how square the prosternal plate is. Front width back width ratio E1/E2",
                     }

Analysis_text = ""

# Define the path for the JSON file
metric_description_file = Path("metric_description.json")

# Save the metric_image dictionary
save_dictionary_to_json(metric_image, metric_description_file)

# Later, load the dictionary from the file
metric_image_loaded = load_json_to_dictionary(metric_description_file)



LeveneResult(statistic=np.float64(1.1873103588011569), pvalue=np.float64(0.2855027597681593))
LeveneResult(statistic=np.float64(1.8463607374171744), pvalue=np.float64(0.18545128487010076))
LeveneResult(statistic=np.float64(0.4593400301024974), pvalue=np.float64(0.5036995462867428))
LeveneResult(statistic=np.float64(0.6912440431025132), pvalue=np.float64(0.4119027326960786))
LeveneResult(statistic=np.float64(0.48183421153751266), pvalue=np.float64(0.4926015862846532))
LeveneResult(statistic=np.float64(0.8836542591104996), pvalue=np.float64(0.354244960311101))
LeveneResult(statistic=np.float64(0.05293752148541057), pvalue=np.float64(0.8194928965363276))
LeveneResult(statistic=np.float64(0.06502109723266758), pvalue=np.float64(0.8003607435549867))
LeveneResult(statistic=np.float64(1.535578378382783), pvalue=np.float64(0.22428930789353785))
LeveneResult(statistic=np.float64(0.03728661028032174), pvalue=np.float64(0.8481030704447662))
LeveneResult(statistic=np.float64(2.5025154095680486), p

In [8]:
statistical_info_dict.items()


dict_items([(('A1', 'SW_kalinini'), np.float64(0.8186901827269933)), (('A1', 'SW_resplendens'), np.float64(0.9717395005500788)), (('A1', 'SW_pvalue_kalinini'), np.float64(0.08597610747193121)), (('A1', 'SW_pvalue_resplendens'), np.float64(0.7305562800193002)), (('A1', 'levene_statistic'), np.float64(1.1873103588011569)), (('A1', 'levene_pvalue'), np.float64(0.2855027597681593)), (('A1', 'test_type'), "Student's t-test"), (('A1', 'student_t_stat'), np.float64(-1.3595850141076393)), (('A1', 'student_t_pvalue'), np.float64(0.18520827480214286)), (('A1', 'interpretation'), 'no significant difference'), (('A2', 'SW_kalinini'), np.float64(0.8512213169996418)), (('A2', 'SW_resplendens'), np.float64(0.9646542197574192)), (('A2', 'SW_pvalue_kalinini'), np.float64(0.1610204679797393)), (('A2', 'SW_pvalue_resplendens'), np.float64(0.5633950790160998)), (('A2', 'levene_statistic'), np.float64(1.8463607374171744)), (('A2', 'levene_pvalue'), np.float64(0.18545128487010076)), (('A2', 'test_type'), "S

In [9]:
# Save statistical results
statistical_info_df = pd.DataFrame(
    [(k1, k2, v) for (k1, k2), v in statistical_info_dict.items()],
     columns = ["metric", "statistic", "value"]                             )

df_pivoted = statistical_info_df.pivot(index='metric', columns='statistic', values='value')#.reset_index()
print(df_pivoted.columns)
df_pivoted = df_pivoted[["test_type", "interpretation", 'Mann_Whitney_u_stat', 'Mann_Whitney_pvalue', 'SW_kalinini',
       'SW_pvalue_kalinini','SW_resplendens', 'SW_pvalue_resplendens', 
       'levene_statistic', 'levene_pvalue', 
       'student_t_stat', 'student_t_pvalue', 
       'welch_t_stat', 'welch_t_pvalue']] #Mann_Whitney_pvalue']
df_pivoted.to_csv('statistical_tests.csv', index=True)
df_pivoted


Index(['Mann_Whitney_pvalue', 'Mann_Whitney_u_stat', 'SW_kalinini',
       'SW_pvalue_kalinini', 'SW_pvalue_resplendens', 'SW_resplendens',
       'interpretation', 'levene_pvalue', 'levene_statistic',
       'student_t_pvalue', 'student_t_stat', 'test_type', 'welch_t_pvalue',
       'welch_t_stat'],
      dtype='object', name='statistic')


statistic,test_type,interpretation,Mann_Whitney_u_stat,Mann_Whitney_pvalue,SW_kalinini,SW_pvalue_kalinini,SW_resplendens,SW_pvalue_resplendens,levene_statistic,levene_pvalue,student_t_stat,student_t_pvalue,welch_t_stat,welch_t_pvalue
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A1,Student's t-test,no significant difference,,,0.81869,0.085976,0.97174,0.730556,1.18731,0.285503,-1.359585,0.185208,,
A1÷A3,Student's t-test,no significant difference,,,0.853406,0.167666,0.97358,0.773805,1.502602,0.230856,0.140008,0.889693,,
A2,Student's t-test,significant difference,,,0.851221,0.16102,0.964654,0.563395,1.846361,0.185451,-2.167579,0.039176,,
A3,Student's t-test,significant difference,,,0.930251,0.582042,0.971987,0.736439,0.45934,0.5037,-3.297398,0.002739,,
A4,Mann-Whitney U test,no significant difference,46.0,0.2325,0.712256,0.008301,0.943177,0.210171,,,,,,
A4÷A3,Mann-Whitney U test,no significant difference,56.0,0.510901,0.759839,0.024823,0.898742,0.023817,,,,,,
A5,Mann-Whitney U test,no significant difference,68.0,0.979104,0.874675,0.245484,0.905434,0.032768,,,,,,
A5÷A3,Mann-Whitney U test,no significant difference,77.0,0.693912,0.860109,0.189542,0.876725,0.00864,,,,,,
B1,Mann-Whitney U test,significant difference,58.0,0.012306,0.801559,0.009862,0.97329,0.767065,,,,,,
B2,Student's t-test,significant difference,,,0.955762,0.71796,0.972666,0.75246,0.691244,0.411903,-2.709564,0.010733,,


In [10]:
def remove_decimals_from_latex(latex_text):
    """
    Receives a string containing LaTeX table code and returns a new string where
    any numeric values with decimals are rounded to the nearest integer.
    
    Args:
        latex_text (str): LaTeX code with tables.
    
    Returns:
        str: Modified LaTeX code with numbers formatted to have no decimals.
    """
    def replace_number(match):
        num_str = match.group(0)
        try:
            # Convert the string to a float, round it, and convert back to string.
            num = float(num_str)
            return str(int(round(num)))
        except ValueError:
            # In case of an error, return the original string.
            return num_str

    # This pattern matches numbers containing a decimal point.
    # It assumes numbers are positive. Modify the pattern if you need to match negatives.
    pattern = r'\b\d+\.\d+\b'
    
    # Replace all occurrences using the replace_number function.
    formatted_text = re.sub(pattern, replace_number, latex_text)
    return formatted_text

In [11]:
import scipy.stats as stats
import pandas as pd

# Assuming 'result_df' is your dataframe and 'metrics_under_consideration' is a list of metric names
measurement_df_2 = result_df
kalinini_data = measurement_df_2[measurement_df_2["species"] == "kalinini"]
resplendens_data = measurement_df_2[measurement_df_2["species"] == "resplendens"]

# List of metrics to test
metrics = metrics_under_consideration

# Dictionary to store test results
t_test_results = {}

for metric in metrics:
    # Extract the data for each species' metric and drop missing values
    kalinini_values = kalinini_data[metric].dropna()
    resplendens_values = resplendens_data[metric].dropna()
    
    # Check normality (Shapiro-Wilk test) for both species
    kalinini_normal = stats.shapiro(kalinini_values)[1] > 0.05  # p-value > 0.05 means normal
    resplendens_normal = stats.shapiro(resplendens_values)[1] > 0.05
    
    # Determine overall normality for this metric
    overall_normality = "normal" if (kalinini_normal and resplendens_normal) else "non normal"
    
    # Initialize a dictionary to store this metric's test results
    test_result = {
        "normality_kalinini": kalinini_normal,
        "normality_resplendens": resplendens_normal,
        "normality": overall_normality
    }
    
    if kalinini_normal and resplendens_normal:
        # Perform Levene's test for homogeneity of variances
        levene_test = stats.levene(kalinini_values, resplendens_values)
        # Determine variance equality category
        variance_category = "equal" if levene_test.pvalue > 0.05 else "different"
        test_result["levene_pvalue"] = levene_test.pvalue
        test_result["variance"] = variance_category
        
        # Choose the appropriate t-test
        if variance_category == "equal":
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values)
            test_type = "Student's t-test"
        else:
            t_stat, p_value = stats.ttest_ind(kalinini_values, resplendens_values, equal_var=False)
            test_type = "Welch's t-test"
            
        test_result["test_type"] = test_type
        test_result["t_stat"] = t_stat
        test_result["p_value"] = p_value
    else:
        # When at least one group fails normality, use the Mann-Whitney U test
        u_stat, p_value = stats.mannwhitneyu(kalinini_values, resplendens_values)
        test_result["test_type"] = "Mann-Whitney U test"
        test_result["u_stat"] = u_stat
        test_result["p_value"] = p_value
        # For non-parametric tests, we generally consider the variance comparison as 'different'
        test_result["variance"] = "different"
        test_result["levene_pvalue"] = None

    # Interpretation of significance (textual) and add a categorical significance field
    interpretation = "significant difference" if p_value < 0.05 else "no significant difference"
    significance = "significant" if p_value < 0.05 else "non significant"
    test_result["interpretation"] = interpretation
    test_result["significance"] = significance
    
    # Save the results for this metric
    t_test_results[metric] = test_result

# Convert the results dictionary to a DataFrame
results_df = pd.DataFrame.from_dict(t_test_results, orient='index').reset_index().rename(columns={'index': 'metric'})
#results_df.columns = results_df.columns.str.replace('_', '-')
print(results_df)


   metric  normality_kalinini  normality_resplendens   normality  \
0      A1                True                   True      normal   
1      A2                True                   True      normal   
2      A3                True                   True      normal   
3      A4               False                   True  non normal   
4      A5                True                  False  non normal   
5      B1               False                   True  non normal   
6      B2                True                   True      normal   
7      B3                True                   True      normal   
8      B4                True                   True      normal   
9      B5               False                   True  non normal   
10     C1                True                   True      normal   
11     D1                True                   True      normal   
12     D2                True                   True      normal   
13     D3                True                   

In [12]:
from datetime import datetime
import pandas as pd

# Constants
current_date = datetime.today().date()

def create_paragraph(text):
    """
    Returns a LaTeX snippet for a centered paragraph.

    Parameters:
        text (str): The text to be inserted in the paragraph.
        
    Returns:
        str: LaTeX code for the centered paragraph.
    """
    # In LaTeX, centering text can be done with the center environment
    latex_paragraph = (
        "\\begin{center}\n"
        f"{text}\n"
        "\\end{center}\n\n"
    )
    return latex_paragraph
    

In [13]:
from datetime import datetime

# Global constant for the current date
current_date = datetime.today().date()

def front_page():
    """
    Returns the LaTeX code for a front page (title page) similar to the ReportLab front_page() function.
    """
    # Construct the LaTeX code for the title page
    latex_front_page = (
        "\\begin{titlepage}\n"
        "  \\centering\n\n"
        "  % Title\n"
        "  {\\Huge \\textbf{Biometry report} \\par}\n"
        "  \\vspace{1.5cm}\n\n"
        "  % Authors\n"
        "  {\\Large Dra. Marcela Hernández, Dr. Esteban Bermúdez Ureña, Angel Aguirre \\& Esteban Soto. \\par}\n"
        "  \\vspace{0.5cm}\n\n"
        "  % Institution\n"
        "  {\\Large Centro de Investigación en Ciencia e Ingeniería de los Materiales \\par}\n"
        "  \\vspace{0.5cm}\n\n"
        "  % Additional details\n"
        "  {\\Large 2025 \\par}\n"
        "  {\\Large University of Costa Rica \\par}\n"
        "  \\vspace{0.5cm}\n"
        "  {\\Large " + str(current_date) + " \\par}\n\n"
        "  \\vfill\n"
        "\\end{titlepage}\n"
        "\\newpage\n"
    )
    return latex_front_page




In [14]:
def introduction_section(df):
    """
    Generates LaTeX code for the introduction section of the report.

    Parameters:
        df (pandas.DataFrame): DataFrame containing specimen data with at least
                               columns 'species' and 'code'.

    Returns:
        str: LaTeX code for the introduction section.
    """
    # Compute specimen counts from the dataframe
    number_of_kalinini_specimens = df[df["species"] == "kalinini"]["code"].count()
    number_of_resplendens_specimens = df[df["species"] == "resplendens"]["code"].count()
    
    # Build the LaTeX introduction section
    latex_introduction = (
        "\\section{Introduction}\n\n"
        "Zubov et al. (2019) describe a new species of \\textit{Chrysina}. In its comparative analysis, "
        "it is stated that the new species is very similar to \\textit{C. resplendens} and only a few "
        "morphological differences can be noted. This work intends to perform a quantitative analysis of these "
        "differences using a sample of "
        f"{number_of_kalinini_specimens} \\textit{{C. kalinini}} specimens and "
        f"{number_of_resplendens_specimens} \\textit{{C. resplendens}} specimens.\n\n"
        "The measurements described in the article are specified more precisely and alternative metrics are analyzed.\n\n"
        "\\newpage\n"
    )
    
    return latex_introduction

introduction_section(measurement_df_2 )

'\\section{Introduction}\n\nZubov et al. (2019) describe a new species of \\textit{Chrysina}. In its comparative analysis, it is stated that the new species is very similar to \\textit{C. resplendens} and only a few morphological differences can be noted. This work intends to perform a quantitative analysis of these differences using a sample of 11 \\textit{C. kalinini} specimens and 23 \\textit{C. resplendens} specimens.\n\nThe measurements described in the article are specified more precisely and alternative metrics are analyzed.\n\n\\newpage\n'

In [15]:
def methodology_latex(df):
    """
    Generates LaTeX code for the methodology section of the report.

    Parameters:
        df (pandas.DataFrame): DataFrame containing specimen data with at least
                               the columns "species", "code", "sex_code", and "location_code".
    
    Returns:
        str: LaTeX code for the methodology section.
    """
    # Compute counts and groupings
    number_of_kalinini_specimens = df[df["species"] == "kalinini"]["code"].count()
    number_of_resplendens_specimens = df[df["species"] == "resplendens"]["code"].count()
    
    # Unique locations per species; convert numpy arrays to comma‐separated strings
    unique_locations_by_species = df.groupby("species")["location_code"].unique()
    unique_locations = {}
    for species, locations in unique_locations_by_species.items():
        # Convert each numpy array of locations to a string
        loc_str = ", ".join(map(str, locations))
        unique_locations[species] = loc_str
    
    # Sex counts for C. kalinini
    number_of_kalinini_m_specimens = df[(df["species"] == "kalinini") & (df["sex_code"] == "M")]["code"].count()
    number_of_kalinini_f_specimens = df[(df["species"] == "kalinini") & (df["sex_code"] == "F")]["code"].count()
    number_of_kalinini_u_specimens = number_of_kalinini_specimens - number_of_kalinini_m_specimens - number_of_kalinini_f_specimens

    # Sex counts for C. resplendens
    number_of_resplendens_m_specimens = df[(df["species"] == "resplendens") & (df["sex_code"] == "M")]["code"].count()
    number_of_resplendens_f_specimens = df[(df["species"] == "resplendens") & (df["sex_code"] == "F")]["code"].count()
    # Note: The original code repeats the unknown count for kalinini. If a similar unknown count is needed
    # for C. resplendens, it can be computed as shown below. Otherwise, you may adjust accordingly.
    number_of_resplendens_u_specimens = number_of_resplendens_specimens - number_of_resplendens_m_specimens - number_of_resplendens_f_specimens

    # Build the LaTeX section
    latex_methodology = (
        "\\section{Methodology}\n\n"
        "Chrysina samples were retrieved from the following locations: \\\\ \n"
    )

    # Include the unique locations information for each species
    for species, locs in unique_locations.items():
        latex_methodology += f"\\textit{{{species}}}: {locs} \\\\ \n"
    
    latex_methodology += "\n"

    # Sex distribution text as a bullet list
    latex_methodology += (
        "Sex distribution is as follows:\n"
        "\\begin{itemize}\n"
        f"  \\item \\textit{{C. kalinini}}: {number_of_kalinini_m_specimens} males, {number_of_kalinini_f_specimens} females, {number_of_kalinini_u_specimens} unknown\n"
        f"  \\item \\textit{{C. resplendens}}: {number_of_resplendens_m_specimens} males, {number_of_resplendens_f_specimens} females, {number_of_resplendens_u_specimens} unknown\n"
        "\\end{itemize}\n\n"
    )

    # Additional paragraphs with a bit of spacing
    latex_methodology += (
        "Using an estereoscope (Resolution 4.781 $\\mu$m per pixel), its head, clipeum, mesosternal process, prosternal process, and ventral plates were measured. \n\n"
        "An OCR software was used to retrieve the measurements and to add contextual information about collection location, sex, genus, and species. \n\n"
        "Zubov et al.'s morphological differences were calculated using the metrics taken with the estereoscope. \n\n"
        r"""For the statistical analysis, the measurement dataset is filtered to include only entries that correspond to either \textit{resplendens} or \textit{kalinini} specimens. 

For each metric, \texttt{NA} values are dropped, and a Shapiro-Wilk test for normality is performed for both species.

If the p-value of the Shapiro-Wilk test is greater than 0.05, the dataset can be assumed to be normal.

If both datasets are normal, a Levene test is performed to check for homogeneity of variances. 

If the value for the Levene’s test is greater than 0.05, variances are deemed to be equal.

At this point, one of the following cases will occur:

\begin{itemize}
    \item If the datasets are normal with equal variances, a Student’s t-test is applied.
    \item If the datasets are normal with different variances, a Welch’s t-test is applied.
    \item If at least one group is not normal, a Mann-Whitney U test is applied.
\end{itemize}

For all three tests, if the p-value is less than 0.05, there is no significant difference.

"""
    )

    # Append a page break at the end of the section
    latex_methodology += "\\newpage\n"

    return latex_methodology


In [16]:
from pathlib import Path
import pandas as pd

# Assume these global dictionaries exist:
# normality_info_dict, metric_description, t_test_results, metric_image, protocol_image_location
# Also assume figure_counter is initialized somewhere (e.g., figure_counter = 1)

# ------------------------------------------------------------------------------
# Helper Functions (Placeholders)
# ------------------------------------------------------------------------------

def get_metric_image_file(image_path):
    """
    Compresses and saves the image at image_path to a new file and returns the new filename.
    
    This is a placeholder for your image processing code.
    """
    from PIL import Image as PILImage
    # Create a compressed image filename based on the original filename.
    path = Path(image_path)
    basename = path.name.replace(".png", "") 
    parent = path.parent

    current_path = Path.cwd()  # Call the function to get the current working directory
    compressed_image_path = current_path / "report_output" / "images" / f"{basename}.jpeg"
    
    original_image = PILImage.open(image_path)
    try:
        original_image.save(compressed_image_path, "JPEG", quality=70)
    except Exception:
        original_image.save(compressed_image_path, "PNG")
    return compressed_image_path

# ------------------------------------------------------------------------------
# LaTeX Section Functions
# ------------------------------------------------------------------------------

def dataset_description_latex(df, image_path, group_by="species"):
    """
    Generates LaTeX code for a dataset description section that includes a
    normality test report and boxplot plots for each numerical metric.
    
    Parameters:
        df (pd.DataFrame): The dataset.
        group_by (str): Column name to group data by (default is "species").
        
    Returns:
        str: LaTeX code for the dataset description section.
    """
    latex = ""
    latex += "\\section{Dataset Description}\n\n"
    
    # Normality test subsection
    latex += "\\subsection{Normality Test}\n\n"
    latex += (
        f"Shapiro-Wilk p-values for \\textit{{C. kalinini}} population are "
        f"{normality_info_dict['kalinini']:.2f}, and for \\textit{{C. resplendens}} population are "
        f"{normality_info_dict['resplendens']:.2f}.\n\n"
    )
    
    # boxplot plots subsection
    latex += "\\subsection{Boxplots for Each Metric}\n\n"
    
    # Loop through all numeric columns
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric_columns:
        latex += f"\\subsubsection*{{Metric {column}}}\n\n"
        desc = metric_description.get(column, "No description available.")
        latex += desc + "\n\n"
        
        # Generate and save the boxplot plot image
        
        image_folder = Path("report_output") /"images"
        image_filename = image_folder / f"boxplot_{column}.png"
        generate_boxplot_with_stripplot(df, column, group_by, image_filename)

        image_location = Path("images") / f"boxplot_{column}.png"
        posix_image_filename = image_location.as_posix()
        
        # Include the image in a figure environment
        latex += "\\begin{figure}[H]\n\\centering\n"
        latex += f"\\includegraphics[width=0.7\\linewidth]{{{posix_image_filename}}}\n"
        latex += f"\\caption{{Boxplot and specimen distribution (superposed) for the metric {column} by {group_by}}}\n"
        latex += "\\end{figure}\n\n"
    
    latex += "\\newpage\n"
    return latex

def Analysis_latex(df, figure_counter=1, group_by = "species"):
    """
    Generates LaTeX code for a statistical analysis section. For each metric in the
    global t_test_results dictionary, it includes a boxplot plot, test details, and
    optionally a metric image if available.
    
    Parameters:
        df (pd.DataFrame): The dataset.
        figure_counter (int): Starting figure counter (default is 1).
        
    Returns:
        str: LaTeX code for the statistical analysis section.
    """
    #species_under_consideration = ["kalinini","resplendens"]
    #df = df[df["species"] ==species_under_consideration]
    
    latex = ""
    latex += "\\section{Statistical Analysis}\n\n"
    
    for metric, result in t_test_results.items():
        latex += f"\\subsection*{{Metric: {metric}}}\n\n"
        desc = metric_description.get(metric, "No description available.")
        latex += desc + "\n\n"
        
        # Generate and save the boxplot plot for the current metric
        image_filename = Path("report_output")/ "images" / "boxplot"/ f"boxplot_{metric}.png"
        generate_boxplot_with_stripplot(df, metric, "species", image_filename)

        image_location = Path("images")/ "boxplot" / f"boxplot_{metric}.png"
        posix_image_filename = image_location.as_posix()
        print(posix_image_filename)
        latex += "\\begin{figure}[H]\n\\centering\n"
        latex += f"\\includegraphics[width=0.7\\linewidth]{{{posix_image_filename}}}\n"
        latex += f"\\caption{{  Boxplot and specimen distribution (superposed) for the metric  {metric} by {group_by}}}\n"
        latex += "\\end{figure}\n\n"
        figure_counter += 1
        
        # Include test results as a series of bold labels and values
        latex += "\\noindent\\textbf{Test Type:} " + result["test_type"] + " \\\\\n"
        test_stat = result.get("t_stat", result.get("u_stat"))
        latex += "\\noindent\\textbf{Test Statistic:} " + f"{test_stat:.3f}" + " \\\\\n"
        latex += "\\noindent\\textbf{P-value:} " + f"{result["p_value"]:.3f}"+ " \\\\\n"
        latex += "\\noindent\\textbf{Interpretation:} " + result["interpretation"] + "\n\n"
        
        # Optionally include a metric image if it exists and the metric name does not start with "W"
        if (not metric.startswith("W")) and (metric in metric_image):
            # Build the image path and get the compressed image filename
            image_path = Path(protocol_image_location) / metric_image[metric]
            metric_img_file = get_metric_image_file(image_path)

            metric_img_location = Path("images")/"protocol"  / metric_image[metric]
            latex += "\\begin{figure}[H]\n\\centering\n"
            latex += f"\\includegraphics[width=0.5\\linewidth]{{{metric_img_location.as_posix()}}}\n"
            latex += f"\\caption{{ Metric {metric}}}\n"
            latex += "\\end{figure}\n\n"
            figure_counter += 1
        
        latex += "\\newpage\n"
    
    return latex


In [17]:
import pandas as pd

def dataframe_to_latex_tables(df , unit = "mm"):
    """
    Converts a Pandas DataFrame to a single LaTeX table with vertical dividers between columns,
    horizontal lines above, below the header, and at the bottom, and a gray background for the header.
    All numeric values are rounded to three decimals.

    Returns:
        str: A string containing the complete LaTeX code for the table.
    """
    # Round numeric values to three decimals
    df_rounded = df.round(3)
    
    # Determine the number of columns and create a column format with vertical dividers.
    n_cols = df_rounded.shape[1]
    col_format = "|" + "c|" * n_cols

    # Begin building the LaTeX table string.
    latex_str = "\\begin{table}[H]\n\\centering\n"
    latex_str += f"\\begin{{tabular}}{{{col_format}}}\n"
    
    # Add top horizontal line.
    latex_str += "\\hline\n"

    #Add units to every column 
    if unit:   
        unit_str = f"({unit})"
        keywords = ["÷", "test", "p_value", "percentage","percentual", "metric"]
        
        def insert_units(lst, unit_str):
            new_list = []
            for element in lst:
                if element == "species": #omit species
                    new_list.append( f"{element}" )
                    continue
                if any(keyword in element for keyword in keywords):
                    print(element)
                    new_list.append(f"{element}")
                    continue
                else:
                    new_list.append( f"{element} {unit_str}" )
            return new_list
    
        columns = insert_units(df_rounded.columns, unit_str)
    else:
        columns = df_rounded.columns
    # Add header row with gray background.
    # latex_str += "\\rowcolor{gray!30}\n"
    header_row = " & ".join(columns) + " \\\\ \n"
    header_row = header_row.replace("_", "\\_")
    latex_str += header_row
    
    # Insert a horizontal line below the header.
    latex_str += "\\hline\n"
    
    # Add each data row.
    for _, row in df_rounded.iterrows():
        row_items = " & ".join(str(item) for item in row)
        latex_str += row_items + " \\\\ \n"
    
    # Add a final horizontal line.
    latex_str += "\\hline\n"
    
    # End the tabular environment and the table.
    latex_str += "\\end{tabular}\n\\end{table}\n"
    
    return latex_str




In [18]:
def chunk_list(lst, chunk_size=3):
    """Split lst into chunks of size chunk_size."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

In [19]:
def std_and_averages_for_each_metric(df, results_df):
    using_millimiters = True
    # Select only numeric columns
    #numeric_cols = df.select_dtypes(include='number').columns
    
    significant_metrics = results_df.loc[results_df["significance"] == "significant", "metric"].tolist()
    list_of_lists = chunk_list(significant_metrics, chunk_size=2)
    #print(list_of_lists)
    
    tables = []
    for x in list_of_lists: 
        # Group by 'species' and compute mean and std for numeric columns
        result = df.groupby('species')[x].agg(['mean', 'std'])
        if using_millimiters:
            result = result/1000 #convert to mm
        #print(result)
        
        # Optionally flatten MultiIndex columns if desired
        result.columns = ['_'.join(col).strip() for col in result.columns.values]
        result = result.reset_index()
        print(result.columns.values)
       
        
        table = dataframe_to_latex_tables(result, unit = "mm")
        tables += [table]
    concat = ""
    #concat += "\\subsection{Standard deviation and average for each significant metric} \n"
    for element in tables:
        text = element
        if not using_millimiters:
            text = remove_decimals_from_latex(latex_text=element)  #si sse hace en micrómetros
        concat += text
    concat += "\n"
    return concat
    #return tables

In [20]:
def pvalue_analysis(df, results_df):
    final_text = ""
    subsection = ""
    #subsection = "\\subsection{P value analysis for non significant metrics} \n"
    mean_of_pvalue = (results_df[results_df["significance"]== "non significant" ]["p_value"].mean())
    std_of_pvalue = (results_df[results_df["significance"]== "non significant" ]["p_value"].std())
    min_of_pvalue = (results_df[results_df["significance"]== "non significant" ]["p_value"].min())
    max_of_pvalue = (results_df[results_df["significance"]== "non significant" ]["p_value"].max())
    text_1 = f"For the non significant metrics, its range varies between {min_of_pvalue:.2f} and {max_of_pvalue:.2f}.\n "
    non_sig_df = results_df[results_df["significance"]== "non significant" ][["metric","p_value"]]
    #print(non_sig_df)
    non_sig_df["difference"] = non_sig_df["p_value"] - 0.05
    non_sig_df["percentual_diff"] = non_sig_df["difference"] /0.05 * 100
    
   
   
    m = non_sig_df[["metric", "p_value", "difference", "percentual_diff"]].sort_values(by="percentual_diff")
    m = m.round(2)
    table_text = dataframe_to_latex_tables(m)
    #table = remove_decimals_from_latex(latex_text=table_text) 
    #print(text_1)
    #print(text_2)
    #print(m)
    #print(table_text)
    close_metrics = non_sig_df["metric"].head(5).values
    text_2 = f"""That means for some non significant metrics, its p value is very close to 0.05, suggesting that there is a possibility with
    more samples its difference could be significant. For these metrics further investigation is required: {close_metrics}.\n
    """
    final_text += subsection
    final_text += text_1
    final_text += table_text
    final_text += text_2
    return final_text
    
pvalue_analysis(measurement_df_2, results_df)

metric
p_value
percentual_diff


"For the non significant metrics, its range varies between 0.06 and 0.98.\n \\begin{table}[H]\n\\centering\n\\begin{tabular}{|c|c|c|c|}\n\\hline\nmetric & p\\_value & difference (mm) & percentual\\_diff \\\\ \n\\hline\nE1÷E2 & 0.06 & 0.01 & 15.59 \\\\ \nF5 & 0.06 & 0.01 & 20.82 \\\\ \nB3 & 0.1 & 0.05 & 98.88 \\\\ \nB5 & 0.11 & 0.06 & 126.81 \\\\ \nD1 & 0.15 & 0.1 & 209.36 \\\\ \nA1 & 0.19 & 0.14 & 270.42 \\\\ \nA4 & 0.23 & 0.18 & 365.0 \\\\ \nB4 & 0.26 & 0.21 & 413.57 \\\\ \nD3 & 0.26 & 0.21 & 419.19 \\\\ \nD2÷D1 & 0.33 & 0.28 & 553.29 \\\\ \nD3÷D1 & 0.37 & 0.32 & 640.19 \\\\ \nC1 & 0.4 & 0.35 & 701.79 \\\\ \nA4÷A3 & 0.51 & 0.46 & 921.8 \\\\ \nD2 & 0.52 & 0.47 & 944.6 \\\\ \nB4÷B1 & 0.58 & 0.53 & 1061.64 \\\\ \nB4÷B2 & 0.61 & 0.56 & 1112.58 \\\\ \nA5÷A3 & 0.69 & 0.64 & 1287.82 \\\\ \nB4÷B3 & 0.8 & 0.75 & 1493.3 \\\\ \nA1÷A3 & 0.89 & 0.84 & 1679.39 \\\\ \nE2 & 0.94 & 0.89 & 1782.62 \\\\ \nA5 & 0.98 & 0.93 & 1858.21 \\\\ \n\\hline\n\\end{tabular}\n\\end{table}\nThat means for some non si

In [21]:
def create_tables_latex(df, results_df):
    text = ""
    text += std_and_averages_for_each_metric(df, results_df)
    text += r"\newpage"
    text += pvalue_analysis(df, results_df)
    return text

In [22]:
def result_summary(results_df):
    text = r"\onecolumngrid"
    text += r"\newpage"
    text += "\n"
    text += r"\small"
    #text += r"\begin{sidewaystable}[H]"
    text += r"\centering"
    print(results_df.columns)
    text += dataframe_to_latex_tables(results_df[['metric', 'normality_kalinini', 'normality_resplendens', 'normality',
       'levene_pvalue', 'variance', 'test_type', 't_stat', 'p_value']])   
    text += "\n" 
    #text += r"\end{sidewaystable}"
    text += "\n"
    text += r"\normalsize "
    
    text += r"\newpage"
    text += "\n"
    text += r"\small"
    #text += r"\begin{sidewaystable}[H]"
    text += r"\centering"
    print(results_df.columns)
    text += dataframe_to_latex_tables(results_df[['metric','interpretation', 'u_stat', 'significance']])   
    text += "\n" 
    #text += r"\end{sidewaystable}"
    text += "\n"
    text += r"\normalsize "
    return text
result_summary(results_df)

Index(['metric', 'normality_kalinini', 'normality_resplendens', 'normality',
       'levene_pvalue', 'variance', 'test_type', 't_stat', 'p_value',
       'interpretation', 'significance', 'u_stat'],
      dtype='object')
metric
test_type
p_value
Index(['metric', 'normality_kalinini', 'normality_resplendens', 'normality',
       'levene_pvalue', 'variance', 'test_type', 't_stat', 'p_value',
       'interpretation', 'significance', 'u_stat'],
      dtype='object')
metric


"\\onecolumngrid\\newpage\n\\small\\centering\\begin{table}[H]\n\\centering\n\\begin{tabular}{|c|c|c|c|c|c|c|c|c|}\n\\hline\nmetric & normality\\_kalinini (mm) & normality\\_resplendens (mm) & normality (mm) & levene\\_pvalue (mm) & variance (mm) & test\\_type & t\\_stat (mm) & p\\_value \\\\ \n\\hline\nA1 & True & True & normal & 0.286 & equal & Student's t-test & -1.36 & 0.185 \\\\ \nA2 & True & True & normal & 0.185 & equal & Student's t-test & -2.168 & 0.039 \\\\ \nA3 & True & True & normal & 0.504 & equal & Student's t-test & -3.297 & 0.003 \\\\ \nA4 & False & True & non normal & nan & different & Mann-Whitney U test & nan & 0.232 \\\\ \nA5 & True & False & non normal & nan & different & Mann-Whitney U test & nan & 0.979 \\\\ \nB1 & False & True & non normal & nan & different & Mann-Whitney U test & nan & 0.012 \\\\ \nB2 & True & True & normal & 0.412 & equal & Student's t-test & -2.71 & 0.011 \\\\ \nB3 & True & True & normal & 0.493 & equal & Student's t-test & -1.697 & 0.099 \\\

In [23]:
def generate_latex_preamble():
    """
    Generates a LaTeX preamble that includes all required packages and settings.

    Returns:
        str: A string containing the LaTeX preamble and beginning of the document.
    """
    preamble = r"""

% Input encoding
\usepackage[utf8]{inputenc}

% Language (optional; adjust if needed)
\usepackage[english]{babel}

% For graphics inclusion
\usepackage{graphicx}

% For controlling float placement (e.g. [H] for figures and tables)
\usepackage{float}

% For nicer tables (booktabs for horizontal lines, xcolor for coloring)
\usepackage{booktabs}
\usepackage[table]{xcolor}

% Adjust margins (optional)
\usepackage[margin=1in]{geometry}

% For hyperlinks (optional)
\usepackage{hyperref}

% For improved font rendering (optional)
\usepackage{lmodern}


% --------------------------
% You can now include sections, figures, tables, etc.
% --------------------------
"""
    return preamble




In [24]:
aggregated_mean

Unnamed: 0_level_0,A1,A2,A3,A4,A5,B1,B2,B3,B4,B5,...,A1÷A3,A4÷A3,A5÷A3,B4÷B1,B4÷B2,B4÷B3,D2÷D1,D3÷D1,D4÷D2,E1÷E2
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cupreomarginata,4039.8225,3546.72375,4507.655,1992.43375,1530.9825,5273.87,7510.0325,8050.69875,4812.46125,153.56875,...,0.897067,0.442298,0.339964,0.912445,0.64093,0.59783,0.498265,1.434794,1.203349,0.636558
kalinini,4002.656667,3609.385,4608.27,1658.206667,1854.68,5490.752727,8238.267273,8718.554545,5458.398182,151.621818,...,0.868765,0.359353,0.403076,0.99406,0.662431,0.62618,0.562978,1.862876,1.062452,0.612242
resplendens,4161.616087,3772.468261,4810.819565,1789.721304,1846.759565,5679.961304,8561.34087,8984.585217,5585.811304,148.978261,...,0.865372,0.371797,0.383791,0.983549,0.65259,0.622042,0.542486,1.807503,1.2268,0.688782


In [25]:

significant_results = results_df.loc[results_df["significance"] == "significant", "metric"].to_list()
significant_metrics_df = aggregated_mean[significant_results]
non_significant_results = results_df.loc[results_df["significance"] != "significant", "metric"].to_list()
non_significant_metrics_df = aggregated_mean[significant_results]
significant_metrics_df

Unnamed: 0_level_0,A2,A3,B1,B2,D4,E1,F1,F2,F3,F4,D4÷D2
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cupreomarginata,3546.72375,4507.655,5273.87,7510.0325,689.93,415.8725,1447.68375,1481.27125,1538.33125,1971.52125,1.203349
kalinini,3609.385,4608.27,5490.752727,8238.267273,732.170909,503.52,1402.484,1397.5,1451.6,1935.343,1.062452
resplendens,3772.468261,4810.819565,5679.961304,8561.34087,859.733478,576.953043,1464.046087,1475.331739,1537.430435,2184.256087,1.2268


In [26]:
def metric_significance_description(metric, mean_df, std_df, significance, unit = ""):
    text = ""
    species = ["kalinini", "resplendens"]
    #print(species)
    text += f"For the metric {metric} "
    unit = "mm"
    
    for sp in species:
        average = mean_df.loc[sp, metric]
        std = std_df.loc[sp, metric]
        average_mm = average/1000
        std_mm = std/1000
        txt = f"C. {sp} has an average of {average_mm:.2f} ${unit}$ and a standard deviation of  {std_mm:.2f} ${unit}$. " #{std_mm:.1e}
        text += txt
    text += f"The difference between species is {significance}\n"
    return text
metric_significance_description(metric = "A2", mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")

'For the metric A2 C. kalinini has an average of 3.61 $mm$ and a standard deviation of  0.12 $mm$. C. resplendens has an average of 3.77 $mm$ and a standard deviation of  0.17 $mm$. The difference between species is statiscally significant\n'

In [27]:
def conclusion_latex(df):
    """
    Generates LaTeX code for the conclusion section.
    
    This section includes:
      - A subsection heading for the comparison with Zubov et al. claims.
      - Several claims with italicized and normal text.
      - A page break after the claims.
      - A comparative table (generated from the DataFrame).
      - A final page break.
      
    Parameters:
        df (pandas.DataFrame): The DataFrame to be used for generating the comparative table.
        
    Returns:
        str: LaTeX code for the conclusion section.
    """
    latex = ""
    
    # Comparison heading
    latex += "\\subsection{Comparison with Zubov et al. claims}\n\n"
    
    # Claim 1
    latex += "\\subsubsection*{Claim 1}\n"
    latex += "\\textit{The new species is very close to \\textit{C. resplendens} and has only few morphological differences from it. "
    latex += "Clypeus of \\textit{C. kalinini sp.n.} is slightly longer than in \\textit{C. resplendens}.}\n\n"
    latex += "Head's vertical clipeus length, \\textbf{A5}, shows a statistically significant difference. "
    #latex += metric_significance_description(metric = "A5", mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")
    #latex += metric_significance_description(metric = "A3", mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")
    #latex += "These can be used as alternatives to \\textbf{A1÷A3}.\n\n"
    
    # Claim 2
    latex += "\\subsubsection*{Claim 2}\n"
    latex += "\\textit{Pronotum in \\textit{C. kalinini sp.n.} is slightly longer in relation to its width than in \\textit{C. resplendens}, its sides have smaller angles, whereas in \\textit{C. resplendens} the sides of pronotum are rounded.}\n\n"
    latex += "None of the pronotum's vertical length---horizontal width ratios (Metrics \\textbf{B4÷B1}, \\textbf{B4÷B2}, \\textbf{B4÷B3}) showed a significant difference between species.\n\n"
    latex += metric_significance_description(metric = "B1", mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")
    latex += metric_significance_description(metric = "B2", mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")
    latex += "The angle of the pronotum, as seen from its side (C1), has no significant difference between species."
    latex += "The angle of the pronotum, or as seen from the top (B5), has no significant difference between species."
    
    # Claim 3
    latex += "\\subsubsection*{Claim 3}\n"
    latex += "\\textit{Mesosternal process shiny, shorter and wider than in \\textit{C. resplendens}, where the process is long and narrow and its "
    latex += "apical half is greenish golden (Fig. 6--8).}\n\n"
    latex += "The first approach is to interpret the claim as a statement about the width-length ratio of the mesosternal process. \n\n"
    latex += "The vertical length base width ratio, \\textbf{D3÷D1},  is not statistically significant \n\n"
    latex += "The vertical length down to the vertex of the dark stripe of the mesosternal process- horizontal length of the dark stripe, \\textbf{D4÷D2}\n\n, is not statistically significant."
    
    latex += "There is a significant difference in  the absolute vertical length values between the two species (Metric \\textbf{D2}, Figure 23). "
    latex += "There is no significant difference in their widths (Metrics \\textbf{D1} and \\textbf{D3}). "
    latex += "There is a significant difference between species in the vertical distance between the tip of the mesosternal process and the lower point of the dark curve in its middle.\n\n"
    
    
    
    # Claim 4
    latex += "\\subsubsection*{Claim 4}\n"
    latex += "\\textit{Prosternal plate of \\textit{C. kalinini sp.n.} is rounded triangular and flat, whereas in \\textit{C. resplendens} it is square and has a clear dent.}\n\n"
    latex += "Although there is a difference between the absolute values of the foremost width of the prosternal process (Metric \\textbf{E1}, Figure 29), "
    latex += "there is no significant difference in how square the prosternal plate is for each species when the ratio of lengths is taken into account "
    latex += "(Metric \\textbf{E1÷E2}, ratio between \\textbf{E1} and \\textbf{E2}; Figure 51).\n\n"

    # other metrics
    latex += "\\subsubsection*{Other metrics}\n"
    latex += "Other metrics of interest, not directly related to any of Zubov et al's claims are the following:\n\n"
    for metric in ["D4","E1","F1","F2","F3","F4"]:
        
        latex += metric_significance_description(metric = metric, mean_df = aggregated_mean, std_df = aggregated_std, significance = "statiscally significant", unit="\\mu m")
        latex += "\n\n"
    # Insert a page break after the claims
    latex += "\\newpage\n\n"
    
    # Comparative table heading
    latex += "\\subsection{A word of caution}\n\n"
    latex += "Even though there are multiple significant metrics, all of them are unfeasible to be used in the field given that most of these differences are of less than 1 mm in length.:\n\n"
    # Generate LaTeX tables from the DataFrame.
    # This assumes that you have a function 'dataframe_to_latex_tables' that returns a list of table strings.
    #tables = dataframe_to_latex_tables(df)
    #for table in tables:
    #    latex += table + "\n"
    
    # Final page break
    latex += "\\newpage\n\n"
    
    return latex



In [28]:
def create_bib_file():
    bib_content = """@article{zubov2019chrysina,
  author = {A.S. Zubov and N.V. Ivshin and A. Yu. Titarenko and B.V. Andrianov},
  title = {Description of a new species of Chrysina Kirby, 1828 (Coleoptera: Scarabaeidae: Rutelinae) from resplendens group, based on morphological characters and mtDNA COX I molecular marker},
  journal = {Acta Biologica Sibirica},
  year = {2019},
  volume = {5},
  number = {1},
  pages = {71--76},
  issn = {2412-1908},
  doi = {10.14258/abs.v5.i1.5194}
}









%
% Below is a list of possible bibliography entries along with their required and optional fields
%    Taken from the wikipedia entry for BibTeX at http://en.wikipedia.wiki/BibTeX 19-Nov-2007
%
%
%

%article
%    An article from a journal or magazine.
%    Required fields: author, title, journal, year
%    Optional fields: volume, number, pages, month, note, key
%book
%    A book with an explicit publisher.
%    Required fields: author/editor, title, publisher, year
%    Optional fields: volume, series, address, edition, month, note, key
%booklet
%    A work that is printed and bound, but without a named publisher or sponsoring institution.
%    Required fields: title
%    Optional fields: author, howpublished, address, month, year, note, key
%conference
%    The same as inproceedings, included for Scribe (markup language) compatibility.
%    Required fields: author, title, booktitle, year
%    Optional fields: editor, pages, organization, publisher, address, month, note, key
%inbook
%    A part of a book, which may be a chapter (or section or whatever) and/or a range of pages.
%    Required fields: author/editor, title, chapter/pages, publisher, year
%    Optional fields: volume, series, address, edition, month, note, key
%incollection
%    A part of a book having its own title.
%   Required fields: author, title, booktitle, year
%    Optional fields: editor, pages, organization, publisher, address, month, note, key
%inproceedings
%    An article in a conference proceedings.
%    Required fields: author, title, booktitle, year
%    Optional fields: editor, pages, organization, publisher, address, month, note, key
%manual
%    Technical documentation.
%    Required fields: title
%    Optional fields: author, organization, address, edition, month, year, note, key
%mastersthesis
%    A Master's thesis.
%    Required fields: author, title, school, year
%    Optional fields: address, month, note, key
%misc
%    For use when nothing else fits.
%    Required fields: none
%    Optional fields: author, title, howpublished, month, year, note, key
%phdthesis
%    A Ph.D. thesis.
%    Required fields: author, title, school, year
%    Optional fields: address, month, note, key
%proceedings
%    The proceedings of a conference.
%    Required fields: title, year
%    Optional fields: editor, publisher, organization, address, month, note, key
%techreport
%    A report published by a school or other institution, usually numbered within a series.
%    Required fields: author, title, institution, year
%    Optional fields: type, number, address, month, note, key
%unpublished
%   A document having an author and title, but not formally published.
%    Required fields: author, title, note
%    Optional fields: month, year, key
"""
    with open(r"report_output/ref.bib", "w") as bib_file:
        bib_file.write(bib_content)
    print("BibTeX file 'ref.bib' has been created.")

In [29]:
def create_bibliography_latex():
    """
    Generates LaTeX code for a bibliography (references) section.
    
    Returns:
        str: LaTeX code for the bibliography.
    """
    latex = ""
    
    # Create an unnumbered section for References.
    latex += "\\section*{References}\n\n"
    
    # Begin an enumerated list of references.
    latex += "\\begin{enumerate}\n"
    
    # Reference 1
    latex += ("  \\item Zubov, A.S.; Ivshin, N.V.; Titarenko, A.Y.; Andrianov, B.V. (2019). Description of a new species of "
              "Chrysina Kirby, 1828 (Coleoptera: Scarabaeidae: Rutelinae) from the resplendens group, based on morphological characters "
              "and mtDNA COX I molecular marker. \\textit{Acta Biologica Sibirica}, 5(1), 71--76.\n")
    
    # Additional references can be added here.
    # latex += "  \\item Author B, et al. (Year). Title of the paper. Journal Name, Volume(Issue), Page Numbers.\n"
    
    latex += "\\end{enumerate}\n"
    
    # Optionally, add some vertical space after the bibliography.
    latex += "\n\\vspace{1cm}\n"
    
    return latex

# --- Example usage ---
if __name__ == "__main__":
    bib_code = create_bibliography_latex()
    with open("bibliography.tex", "w", encoding="utf-8") as f:
        f.write(bib_code)
    print("LaTeX bibliography generated as 'bibliography.tex'.")


LaTeX bibliography generated as 'bibliography.tex'.


In [30]:
def create_main_latex():
    latex_code = r"""
%\documentclass[aps,twocolumn,secnumarabic,nobalancelastpage,amsmath,amssymb,nofootinbib]{revtex4}
\documentclass[aps,secnumarabic,nobalancelastpage,amsmath,amssymb,nofootinbib]{revtex4}
\usepackage{gensymb} 
\usepackage{multirow}
\usepackage{graphics}      
\usepackage{graphicx}      
\usepackage{longtable}     
\usepackage{url}           
\usepackage{bm}            
\usepackage[utf8]{inputenc}
\usepackage{comment}
\usepackage{pdflscape}
\usepackage{rotating}

\usepackage[letterpaper,top= 2.75cm,bottom=3.5cm,left=1.8cm,right=1.8cm]{geometry}
\usepackage{ifsym}                        
\usepackage{amssymb}                      
\usepackage{amsmath}                      
\usepackage{amsthm}                       
\usepackage{color}                        
\usepackage{multienum}                    
\usepackage{tabularx}                     
\usepackage{booktabs}                     
\usepackage{fancyhdr}
\usepackage{pgf}
\usepackage{tikz}
\tikzstyle{guiones}+=[dashed]
\usetikzlibrary{patterns,arrows,snakes,shapes,automata,plotmarks,backgrounds}
\usepackage{lscape}
\usepackage{titlesec}
\usepackage{array,ragged2e}
\newcolumntype{P}[1]{>{\RaggedRight\arraybackslash}p{#1}}
\usepackage{float}
\usepackage{placeins}
\usepackage{pdfpages} % Required for including PDF files

\setlength{\columnsep}{7.5mm} 
\titleformat*{\section}{\normalsize\bfseries}
\titleformat*{\subsection}{\normalsize\bfseries}  

\def\bibsection{\section*{\refname}} 
\usepackage[pdfborder={0 0 0},colorlinks=false]{hyperref}
\usepackage{xurl} 
\usepackage{adjustbox}
\usepackage{titlesec}

% Cambiar tamaño de letra manteniendo la negrita
\titleformat{\section}
  {\normalfont\large\bfseries}  % Negrita y tamaño grande
  {\thesection}{1em}{}

\titleformat{\subsection}
  {\normalfont\large\bfseries}  % Negrita y tamaño mediano
  {\thesubsection}{1em}{}

\titleformat{\subsubsection}
  {\normalfont\fontsize{11}{14}\selectfont\bfseries\itshape}  % Negrita y tamaño normal
  {\thesubsubsection}{1em}{}

\usepackage{hyperref}

\newcommand{\displayboxplot}[1]{%
    \begin{figure}[H]
        \centering
        \includegraphics[trim=0cm 0cm 0cm 0.9cm, clip, width=0.9\linewidth]{images/boxplot/boxplot_#1.png}
        \vspace{-0.4cm}
        \caption{Boxplot and specimen distribution (superposed) for the metric $#1$, by species.}
    \end{figure}
}
%Para los ratios, el comando displayboxplot hará mal la caption (por los $$, pero me parecen necesarios), no es imposible de arreglar, pero tampoco hay tantos ratios.

\begin{document}

{\begin{flushleft}
\vskip-25pt 
{\includegraphics[width = 0.15\textwidth]{images/escudos/firma-promocional-con-texto-negro.png}}
\end{flushleft}}

\title{{\Large Biometry report}}
\author{Dra. Marcela Hernández}
\author{Dr. Esteban Bermúdez Ureña}
\author{Esteban Soto}
\author{Ángel Aguirre}
\email{marcela.hernandezjimenez@ucr.ac.cr}
\email{esteban.bermudezurena@ucr.ac.cr}
\email{esteban.sotomonge@ucr.ac.cr}
\email{angel.aguirre@ucr.ac.cr}

%Hay que mejorar como se ven los correos

\affiliation{Centro de Investigación en Ciencia e Ingeniería de los Materiales, Universidad de Costa Rica}
\date{\today} 

\input{Abstract}

\maketitle
\newpage
%\input{Introduction}

%\input{Methodology}

%\input{Analysis}

%\input{Claims}
%\newpage
\input{Tables}
%\input{Results}

\bibliographystyle{apsrev4-1}
\bibliographystyle{plain}
\bibliography{ref}

%\section{Anexos}
%\appendix

\end{document}
"""
    return latex_code



In [31]:
def create_abstract_latex():
    text = r""""\begin{abstract}
Zubov et al. (2019)\cite{zubov2019chrysina} describe a new species of Chrysina. In their comparative analysis and remarks, it is stated that the new species is very similar to C. resplendens, with only a few morphological differences being noted. This study aims to perform a quantitative analysis of these differences using a sample of 11 C. kalinini specimens and 23 C. resplendens specimens. The measurements described in the article are specified with greater precision, and alternative metrics are analyzed. Furthermore, the claims made in the mentioned paper are reviewed, assessing their validity and exploring new methods for differentiating the two beetle species with striking similarities.
\end{abstract}"""
    return text

In [32]:
import pandas as pd
import numpy as np

def set_significant_figures(df, columns, sig_figs):
    """
    Set a specified number of significant figures for certain columns in a pandas DataFrame
    and apply formatting to the display, ensuring no scientific notation is used.
    
    Args:
    - df (pd.DataFrame): The input DataFrame.
    - columns (list): List of column names to format.
    - sig_figs (int): The number of significant figures.
    
    Returns:
    - pd.DataFrame: The DataFrame with the modified values and formatted display.
    """
    # Function to round the values based on significant figures
    def round_sig(x, sig):
        if x == 0:
            return 0
        else:
            return round(x, sig - int(np.floor(np.log10(abs(x)))) - 1)

    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    # Apply rounding to the specified columns
    for column in columns:
        if column in df_copy.columns:
            df_copy[column] = df_copy[column].apply(lambda x: round_sig(x, sig_figs))

    # Apply the formatting for display purposes
    def format_cell(val):
        # Use fixed-point notation without unnecessary decimals
        if isinstance(val, (int, float)):
            # Remove unnecessary decimal places
            return f"{val:.{sig_figs}g}".rstrip('0').rstrip('.') if not val.is_integer() else f"{int(val)}"
        return val

    # Apply formatting to the specified columns
    df_style = df_copy.style.format({col: lambda x: format_cell(x) for col in columns})

    return df_style

# Example usage
data = {'A': [123.4567, 2345.6789, 34567.1234], 'B': [0.001234, 123.456, 789.01], 'C': [5.678, 34.567, 123.456]}
df = pd.DataFrame(data)

# Specify which columns to format and the number of significant figures
columns_to_format = ['A', 'B']
sig_figs = 4

# Get the modified DataFrame with styling
styled_df = set_significant_figures(df, columns_to_format, sig_figs)

# Display the DataFrame in Jupyter Notebook or other environments
styled_df



Unnamed: 0,A,B,C
0,123.5,0.001234,5.678
1,2346.0,123.5,34.567
2,34570.0,789.0,123.456


In [33]:
import os
import subprocess
from datetime import datetime

# =============================================================================

#   generate_latex_preamble()       -> returns the preamble string
#   front_page_latex()              -> returns LaTeX code for the front page
#   introduction_latex(df)          -> returns LaTeX code for the introduction section
#   methodology_latex(df)           -> returns LaTeX code for the methodology section
#   dataset_description_latex(df, group_by="species") -> returns LaTeX code for dataset description section
#   Analysis_latex(df, figure_counter=1)  -> returns LaTeX code for statistical analysis section
#   conclusion_latex(df)            -> returns LaTeX code for the conclusion section
#   create_bibliography_latex()     -> returns LaTeX code for the bibliography
#

# =============================================================================
def compile_latex(cwd, compile_cmd):
    try:
        result = subprocess.run(
            compile_cmd, cwd=cwd, check=True,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print("LaTeX compilation failed.")
        print("Return code:", e.returncode)
        print("Output:", e.stdout)
        print("Error output:", e.stderr)
        raise



def create_report(df = measurement_df_2, cached_tex_files = True,group_by = "species"):
    report_location = r"C:\Users\esteb\escarabajos\biometry\report_output"
    # Create the output folder if it doesn't exist
    if not os.path.exists(report_location):
        os.makedirs(report_location)

    if not cached_tex_files:
        # Prepare each section's LaTeX code
        preamble = generate_latex_preamble() #x
        front_page_section = front_page()
        introduction = introduction_section(df)  # Pass your dataframe as needed
        methodology = methodology_latex(df)    # Pass your dataframe as needed
        summary_table = result_summary(results_df)
        tables = create_tables_latex(df, results_df)
        main = create_main_latex()
        dataset_description =  dataset_description_latex(df, image_path = r"C:\Users\esteb\escarabajos\biometry\report_output\images", group_by="species")
        analysis = Analysis_latex(df, figure_counter=1, group_by = "species")  # Pass your df and figure_counter
        conclusion = conclusion_latex(df)      # Pass your dataframe as needed x
        bibliography = create_bibliography_latex()
        abstract = create_abstract_latex()
        create_bib_file()
    
        # Write each section to a separate .tex file (optional, for organization)
        sections = {
            "preamble.tex": preamble,
            "front_page.tex": front_page_section,
            "introduction.tex": introduction,
            "methodology.tex": methodology,
            "Analysis.tex": analysis,
            "Tables.tex": tables,
            "Results.tex": summary_table,
            "Claims.tex": conclusion,
            "bibliography.tex": bibliography,
            "main.tex": main,
            "Abstract.tex": abstract
        }
        
        for filename, content in sections.items():
            path = os.path.join(report_location, filename)
            with open(path, "w", encoding="utf-8") as f:
                f.write(content)
    


    # Now compile main.tex into a PDF using pdflatex.
    # Run pdflatex twice to ensure that references are updated.
    compile_cmd = ["pdflatex", "-interaction=nonstopmode", "main.tex"]
    cwd = os.path.abspath(report_location)
    compile_latex(cwd, compile_cmd)

    
    try:
        # First pass
        subprocess.run(compile_cmd, cwd=cwd, check=True)
        # Second pass
        subprocess.run(compile_cmd, cwd=cwd, check=True)
    except subprocess.CalledProcessError as e:
        print("Error during LaTeX compilation:", e)
        return

    
    # The resulting PDF will be at report_location/main.pdf
    pdf_path = os.path.join(cwd, "main.pdf")
    print(f"The report has been generated at: {pdf_path}")

if __name__ == "__main__":
    #filter cupreomarginata
    measurement_df_2 = measurement_df_2[(measurement_df_2["species"] == "kalinini") | (measurement_df_2["species"] == "resplendens")]
    print(measurement_df_2)
    
    create_report(measurement_df_2, cached_tex_files = False, group_by = "species")


             code       A1       A2       A3       A4       A5       B1  \
0   CICIMAUCR0001  4120.58  3511.41  4674.61  1430.02  1958.09  5525.11   
1   CICIMAUCR0002  4117.20  3529.93  4475.22  1462.50  1922.14  5352.33   
2   CICIMAUCR0003  3971.18  3584.00  4563.49  1402.85  1924.31  5360.70   
3   CICIMAUCR0004  3627.85  3538.61  4585.22  1475.45  2028.42  5328.44   
4   CICIMAUCR0006      NaN      NaN      NaN      NaN      NaN  5326.82   
5   CICIMAUCR0008      NaN      NaN      NaN      NaN      NaN  5471.34   
6   CICIMAUCR0009      NaN      NaN      NaN      NaN      NaN  5989.42   
7   CICIMAUCR0097  4006.44  3685.78  4546.28  2076.73  1707.97  5380.23   
8   CICIMAUCR0105  4172.69  3806.58  4804.80  2101.69  1587.15  5606.02   
9   CICIMAUCR0113      NaN      NaN      NaN      NaN      NaN  5623.08   
10  CICIMAUCR0116      NaN      NaN      NaN      NaN      NaN  5434.79   
15  CICIMAUCR0210  3908.96  3849.10  4976.72  2090.46  1635.22  5832.83   
16  CICIMAUCR0212  4437.5

KeyError: 'kalinini'