In [2]:
import pandas as pd
import os
import pathlib
import re

In [3]:
# Locations of the two input datasets; both are loaded with pd.read_parquet in the next cell.
# NOTE(review): these are absolute, machine-specific paths — consider deriving them from a
# single configurable data directory so the notebook can run elsewhere.
path_outsiders = "/export/data_ml4ds/NextProcurement/Junio_2025/pliegosPlace/red_data_outsiders_2024_conTitleCPVLink_chunks"
path_insiders = "/export/data_ml4ds/NextProcurement/Junio_2025/pliegosPlace/red_data_insiders_2024_conTitleCPVLink_chunks"

In [4]:
def _load_with_prefixed_id(parquet_path, prefix):
    """Read a parquet dataset and prepend `prefix` to its stringified `id` column."""
    frame = pd.read_parquet(parquet_path)
    return frame.assign(id=prefix + frame["id"].astype(str))


# The group letter ("I" / "O") keeps ids unique once both groups share one frame.
df_insiders = _load_with_prefixed_id(path_insiders, "I")
df_outsiders = _load_with_prefixed_id(path_outsiders, "O")

# Stack both groups and renumber the index from 0.
df_in_out = pd.concat([df_insiders, df_outsiders], ignore_index=True)

In [5]:
# Show which columns the combined insiders + outsiders frame exposes.
df_in_out.columns

Index(['place_id', 'link',
       'ContractFolderStatus.ProcurementProject.RequiredCommodityClassification.ItemClassificationCode',
       'title', 'url', 'id', 'resultado_tecnico', 'path_tecnico',
       'resultado_administrativo', 'path_administrativo', 'texto_tecnico',
       'texto_administrativo'],
      dtype='object')

In [6]:
# Sanity check: after prefixing, every id must be unique and no row may have been lost.
n_insiders = len(df_insiders)
n_outsiders = len(df_outsiders)
n_distinct_ids = df_in_out["id"].nunique(dropna=False)

print(df_in_out["id"].duplicated().sum())
assert len(df_in_out) == n_distinct_ids == n_insiders + n_outsiders
print(n_outsiders)
print(n_insiders)

0
35340
114600


# Objective extractor experiments

We draw a random sample of up to 500 insiders and up to 500 outsiders, keeping only rows that have usable technical text and a non-empty title.

In [11]:
# Placeholder found in `texto_tecnico` instead of real text; the message reads
# "PDF without extractable text (possibly scanned)".
ERR = "[ERROR: PDF sin texto extraíble (posiblemente escaneado)]"
N_PER_GROUP = 500  # maximum number of rows drawn from each group
SAMPLE_SEED = 42   # fixed seed so the draw is reproducible

text_col = df_in_out["texto_tecnico"].astype("string").fillna("")
id_str = df_in_out["id"].astype("string").fillna("")
title_col = df_in_out["title"].astype("string").fillna("")

# Group membership is encoded in the first letter of the id.
mask_I = id_str.str.startswith("I")
mask_O = id_str.str.startswith("O")

# True where the text is something other than the extraction-error placeholder.
mask_ok_text = ~text_col.str.strip().eq(ERR)

# True where the text contains at least one character that is neither a control
# character nor whitespace. The pattern uses `*` (not `+`) so that the empty string
# (missing text after fillna) is rejected as well; with `+` empty rows slipped through
# while whitespace-only rows were dropped.
mask_has_text = ~text_col.str.fullmatch(r'[\x00-\x1F\x7F\s]*', na=False)
# Previous name of this mask. It was misleading (the mask is True for rows that are
# NOT all control characters); kept only in case a later cell still refers to it.
mask_all_control = mask_has_text

# True where the title is neither missing nor blank.
mask_title_ok = ~title_col.str.strip().eq("")

mask_usable = mask_has_text & mask_ok_text & mask_title_ok
df_I = df_in_out.loc[mask_I & mask_usable]
df_O = df_in_out.loc[mask_O & mask_usable]


def _sample_up_to(frame, n=N_PER_GROUP, seed=SAMPLE_SEED):
    """Return `n` randomly drawn rows of `frame`, or all of them if it has fewer."""
    if len(frame) == 0:
        return frame.head(0)
    return frame.sample(min(n, len(frame)), random_state=seed)


df_sample = pd.concat([_sample_up_to(df_I), _sample_up_to(df_O)], ignore_index=True)

path_save = "/export/data_ml4ds/NextProcurement/pruebas_oct_2025/objective_extractor/data/insiders_outsiders_500_500.parquet"

# Make sure the destination folder exists before writing the sample.
os.makedirs(pathlib.Path(path_save).parent, exist_ok=True)
df_sample.to_parquet(path_save, index=False)

In [18]:
# Peek at the titles of the sampled rows.
df_sample["title"]


0      Elaboración y redacción del Plan de Movilidad ...
1      Contratación del suministro de flores y planta...
2      Contratación del servicio de Análisis Clínicos...
3      Objeto del contrato: La finalidad del presente...
4      CEMILVET - Suministro de instalación de 2 auto...
                             ...                        
995    MRU Serveis de redacció de l'estudi de seguret...
996          Servei de menjador de l'Escola de Rellinars
997    Suministros y servicios para la protección y c...
998    l’adjudicació de la prestació de suport al ser...
999    Acord Marc del subministrament de Pròtesis de ...
Name: title, Length: 1000, dtype: string

# Divide by CPV and get CPV5 and CPV8

In [None]:
import numpy as np
import ast

def safe_parse_possible_array_string(item):
    """Recover a list from a one-element array that wraps a stringified list.

    Handles values such as ``array(['[50300000, 50330000]'])``.

    Returns
    -------
    list or None
        ``[]`` when the single element is the string ``"nan"`` (any case,
        surrounding whitespace ignored); the parsed list when the element is a
        literal list; ``None`` for every other input.
    """
    if not isinstance(item, np.ndarray) or len(item) != 1:
        return None

    payload = item[0]
    if isinstance(payload, str) and payload.strip().lower() == "nan":
        return []  # a textual "nan" means "no codes"

    try:
        value = ast.literal_eval(payload)
    except (ValueError, SyntaxError):
        return None
    return value if isinstance(value, list) else None

def extract_cpv_depth(code):
    """Return the depth of a single CPV code, or ``None`` if it is not numeric.

    The depth is the number of digits left after removing the trailing zeros of
    the 8-digit code, e.g. ``50300000 -> 3`` and ``50330000 -> 4``.

    Parameters
    ----------
    code : int, float or str
        CPV code in any form accepted by ``float()``.

    Returns
    -------
    int or None
        The depth, or ``None`` when ``code`` cannot be converted to an integer
        (non-numeric text, ``None``, NaN, infinity, ...).
    """
    cpv_digits = 8  # CPV codes have eight digits (check digit excluded)
    try:
        code_int = int(float(code))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")); ValueError also covers NaN.
        return None
    # Converting to int drops leading zeros ("03100000" -> 3100000), which made
    # codes of divisions 03 and 09 look one level shallower. Pad back to 8 digits.
    code_str = str(code_int).zfill(cpv_digits)
    # NOTE(review): a division that itself ends in 0 (e.g. 30000000) still yields
    # depth 1 — confirm whether a minimum depth of 2 is wanted for such codes.
    return len(code_str.rstrip('0'))

def _is_missing_cpv(value):
    """True when `value` is a scalar that pandas treats as missing (None, NaN, NA)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _cpv_cell_to_codes(item):
    """Turn one cell of the CPV column into a list of candidate codes ([] if missing)."""
    if isinstance(item, np.ndarray):
        if item.size == 1:
            # Possibly a stringified list such as array(['[50300000, 50330000]']).
            recovered = safe_parse_possible_array_string(item.reshape(1))
            if recovered is not None:
                return recovered
        item = item.ravel().tolist()
    elif isinstance(item, tuple):
        item = list(item)

    if isinstance(item, list):
        # Missing entries inside a sequence carry no code; ignore them.
        return [code for code in item if not _is_missing_cpv(code)]
    return [] if _is_missing_cpv(item) else [item]


def analyze_cpv_depths(df, column='cpv'):
    """Tally the depth of every CPV code stored in ``df[column]``.

    A cell may hold a list, tuple or numpy array of codes, a single scalar code,
    a one-element array wrapping a stringified list, or a missing value. Arrays
    are converted to lists before any missing-value test, because ``pd.isna`` on
    an array returns an array whose truth value is ambiguous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the CPV column.
    column : str, default 'cpv'
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(depth_counts, format_issues, nan_count, total)`` where
        ``depth_counts`` is a Series mapping depth -> number of codes (sorted by
        depth), ``format_issues`` lists the codes whose depth could not be
        computed, ``nan_count`` is the number of cells with no code at all, and
        ``total`` is the number of codes examined (valid or not).

    Side effects
    ------------
    Prints a four-line summary of the counts.
    """
    depths = []
    format_issues = []
    nan_count = 0
    total = 0

    for item in df[column]:
        codes = _cpv_cell_to_codes(item)
        if not codes:
            nan_count += 1
            continue
        for code in codes:
            total += 1
            depth = extract_cpv_depth(code)
            if depth is not None:
                depths.append(depth)
            else:
                format_issues.append(code)

    print(f"Total CPV codes processed: {total}")
    print(f"Format issues: {len(format_issues)}")
    print(f"NaNs or empty lists: {nan_count}")
    print(f"Valid CPV codes with depth: {len(depths)}")

    # Explicit dtype keeps the result integer-typed even when no depth was found.
    depth_counts = pd.Series(depths, dtype="int64").value_counts().sort_index()
    return depth_counts, format_issues, nan_count, total

# The original call used `df` and the default column "cpv"; neither exists in this
# notebook, so the cell failed with a NameError on a fresh kernel.
# NOTE(review): the combined frame and its CPV classification column are assumed to be
# the intended inputs — switch to `df_sample` if the statistics should describe the sample.
CPV_COLUMN = "ContractFolderStatus.ProcurementProject.RequiredCommodityClassification.ItemClassificationCode"
depth_counts, bad_cpvs, nan_count, total_cpvs = analyze_cpv_depths(df_in_out, column=CPV_COLUMN)

print(depth_counts)
print("\nExamples of format issues:")
print(bad_cpvs[:10])  # first 10 malformed entries

def format_latex_count_and_percentage_with_nans(depth_counts, nan_count, total_rows):
    """Build a two-column table of LaTeX-ready "count (percentage)" strings.

    Parameters
    ----------
    depth_counts : pandas.Series
        Number of codes per depth, indexed by depth.
    nan_count : int
        Count reported under the extra "NaN / empty" row.
    total_rows : int
        Denominator used for every percentage.

    Returns
    -------
    pandas.DataFrame
        Columns "CPV Code Depth" and "Count (Percentage)", one row per depth
        followed by the "NaN / empty" row.
    """
    counts_with_missing = depth_counts.copy()
    counts_with_missing["NaN / empty"] = nan_count  # missing values get their own row

    def _latex_cell(n):
        # Thousands separators become LaTeX thin spaces (\,).
        grouped = f"{n:,}".replace(",", r"\,")
        share = 100 * n / total_rows
        return f"\\({grouped}\\) ({share:.2f}\\%)"

    rows = [(str(label), _latex_cell(n)) for label, n in counts_with_missing.items()]
    return pd.DataFrame(rows, columns=["CPV Code Depth", "Count (Percentage)"])

# NOTE(review): percentages use the number of processed codes as denominator while the
# table also carries the NaN / empty row, so the column can add up to more than 100% —
# confirm this is the intended presentation.
depth_table = format_latex_count_and_percentage_with_nans(depth_counts, nan_count, total_cpvs)

print(depth_table.to_latex(index=False, escape=False))

# Sanity check: every row contributes at least one processed code or one NaN/empty entry.
# Malformed codes are part of `total_cpvs`; counting only valid depths could trip the
# assertion for rows whose codes are all malformed.
# `df` was never defined in this notebook; the combined frame is assumed to be the one
# analysed (NOTE(review): keep in sync with the frame passed to analyze_cpv_depths).
total_count = total_cpvs + nan_count
print(f"Total count of CPV codes (including NaNs): {total_count} (should match total rows in DataFrame: {len(df_in_out)})")
assert total_count >= len(df_in_out), "Total count of CPV codes (including NaNs) should be greater than or equal to total rows in DataFrame."
