In [158]:
import pandas as pd
import json
from google.cloud import storage
from google.cloud import bigquery
# Shared BigQuery client used by every SQL cell below. No arguments are passed,
# NOTE(review): so project and credentials presumably come from the environment - confirm.
client = bigquery.Client()

# Load data

## Excel extracts

In [228]:
# MS-side Excel extracts, one workbook per source (CAS, COSMETOCHEM, ECHA, LIMS)
# plus the LIMS tests workbook.
# NOTE(review): the 29JUL / 30JUL suffixes look like extract dates - confirm they
# are the intended snapshots before comparing against a fresh DWH query.
pcp_sample_cas_ms = pd.read_excel("Data_msr_v2/samples-RMPCP-CAS.xlsx")
pcp_sample_cosmetochem_ms = pd.read_excel("Data_msr_v2/samples-RMPCP-COSMETOCHEM.xlsx")
pcp_sample_echa_ms = pd.read_excel("Data_msr_v2/samples-RMPCP-ECHA.xlsx")
pcp_sample_lims_ms = pd.read_excel("Data_msr_v2/samples-RMPCP-LIMS-29JUL.xlsx")
pcp_test_lims_ms = pd.read_excel("Data_msr_v2/Tests-RMPCP-LIMS-30JUL.xlsx")

In [229]:
# Cast the `id` and `external_id` columns of every MS extract to str
# (presumably so they match the DWH keys in the isin() comparisons further down).
# NOTE(review): astype(str) turns a missing value into the literal "nan", so these
# two columns can never show up as NaN in the gap analysis - confirm none are missing.
for ms_extract in (
    pcp_sample_cas_ms,
    pcp_sample_echa_ms,
    pcp_sample_lims_ms,
    pcp_sample_cosmetochem_ms,
    pcp_test_lims_ms,
):
    for key_column in ("id", "external_id"):
        ms_extract[key_column] = ms_extract[key_column].astype(str)

## SQL

In [161]:
# Run the query stored in get_sample_cas.sql on BigQuery and load the result.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_cas.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_cas_dwh = query_job.to_dataframe().drop_duplicates()

In [162]:
# Run the query stored in get_sample_cas_sc_null_allowed.sql on BigQuery and load the result.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_cas_sc_null_allowed.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_cas_dwh_sc_null_allowed = query_job.to_dataframe().drop_duplicates()

In [163]:
# Run the query stored in get_sample_cas_sc_null_allowed_aa_cas_allowed.sql on BigQuery
# and load the result. (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_cas_sc_null_allowed_aa_cas_allowed.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_cas_dwh_sc_null_allowed_aa_cas_allowed = query_job.to_dataframe().drop_duplicates()

In [164]:
# Run the query stored in get_sample_echa.sql on BigQuery and load the result.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_echa.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_echa_dwh = query_job.to_dataframe().drop_duplicates()

In [165]:
# Run the query stored in get_sample_aa.sql on BigQuery and load the result
# as the DWH counterpart of the LIMS extract.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_aa.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_lims_dwh = query_job.to_dataframe().drop_duplicates()

In [166]:
# Run the query stored in get_sample_cosmetochem.sql on BigQuery and load the result.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_cosmetochem.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_cosmetochem_dwh = query_job.to_dataframe().drop_duplicates()

In [167]:
# Run the query stored in get_sample_aa_no_cas.sql on BigQuery and load the result.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_aa_no_cas.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_lims_dwh_no_cas = query_job.to_dataframe().drop_duplicates()

In [168]:
# Run the query stored in get_sample_aa_no_cas_sc_null_allowed.sql on BigQuery
# and load the result. (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_sample_aa_no_cas_sc_null_allowed.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_sample_lims_dwh_no_cas_sc_null_allowed = query_job.to_dataframe().drop_duplicates()

In [235]:
# Run the query stored in get_test_aa.sql on BigQuery and load the result
# as the DWH counterpart of the LIMS tests extract.
# (Plain string literal: the former f-string had no placeholder.)
with open("SQL/msr_v2/get_test_aa.sql", "r") as sql_file:
    sql_template = sql_file.read()

query_job = client.query(sql_template)
# drop_duplicates() keeps a single copy of fully identical result rows.
pcp_test_lims_dwh = query_job.to_dataframe().drop_duplicates()

## GA function

In [169]:
def gap_analysis_dataframes(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    attributes=None,
    nan_diff_threshold: float = 0.01,
    max_listed_values: int = 0,
) -> None:
    """
    Print a gap analysis between two pandas DataFrames.

    The report contains:
    - Number of lines in both DataFrames.
    - Percentage of NaN (missing) values by attribute for each DataFrame.
    - For attributes with 0% NaN values in both DataFrames, a comparison of their
      unique entries:
      - The count of common unique entries.
      - The count of unique entries present only in DataFrame 1 (ms).
      - The count of unique entries present only in DataFrame 2 (dwh).

    All values are converted to string type before comparison to prevent type
    mismatch issues. The comparison is therefore sensitive to formatting: two
    columns holding the same instant rendered differently (for example an ISO
    string on one side and a datetime on the other) share no common entry.

    Args:
        df1 (pd.DataFrame): The first pandas DataFrame (e.g., ms).
        df2 (pd.DataFrame): The second pandas DataFrame (e.g., dwh).
        attributes (iterable of str, optional): Column names to analyse.
            Defaults to the sample attributes (id, sample_type, external_id, ...).
            A column missing from either DataFrame is reported and skipped
            instead of raising KeyError.
        nan_diff_threshold (float): Absolute difference, in percentage points,
            above which the NaN rates of the two DataFrames are flagged.
        max_listed_values (int): When greater than 0, the values found on one
            side only are listed as long as there are at most this many.
            The default (0) prints counts only.

    Returns:
        None. The report is written to standard output.
    """
    if attributes is None:
        attributes = [
            'id', 'sample_type', 'external_id', 'sample_code', 'fabrication_code',
            'batch_code_number', 'batch_code_supplier', 'sample_ec_code',
            'sample_smiles_code', 'created_at', 'updated_at'
        ]

    print("--- Starting Gap Analysis between DataFrames ---")
    print(f"\nDataFrame 1 (ms) has {len(df1)} lines.")
    print(f"DataFrame 2 (dwh) has {len(df2)} lines.")

    # --- NaN Value Analysis ---
    print("\n--- NaN Value Analysis by Attribute ---")
    nan_info = {}  # NaN percentages per attribute, reused by the value comparison
    for attr in attributes:
        print(f"\nAttribute: '{attr}'")

        # A column missing on one side is itself a gap: report it and move on.
        absent_from = [
            label
            for label, frame in (("DataFrame 1 (ms)", df1), ("DataFrame 2 (dwh)", df2))
            if attr not in frame.columns
        ]
        if absent_from:
            print(f"  * Skipped: column not found in {' and '.join(absent_from)}.")
            continue

        # An empty DataFrame counts as 0% NaN (avoids a division by zero).
        nan_percent_df1 = (df1[attr].isnull().sum() / len(df1)) * 100 if len(df1) > 0 else 0
        nan_percent_df2 = (df2[attr].isnull().sum() / len(df2)) * 100 if len(df2) > 0 else 0
        nan_info[attr] = {'df1': nan_percent_df1, 'df2': nan_percent_df2}

        print(f"  - DataFrame 1 (ms): {nan_percent_df1:.2f}% NaN values")
        print(f"  - DataFrame 2 (dwh): {nan_percent_df2:.2f}% NaN values")

        if abs(nan_percent_df1 - nan_percent_df2) > nan_diff_threshold:
            print(f"  * Note: There is a significant difference in NaN percentages for '{attr}'.")

    # --- Value Comparison for Complete Attributes ---
    print("\n--- Value Comparison for Attributes with 0% NaN ---")
    attributes_compared_for_values = False
    for attr, percents in nan_info.items():
        # Only compare values if the attribute has 0% NaN in *both* DataFrames
        if percents['df1'] != 0 or percents['df2'] != 0:
            continue

        attributes_compared_for_values = True
        print(f"\nAttribute: '{attr}' (0% NaN in both DataFrames)")

        # Compare as strings so that e.g. int 42 and "42" count as the same entry
        set1 = set(df1[attr].astype(str).tolist())
        set2 = set(df2[attr].astype(str).tolist())

        common_unique_values = set1 & set2
        unique_only_in_df1 = set1 - set2
        unique_only_in_df2 = set2 - set1

        # Labels are generic: the function is called for every source, not only CAS.
        print(f"  - Number of common unique entries: {len(common_unique_values)}")
        print(f"  - Number of unique entries only in DataFrame 1 (ms): {len(unique_only_in_df1)}")
        print(f"  - Number of unique entries only in DataFrame 2 (dwh): {len(unique_only_in_df2)}")

        if 0 < len(unique_only_in_df1) <= max_listed_values:
            print(f"    Values only in DataFrame 1 (ms): {sorted(unique_only_in_df1)}")
        if 0 < len(unique_only_in_df2) <= max_listed_values:
            print(f"    Values only in DataFrame 2 (dwh): {sorted(unique_only_in_df2)}")

    if not attributes_compared_for_values:
        print("  No attributes were found with 0% NaN values in both DataFrames to compare individual entries.")

    print("\n--- Gap Analysis Complete ---")

# SAMPLE PCP

## Source CAS

In [170]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_cas_ms.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
17025,34494,CAS,6369591,6369-59-1,,,,,,2025-04-28T16:33:03.656Z,2025-04-28T16:33:03.656Z
10175,41344,CAS,92704079,92704-07-9,,,,,,2025-06-13T16:48:35.716Z,2025-06-13T16:48:35.716Z


In [171]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_cas_dwh.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
2876,48085,CAS,3913711,3913-71-1,,,,,,2025-06-20 21:19:18.446666+00:00,2025-06-20 21:19:18.446666+00:00
1308,47990,CAS,10043660,10043-66-0,,,,,,2025-06-20 20:48:22.850000+00:00,2025-06-20 20:48:22.850000+00:00


In [172]:
# Gap analysis: MS CAS extract vs. the DWH result of get_sample_cas.sql.
gap_analysis_dataframes(pcp_sample_cas_ms, pcp_sample_cas_dwh)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 17391 lines.
DataFrame 2 (dwh) has 12558 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'sample_ec_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame

In [173]:
# Entries in MS not in DWH (matched on external_id)
cas_ms_only = pcp_sample_cas_ms[
    ~pcp_sample_cas_ms["external_id"].isin(pcp_sample_cas_dwh["external_id"])
]
# Cap the sample size: asking for more rows than exist raises ValueError.
# The fixed seed keeps the displayed rows stable across re-runs.
cas_ms_only.sample(n=min(5, len(cas_ms_only)), random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
9508,42011,CAS,92875097,92875-09-7,,,,,,2025-06-13T20:08:24.026Z,2025-06-13T20:08:24.026Z
10231,41288,CAS,90082829,90082-82-9,,,,,,2025-06-13T09:48:27.930Z,2025-06-13T09:48:27.930Z
12985,38534,CAS,101794745,101794-74-5,,,,,,2025-06-11T16:55:42.003Z,2025-06-11T16:55:42.003Z
16344,35175,CAS,101794905,101794-90-5,,,,,,2025-04-30T15:35:24.623Z,2025-04-30T15:35:24.623Z
11191,40328,CAS,229327939,229327-93-9,,,,,,2025-06-13T01:42:13.890Z,2025-06-13T01:42:13.890Z


## Source CAS (source_code null inclus)

In [174]:
# Gap analysis: MS CAS extract vs. the DWH result of get_sample_cas_sc_null_allowed.sql.
gap_analysis_dataframes(pcp_sample_cas_ms, pcp_sample_cas_dwh_sc_null_allowed)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 17391 lines.
DataFrame 2 (dwh) has 17396 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'sample_ec_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame

In [175]:
# Entries in MS not in DWH (matched on external_id)
cas_ms_only_sc_null = pcp_sample_cas_ms[
    ~pcp_sample_cas_ms["external_id"].isin(pcp_sample_cas_dwh_sc_null_allowed["external_id"])
]
# Test emptiness explicitly. The former bare `except:` printed this message
# whenever fewer than 5 rows were missing (sample(n=5) raised), hiding 1-4 real
# gaps, and it also swallowed any unrelated error.
if cas_ms_only_sc_null.empty:
    print("MS contenu dans DWH")
# Last expression of the cell so the rows are actually displayed (an expression
# inside a try block is not). Size is capped so short results do not raise.
cas_ms_only_sc_null.sample(n=min(5, len(cas_ms_only_sc_null)), random_state=42)

MS contenu dans DWH


In [176]:
# Entries in DWH not in MS (matched on external_id)
cas_dwh_sc_null_seen_in_ms = pcp_sample_cas_dwh_sc_null_allowed["external_id"].isin(
    pcp_sample_cas_ms["external_id"]
)
pcp_sample_cas_dwh_sc_null_allowed.loc[~cas_dwh_sc_null_seen_in_ms].head(n=11)


Unnamed: 0,id,source_code,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
0,51524,,CAS,1174327617,1174327-61-7,,,,,,2025-07-28 08:12:18.916666+00:00,2025-07-28 08:12:18.916666+00:00
9,51523,,CAS,68958565,68958-56-5,,,,,,2025-07-28 08:04:50.293333+00:00,2025-07-28 08:04:50.293333+00:00
4844,51522,SOURCE_CAS,CAS,183158705,183158-70-5,,,,,,2025-07-25 11:19:56.136666+00:00,2025-07-25 11:19:56.136666+00:00
4849,51521,SOURCE_CAS,CAS,32539836,32539-83-6,,,,,,2025-07-23 14:22:55.670000+00:00,2025-07-23 14:22:55.670000+00:00
4870,51520,SOURCE_CAS,CAS,87785,87-78-5,,,,,,2025-07-23 08:24:26.076666+00:00,2025-07-23 08:24:26.076666+00:00


## Source CAS (source_code null inclus, AA-CAS inclus)

In [177]:
# Gap analysis: MS CAS extract vs. the DWH result of
# get_sample_cas_sc_null_allowed_aa_cas_allowed.sql.
gap_analysis_dataframes(pcp_sample_cas_ms, pcp_sample_cas_dwh_sc_null_allowed_aa_cas_allowed)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 17391 lines.
DataFrame 2 (dwh) has 17396 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'sample_ec_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame

In [178]:
# Entries in MS not in DWH (matched on external_id)
cas_ms_seen_in_dwh_aa = pcp_sample_cas_ms["external_id"].isin(
    pcp_sample_cas_dwh_sc_null_allowed_aa_cas_allowed["external_id"]
)
pcp_sample_cas_ms.loc[~cas_ms_seen_in_dwh_aa].head(n=7)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at


In [179]:
# Entries in DWH not in MS (matched on external_id)
cas_dwh_aa_seen_in_ms = pcp_sample_cas_dwh_sc_null_allowed_aa_cas_allowed["external_id"].isin(
    pcp_sample_cas_ms["external_id"]
)
pcp_sample_cas_dwh_sc_null_allowed_aa_cas_allowed.loc[~cas_dwh_aa_seen_in_ms].head(n=7)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
1,51524,CAS,1174327617,1174327-61-7,,,,,,2025-07-28 08:12:18.916666+00:00,2025-07-28 08:12:18.916666+00:00
11,51522,CAS,183158705,183158-70-5,,,,,,2025-07-25 11:19:56.136666+00:00,2025-07-25 11:19:56.136666+00:00
19,51521,CAS,32539836,32539-83-6,,,,,,2025-07-23 14:22:55.670000+00:00,2025-07-23 14:22:55.670000+00:00
39,51523,CAS,68958565,68958-56-5,,,,,,2025-07-28 08:04:50.293333+00:00,2025-07-28 08:04:50.293333+00:00
45,51520,CAS,87785,87-78-5,,,,,,2025-07-23 08:24:26.076666+00:00,2025-07-23 08:24:26.076666+00:00


## Source ECHA


In [180]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_echa_ms.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
1154,10940,CAS,cef821f3-df6e-4a31-952c-26f0aa257517,99-96-7,,,,202-804-9,OC(=O)c1ccc(O)cc1,2025-06-20T15:51:52.730Z,2025-06-20T15:51:52.730Z
4524,7570,CAS,20feded8-d30d-43c3-a5d2-bae5ff14b5bc,7631-90-5,,,,231-548-0,[Na+].OS(=O)[O-],2025-04-30T18:48:59.913Z,2025-04-30T18:48:59.913Z


In [181]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_echa_dwh.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
3859,8518,CAS,59a74557-2396-4831-a256-c1a1a563c06f,68921-45-9,,,,272-940-1,Variable; substance is a UVCB,2025-06-13 16:46:30.546666+00:00,2025-06-13 16:46:30.546666+00:00
232,8164,CAS,39da9a90-1dde-4466-976a-a0b03474ebd2,8050-26-8,,,,232-479-9,Not Available ( UVCB substance),2025-06-12 10:34:35.416666+00:00,2025-06-12 10:34:35.416666+00:00


In [182]:
# Gap analysis: MS ECHA extract vs. the DWH result of get_sample_echa.sql.
gap_analysis_dataframes(pcp_sample_echa_ms, pcp_sample_echa_dwh)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 5152 lines.
DataFrame 2 (dwh) has 5153 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'sample_ec_code'
  - DataFrame 1 (ms): 5.69% NaN values
  - DataFrame 2 (

In [183]:
# Entries in MS not in DWH (matched on external_id)
echa_ms_seen_in_dwh = pcp_sample_echa_ms["external_id"].isin(pcp_sample_echa_dwh["external_id"])
pcp_sample_echa_ms.loc[~echa_ms_seen_in_dwh].head(65)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at


In [184]:
# Entries in DWH not in MS (matched on external_id)
echa_dwh_seen_in_ms = pcp_sample_echa_dwh["external_id"].isin(pcp_sample_echa_ms["external_id"])
pcp_sample_echa_dwh.loc[~echa_dwh_seen_in_ms].head(1)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
2,12095,CAS,dd612f10-a5b8-4b78-a087-46eb65dabd48,32539-83-6,,,,251-090-5,C1(OCCC2)=C2CCCCCCCCCC1,2025-07-23 14:23:14.883333+00:00,2025-07-23 14:23:14.883333+00:00


## Source LIMS

In [185]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_lims_ms.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
6596,25113,Refcomm,588922,C61572804,,R0077677A 001 L 001,,,,2025-07-03T16:44:21.443Z,2025-07-03T16:44:21.443Z
4427,27283,Refcomm,642957,C20177649,,R0038146A 000 L 004,,,,2025-07-04T08:16:49.910Z,2025-07-04T08:16:49.910Z


In [186]:
# Fixed seed so the two previewed rows are the same after Restart & Run All.
pcp_sample_lims_dwh.sample(n=2, random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
144,27772,Refcomm,664011,43471634,DGA2020418,DGA2020418-,,,,2025-07-04 08:59:43.133333+00:00,2025-07-04 08:59:43.133333+00:00
4557,25159,Refcomm,591519,C24431487,,P0705059,,,,2025-07-03 16:45:54.153333+00:00,2025-07-03 16:45:54.153333+00:00


In [187]:
# Gap analysis: MS LIMS extract vs. the DWH result of get_sample_aa.sql.
gap_analysis_dataframes(pcp_sample_lims_ms, pcp_sample_lims_dwh)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 9025 lines.
DataFrame 2 (dwh) has 8106 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 39.53% NaN values
  - DataFrame 2 (dwh): 35.36% NaN values
  * Note: There is a significant difference in NaN percentages for 'fabrication_code'.

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 0.06% NaN values
  - DataFrame 2 (dwh): 0.06% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 42.56% NaN values
  - DataFrame 2 (dwh): 38.80% NaN values
  * No

In [188]:
# Entries in DWH not in MS (matched on id, not external_id)
lims_dwh_only = pcp_sample_lims_dwh[
    ~pcp_sample_lims_dwh["id"].isin(pcp_sample_lims_ms["id"])
]
# Cap the sample size: asking for more rows than exist raises ValueError.
# The fixed seed keeps the displayed rows stable across re-runs.
lims_dwh_only.sample(n=min(5, len(lims_dwh_only)), random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
304,22741,Refcomm,875027,46577119,B0000205997,B0000205997-777033,777033,,,2025-07-03 13:44:29.560000+00:00,2025-07-07 12:45:14.856666+00:00
2039,28700,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04 14:52:36.863333+00:00,2025-07-04 14:52:37.830000+00:00
3915,22715,Refcomm,865228,C23704656,DGA2210221,DGA2210221-0102459845,0102459845,,,2025-07-03 13:42:24.836666+00:00,2025-07-04 08:17:58.160000+00:00
5465,31499,Refcomm,801073,C37994190,DGA2220322,DGA2220322-H052K9U153,H052K9U153,,,2025-07-07 12:33:40.800000+00:00,2025-07-17 12:24:13+00:00
3219,22745,Refcomm,874630,C22017913,DGA2350113,DGA2350113-ESD0904215,ESD0904215,,,2025-07-03 13:44:42.396666+00:00,2025-07-04 14:53:29.796666+00:00


In [189]:
# Breakdown by sample_type of the DWH rows whose id is unknown to MS.
lims_dwh_id_seen_in_ms = pcp_sample_lims_dwh["id"].isin(pcp_sample_lims_ms["id"])
pcp_sample_lims_dwh.loc[~lims_dwh_id_seen_in_ms, "sample_type"].value_counts()

sample_type
Refcomm    11
Name: count, dtype: int64

In [190]:
# Entries in MS not in DWH (matched on external_id)
lims_ms_seen_in_dwh = pcp_sample_lims_ms["external_id"].isin(pcp_sample_lims_dwh["external_id"])
pcp_sample_lims_ms.loc[~lims_ms_seen_in_dwh].head()

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
122,31591,Refcomm,817844,C20042883,DGA23D0069,DGA23D0069-DGK2350178,DGK2350178,,,2025-07-07T12:43:58.106Z,2025-07-07T12:43:58.106Z
127,31586,Refcomm,817846,49086564,DGA2350354,DGA2350354-2304B17102,2304B17102,,,2025-07-07T12:43:40.880Z,2025-07-07T12:43:40.880Z
128,31585,Refcomm,817847,64102891,B0000192139,B0000192139-22F3913100,22F3913100,,,2025-07-07T12:43:39.050Z,2025-07-07T12:43:39.050Z
129,31584,Refcomm,817845,53013361,DGA2290398,DGA2290398-7020206,7020206,,,2025-07-07T12:43:37.190Z,2025-07-07T12:43:37.190Z
134,31579,Refcomm,818640,C23419812,DGA23D0067,DGA23D0067-DGK1810413,DGK1810413,,,2025-07-07T12:43:25.683Z,2025-07-07T12:43:25.683Z


## Source LIMS (sample_type CAS exclus)

In [191]:
# Gap analysis: MS LIMS extract vs. the DWH result of get_sample_aa_no_cas.sql.
gap_analysis_dataframes(pcp_sample_lims_ms, pcp_sample_lims_dwh_no_cas)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 9025 lines.
DataFrame 2 (dwh) has 8106 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 39.53% NaN values
  - DataFrame 2 (dwh): 35.36% NaN values
  * Note: There is a significant difference in NaN percentages for 'fabrication_code'.

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 0.06% NaN values
  - DataFrame 2 (dwh): 0.06% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 42.56% NaN values
  - DataFrame 2 (dwh): 38.80% NaN values
  * No

In [192]:
# Entries in DWH not in MS (matched on id, not external_id)
lims_dwh_no_cas_only = pcp_sample_lims_dwh_no_cas[
    ~pcp_sample_lims_dwh_no_cas["id"].isin(pcp_sample_lims_ms["id"])
]
# Cap the sample size: asking for more rows than exist raises ValueError.
# The fixed seed keeps the displayed rows stable across re-runs.
lims_dwh_no_cas_only.sample(n=min(2, len(lims_dwh_no_cas_only)), random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
2042,28701,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04 14:52:40.813333+00:00,2025-07-04 14:53:01.720000+00:00
1923,23084,Refcomm,853778,72748924,B0000197661,B0000197661-UB24040008,UB24040008,,,2025-07-03 14:08:42.150000+00:00,2025-07-17 12:22:21.756666+00:00


In [193]:
# Entries in MS not in DWH (matched on external_id)
lims_ms_only_no_cas = pcp_sample_lims_ms[
    ~pcp_sample_lims_ms["external_id"].isin(pcp_sample_lims_dwh_no_cas["external_id"])
]
# Cap the sample size: asking for more rows than exist raises ValueError.
# The fixed seed keeps the displayed rows stable across re-runs.
lims_ms_only_no_cas.sample(n=min(2, len(lims_ms_only_no_cas)), random_state=42)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
3423,28287,Refcomm,649911,C22035504,,0M700174212,,,,2025-07-04T10:27:52.060Z,2025-07-04T10:27:52.060Z
3444,28266,Refcomm,649922,B1299815,,T52811,,,,2025-07-04T10:27:13.000Z,2025-07-04T10:27:13.000Z


### Deep dive into one entry in DWH not in MS (id: 22716, external_id: 790773, sample_code: B1181217)

In [194]:
# All DWH rows carrying sample_code B1181217.
pcp_sample_lims_dwh_no_cas.loc[pcp_sample_lims_dwh_no_cas["sample_code"].eq("B1181217")]

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
2039,28700,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04 14:52:36.863333+00:00,2025-07-04 14:52:37.830000+00:00
2040,28702,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04 14:53:03.693333+00:00,2025-07-04 14:53:05.523000+00:00
2041,22716,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-03 13:42:30.086666+00:00,2025-07-04 14:52:33.656666+00:00
2042,28701,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04 14:52:40.813333+00:00,2025-07-04 14:53:01.720000+00:00


In [195]:
# All MS rows carrying sample_code B1181217.
pcp_sample_lims_ms.loc[pcp_sample_lims_ms["sample_code"].eq("B1181217")]

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
3010,28702,Refcomm,790773,B1181217,DGA2220226,DGA2220226-1521I036,1521I036,,,2025-07-04T14:53:03.693Z,2025-07-04T14:53:05.523Z


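Les 11 entrées présentes dans le DWH mais absentes du MS peuvent s'expliquer ainsi : pour un même external_id, le DWH conserve plusieurs id (ici 4 lignes pour 790773) alors que le MS n'en garde qu'un seul (28702).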

### Deep dive into one entry in MS not in DWH (id: 23851, external_id: 824825, sample_code: 71634281)

In [196]:
# All DWH rows carrying sample_code 71634281.
pcp_sample_lims_dwh_no_cas.loc[pcp_sample_lims_dwh_no_cas["sample_code"].eq("71634281")]

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
1812,24169,Refcomm,824779,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-03 15:49:54.566666+00:00,2025-07-03 15:49:54.566666+00:00
1813,28680,Refcomm,824730,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-04 11:09:10.490000+00:00,2025-07-04 11:09:10.490000+00:00
1814,28697,Refcomm,824761,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-04 11:09:58.930000+00:00,2025-07-04 11:09:58.930000+00:00


In [197]:
# All MS rows carrying sample_code 71634281.
pcp_sample_lims_ms.loc[pcp_sample_lims_ms["sample_code"].eq("71634281")]

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
3013,28697,Refcomm,824761,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-04T11:09:58.930Z,2025-07-04T11:09:58.930Z
3030,28680,Refcomm,824730,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-04T11:09:10.490Z,2025-07-04T11:09:10.490Z
7540,24169,Refcomm,824779,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-03T15:49:54.566Z,2025-07-03T15:49:54.566Z
7858,23851,Refcomm,824825,71634281,B0000195906,B0000195906-K-64162,K-64162,,,2025-07-03T15:00:26.550Z,2025-07-03T15:00:26.550Z


Il y a bien un id 23851 en DWH MSR mais son source_code est null ! Donc la clause source_code = "ANALYSE_AULNAY" le fait disparaître à tort

## Source LIMS (sample_type CAS exclus, source_code null inclus)

In [198]:
# Gap analysis: MS LIMS extract vs. the DWH result of
# get_sample_aa_no_cas_sc_null_allowed.sql.
gap_analysis_dataframes(pcp_sample_lims_ms, pcp_sample_lims_dwh_no_cas_sc_null_allowed)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 9025 lines.
DataFrame 2 (dwh) has 9036 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 39.53% NaN values
  - DataFrame 2 (dwh): 39.49% NaN values
  * Note: There is a significant difference in NaN percentages for 'fabrication_code'.

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 0.06% NaN values
  - DataFrame 2 (dwh): 0.06% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 42.56% NaN values
  - DataFrame 2 (dwh): 42.50% NaN values
  * No

Leurs id sont consécutifs ! 

In [199]:
# Entries in MS not in DWH (matched on external_id)
lims_ms_seen_in_dwh_sc_null = pcp_sample_lims_ms["external_id"].isin(
    pcp_sample_lims_dwh_no_cas_sc_null_allowed["external_id"]
)
pcp_sample_lims_ms.loc[~lims_ms_seen_in_dwh_sc_null].head(65)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at


In [200]:
# Entries in DWH not in MS (matched on id), restricted to the identifying columns
lims_dwh_sc_null_id_seen_in_ms = pcp_sample_lims_dwh_no_cas_sc_null_allowed["id"].isin(
    pcp_sample_lims_ms["id"]
)
lims_dwh_sc_null_only = pcp_sample_lims_dwh_no_cas_sc_null_allowed.loc[~lims_dwh_sc_null_id_seen_in_ms]
lims_dwh_sc_null_only[["id", "external_id", "sample_code"]].head(11)

Unnamed: 0,id,external_id,sample_code
345,22741,875027,46577119
932,23075,849774,51993750
1696,26931,801585,63830958
2027,23084,853778,72748924
2150,28700,790773,B1181217
2152,22716,790773,B1181217
2153,28701,790773,B1181217
3441,22745,874630,C22017913
4170,22718,798601,C23703964
4195,22715,865228,C23704656


## Source COSMETOCHEM

In [201]:
# Gap analysis: MS COSMETOCHEM extract vs. the DWH result of get_sample_cosmetochem.sql.
gap_analysis_dataframes(pcp_sample_cosmetochem_ms, pcp_sample_cosmetochem_dwh)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 3368 lines.
DataFrame 2 (dwh) has 3371 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'sample_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'fabrication_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_number'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'batch_code_supplier'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2 (dwh): 100.00% NaN values

Attribute: 'sample_ec_code'
  - DataFrame 1 (ms): 100.00% NaN values
  - DataFrame 2

In [202]:
# Entries in DWH not in MS, matched on id (nothing to report: only recently created rows)
cosmetochem_dwh_id_seen_in_ms = pcp_sample_cosmetochem_dwh["id"].isin(pcp_sample_cosmetochem_ms["id"])
pcp_sample_cosmetochem_dwh.loc[~cosmetochem_dwh_id_seen_in_ms].head(11)

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
4,6927,CAS,183158705,183158-70-5,,,,,,2025-07-25 11:22:52.740000+00:00,2025-07-25 11:22:52.740000+00:00
8,6926,CAS,32539836,32539-83-6,,,,,,2025-07-23 14:25:47.430000+00:00,2025-07-23 14:25:47.430000+00:00
21,6925,CAS,87785,87-78-5,,,,,,2025-07-23 08:36:26.720000+00:00,2025-07-23 08:36:26.720000+00:00


In [203]:
# Look up id 6926 in the MS COSMETOCHEM extract (ids were cast to str at load time).
pcp_sample_cosmetochem_ms.loc[pcp_sample_cosmetochem_ms["id"].eq("6926")]

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at


# TEST PCP

## GA function test

In [223]:
def _nan_percentage(column: pd.Series) -> float:
    """Return the percentage (0-100) of missing values in ``column``; 0 for an empty column."""
    total = len(column)
    return (column.isnull().sum() / total) * 100 if total > 0 else 0


def gap_analysis_dataframes_test(df1: pd.DataFrame, df2: pd.DataFrame, attributes=None) -> None:
    """
    Print a gap analysis between two pandas DataFrames.

    The report contains:
    - The number of lines in both DataFrames.
    - The percentage of NaN (missing) values by attribute for each DataFrame, with a note
      when the two percentages differ by more than 0.01 percentage points.
    - For attributes with 0% NaN values in both DataFrames, a comparison of their unique
      entries:
      - The count of common unique entries.
      - The count of unique entries present only in DataFrame 1 (ms).
      - The count of unique entries present only in DataFrame 2 (dwh).

    All values are converted to string type before comparison to prevent type mismatch issues.

    Args:
        df1 (pd.DataFrame): The first pandas DataFrame (e.g., ms).
        df2 (pd.DataFrame): The second pandas DataFrame (e.g., dwh).
        attributes (iterable of str, optional): Column names to analyse. Defaults to the
            attribute list this function has always compared.

    Returns:
        None. The report is written to standard output.

    Raises:
        KeyError: If one of the attributes is not a column of ``df1`` or ``df2``.
    """
    if attributes is None:
        common_attributes = ['external_id', 'tested_sample_id',
           'measure_objective_type_code', 'test_sub_type', 'test_name',
           'test_variation', 'test_version', 'test_validation_date', 'created_at',
           'updated_at']
    else:
        common_attributes = list(attributes)

    # Fail before printing anything, naming the frame and the columns, instead of
    # letting a bare KeyError interrupt a half-printed report.
    for label, frame in (("DataFrame 1 (ms)", df1), ("DataFrame 2 (dwh)", df2)):
        missing = [attr for attr in common_attributes if attr not in frame.columns]
        if missing:
            raise KeyError(f"{label} is missing attribute(s) required for the gap analysis: {missing}")

    print("--- Starting Gap Analysis between DataFrames ---")
    print(f"\nDataFrame 1 (ms) has {len(df1)} lines.")
    print(f"DataFrame 2 (dwh) has {len(df2)} lines.")

    # --- NaN Value Analysis ---
    print("\n--- NaN Value Analysis by Attribute ---")
    nan_info = {}  # NaN percentages per attribute, reused by the value comparison below
    for attr in common_attributes:
        nan_percent_df1 = _nan_percentage(df1[attr])
        nan_percent_df2 = _nan_percentage(df2[attr])
        nan_info[attr] = {'df1': nan_percent_df1, 'df2': nan_percent_df2}

        print(f"\nAttribute: '{attr}'")
        print(f"  - DataFrame 1 (ms): {nan_percent_df1:.2f}% NaN values")
        print(f"  - DataFrame 2 (dwh): {nan_percent_df2:.2f}% NaN values")

        if abs(nan_percent_df1 - nan_percent_df2) > 0.01:
            print(f"  * Note: There is a significant difference in NaN percentages for '{attr}'.")

    # --- Value Comparison for Complete Attributes ---
    print("\n--- Value Comparison for Attributes with 0% NaN ---")
    attributes_compared_for_values = False
    for attr in common_attributes:
        # Only compare values if the attribute has 0% NaN in *both* DataFrames
        if nan_info[attr]['df1'] == 0 and nan_info[attr]['df2'] == 0:
            attributes_compared_for_values = True
            print(f"\nAttribute: '{attr}' (0% NaN in both DataFrames)")

            # Cast to str first so that e.g. 1 and "1" count as the same entry.
            set1 = set(df1[attr].astype(str).tolist())
            set2 = set(df2[attr].astype(str).tolist())

            common_unique_values = set1 & set2
            unique_only_in_df1 = set1 - set2
            unique_only_in_df2 = set2 - set1

            print(f"  - Number of common unique entries: {len(common_unique_values)}")
            # Labels match the rest of the report; they used to name the CAS sample frames.
            print(f"  - Number of unique entries only in DataFrame 1 (ms): {len(unique_only_in_df1)}")
            print(f"  - Number of unique entries only in DataFrame 2 (dwh): {len(unique_only_in_df2)}")

    if not attributes_compared_for_values:
        print("  No attributes were found with 0% NaN values in both DataFrames to compare individual entries.")

    print("\n--- Gap Analysis Complete ---")

## Processing

In [230]:
# Preview the first two rows of the MS test extract.
pcp_test_lims_ms.head(2)

Unnamed: 0,id,measure_objective_type_id,measure_objective_type_code,external_id,tested_sample_id,source_id,source_code,method_id,method_code,test_type,...,device_id,input_structure_smiles,clean_structure_smiles,version,reviewer_role,created_at,updated_at,deleted_at,tested_operating_parameters,individual_measurements
0,60110,2,WET_POINT,1470782,31711,6,ANALYSE_AULNAY,86.0,METHOD_NO_INFO,Mesure,...,,,,,AU_RES_DEM,2025-07-23T12:59:41.560Z,2025-07-23T12:59:41.560Z,,,
1,60109,1,DENSITY_01,1470781,31711,6,ANALYSE_AULNAY,79.0,METHOD_CID_025_00_URL,Mesure,...,,,,,AU_RES_DEM,2025-07-23T12:59:37.690Z,2025-07-23T12:59:37.690Z,,,


In [231]:
# Preview the first two rows of the DWH test extract.
pcp_test_lims_dwh.head(2)

Unnamed: 0,id,external_id,tested_sample_id,measure_objective_type_code,test_sub_type,test_name,test_variation,test_version,test_validation_date,created_at,updated_at
0,43894,1535948,23197,CHARGE_DENSITY,MEAS_PHYS_CHEM,MEAS_CHARGE_DENSITY,ANIONIC-CATIONIC,1,2023-08-01 14:21:11+00:00,2025-07-03 14:18:51.490000+00:00,2025-07-03 14:18:51.490000+00:00
1,49776,1598430,25482,CHARGE_DENSITY,MEAS_PHYS_CHEM,MEAS_CHARGE_DENSITY,ANIONIC-CATIONIC,1,2024-03-27 10:05:32+00:00,2025-07-04 08:38:05.343333+00:00,2025-07-04 08:38:05.343333+00:00


In [232]:
# Gap analysis for the LIMS tests: ms extract (first argument) vs dwh extract (second argument).
gap_analysis_dataframes_test(pcp_test_lims_ms, pcp_test_lims_dwh)

--- Starting Gap Analysis between DataFrames ---

DataFrame 1 (ms) has 15610 lines.
DataFrame 2 (dwh) has 15660 lines.

--- NaN Value Analysis by Attribute ---

Attribute: 'external_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'tested_sample_id'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'measure_objective_type_code'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'test_sub_type'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'test_name'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'test_variation'
  - DataFrame 1 (ms): 31.48% NaN values
  - DataFrame 2 (dwh): 31.48% NaN values

Attribute: 'test_version'
  - DataFrame 1 (ms): 0.00% NaN values
  - DataFrame 2 (dwh): 0.00% NaN values

Attribute: 'test_validation_date'
  - DataFrame 1 (ms): 0.00% NaN values
  - 

**TODO** — Pourquoi le DWH contient-il plus d'entrées que le MS (15 660 lignes contre 15 610) ? S'agit-il de données inactives mal filtrées ? Quelques samples concentrent plusieurs des entrées en écart.

In [236]:
# Inspect the DWH test entries whose id is absent from the MS extract (the 50 entries in gap)
pcp_test_lims_dwh.loc[~pcp_test_lims_dwh["id"].isin(pcp_test_lims_ms["id"])]

Unnamed: 0,id,external_id,tested_sample_id,measure_objective_type_code,test_sub_type,test_name,test_variation,test_version,test_validation_date,created_at,updated_at
48,43263,1704205,22702,TURBIDITY_GEL,MEAS_PHYS_CHEM,MEAS_TURBIDITY,,1,2025-06-23 14:15:32+00:00,2025-07-03 13:39:57.700000+00:00,2025-07-04 09:19:09.136666+00:00
107,43322,1719506,22715,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,4,2025-06-22 14:29:26+00:00,2025-07-03 13:42:24.866666+00:00,2025-07-04 08:17:58.170000+00:00
108,43323,1732235,22716,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,4,2025-06-22 13:47:22+00:00,2025-07-03 13:42:30.140000+00:00,2025-07-04 14:52:33.660000+00:00
109,43324,1611267,22717,TURBIDITY_GEL,MEAS_PHYS_CHEM,MEAS_TURBIDITY,,1,2025-06-22 14:22:16+00:00,2025-07-03 13:42:41.933333+00:00,2025-07-04 14:53:18.750000+00:00
111,43326,1611268,22717,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,3,2025-06-22 14:22:16+00:00,2025-07-03 13:42:57.436666+00:00,2025-07-04 14:53:18.753333+00:00
113,43328,1704206,22702,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,4,2025-06-22 13:21:28+00:00,2025-07-03 13:43:02.273333+00:00,2025-07-04 09:19:09.143333+00:00
114,43329,1733426,22718,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,4,2025-06-22 13:43:18+00:00,2025-07-03 13:43:05.016666+00:00,2025-07-04 15:06:41.186666+00:00
146,43361,1738628,22741,VISCOSITY_VS_CONCENTRATION,MEAS_PHYS_CHEM,MEAS_VISCOSITY,VISCOSITY_VS_CONCENT,4,2025-06-06 13:43:04+00:00,2025-07-03 13:44:29.636666+00:00,2025-07-07 12:45:15+00:00
150,43365,1735884,22745,RHEOLOGICAL_PROP_FLOW,PROFIL,PROFIL_RHEOLOGY,FLOW,2,2025-05-28 16:34:28+00:00,2025-07-03 13:44:42.423333+00:00,2025-07-04 14:53:29.920000+00:00
168,43383,1550441,22759,ZETA_POTENTIAL,MEAS_PHYS_CHEM,MEAS_POT_ZETA,,3,2025-05-21 16:32:57+00:00,2025-07-03 13:45:35.786666+00:00,2025-07-04 14:53:07.570000+00:00
