# Gap analysis doublosn MSR

## 1. Comparison deleted data vs duplicated data

### 1.1. Data linked to a deleted sample

In [1]:
# ! pip install requirements.txt

In [2]:
import pandas as pd
import json
from google.cloud import storage
from google.cloud import bigquery
client = bigquery.Client()

In [3]:
with open(f"SQL/get_delete_events.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
delete_events = query_job.to_dataframe().drop_duplicates()
nb_events, _ = delete_events.shape
print(nb_events, 'deletion events')
delete_events.head()

1921 deletion events


Unnamed: 0,json_blob
0,"{""event_id"":""5fd517b4-a2dd-4f71-a55a-e90368c8d..."
1,"{""event_id"":""dc398e44-348b-4994-a794-04e50ff4a..."
2,"{""event_id"":""ce2f8eda-e8c4-4f08-8f2c-ffd9b27d9..."
3,"{""event_id"":""c8f1659f-4f3c-4cde-8cd4-6977d2943..."
4,"{""event_id"":""363e7273-9d07-4491-bbff-3889cb85c..."


In [4]:
deleted_samples = []
dates = []
for i in range(nb_events):
    event = json.loads(delete_events['json_blob'][i])
    deleted_samples.append(str(event['data_key']))
    dates.append(event['event_date'])

deleted_samples = list(set(deleted_samples))
nb_u_samples = len(deleted_samples)
print(f"We have {nb_events} delete events on {nb_u_samples} samples")

We have 1921 delete events on 1793 samples


In [5]:
print(f"oldest event : {sorted(dates)[0]}") 
print(f"newest event : {sorted(dates)[len(dates)-1]}") 

oldest event : 2024-11-11T07:08:54.142Z
newest event : 2025-04-17T10:35:25.407Z


In [6]:
with open(f"SQL/dwh_msr/get_sample_test_result.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{samples}", str(tuple(deleted_samples)))

query_job = client.query(sql)
all_data = query_job.to_dataframe().drop_duplicates()
all_data.head()

Unnamed: 0,ak_key_msr_sample,lkp_key_lma_sample,id_msr_sample,ak_key_msr_test,lkp_lma_test,id_msr_test,ak_key_msr_result,lkp_lma_results,id_msr_result
0,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72987,47874974,0f2a6273-6c7d-47c3-aebe-ac93a880f35f
1,19378,803581,1bda9b66-beec-4b73-a6fd-640f21f65ba2,37928,1562176,02b8a035-96c7-4979-ba1a-2f156e3400ae,71359,53034362,6b2b29be-22f5-4ab0-9648-8e8ddbb2cacf
2,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72990,51310940,4f8e9486-5750-457d-aa1c-47e2a7dfbacc
3,19378,803581,1bda9b66-beec-4b73-a6fd-640f21f65ba2,37928,1562176,02b8a035-96c7-4979-ba1a-2f156e3400ae,71361,53034361,26e3b172-316c-4fe8-a083-2faa445150cd
4,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72992,47874973,e81c0a53-8860-416a-a908-a237c119fced


In [7]:
impacted_lma_sample = all_data['lkp_key_lma_sample'].drop_duplicates()
print(f"Number of deleted lma sample : {all_data['lkp_key_lma_sample'].nunique()}\n")
impacted_lma_test = all_data['lkp_lma_test'].drop_duplicates()
impacted_lma_res = all_data['lkp_lma_results'].drop_duplicates()
print(f"{all_data['lkp_lma_results'].nunique()} lkp_lma_results in {all_data['lkp_lma_test'].nunique()} lkp_lma_test are linked to a deleted sample")

Number of deleted lma sample : 1248

23468 lkp_lma_results in 1342 lkp_lma_test are linked to a deleted sample


### 1.2. Duplicated data in msr dwh

#### 1.2.1 Sample

In [8]:
want_to_group_also_by_version = True
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_sample.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_samples = query_job.to_dataframe().drop_duplicates()
nb_duplicated_sample = duplicated_samples['lkp_key_lma_sample'].nunique()
print(nb_duplicated_sample, 'lkp_lma_sample with several ak_key_msr_sample')

9760 lkp_lma_sample with several ak_key_msr_sample


In [9]:
duplicated_sample_not_deleted = duplicated_samples[~duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_not_deleted['lkp_key_lma_sample'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_sample_not_deleted.head()

8609 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
349,4,800486,3
757,2,753763,3
1038,2,716653,3
1077,2,702445,3
1078,2,593400,3


In [10]:
duplicated_sample_w_deleted_sample = duplicated_samples[duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_w_deleted_sample['lkp_key_lma_sample'].nunique()} lma_sample with several msr_sample with a deleted sample event")
duplicated_sample_w_deleted_sample.head()

1151 lma_sample with several msr_sample with a deleted sample event


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
0,5,788123,5
1,2,775644,5
2,4,783740,4
3,4,783737,4
4,4,783736,4


In [11]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / nb_duplicated_sample, 2)
print(f'duplicated sample are due to delete events less than {pc} % of the time')

duplicated sample are due to delete events less than 11.79 % of the time


In [12]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / len(impacted_lma_sample), 2)
print(f'{pc}% of deleted sample are duplicated')

92.23% of deleted sample are duplicated


In [13]:
for sample in impacted_lma_sample :
    if sample not in duplicated_samples['lkp_key_lma_sample'].to_list():
        print('example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles :', sample  )
        break

example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles : 650720


Now list all duplicates associated to a delete and all duplicates not associated to a delete (Ano 1 - Slide 5 : Gap analysis event delete expliquent doublons ?)

In [14]:
duplicated_samples_linked_to_delete_event = []
for sample in impacted_lma_sample:
    if sample in duplicated_samples["lkp_key_lma_sample"].to_list():
        duplicated_samples_linked_to_delete_event.append(sample)

In [15]:
"788123" in duplicated_samples_linked_to_delete_event

True

In [16]:
duplicated_samples_not_linked_to_delete_event = []
for sample in duplicated_samples["lkp_key_lma_sample"].to_list():
    if sample not in impacted_lma_sample:
        duplicated_samples_not_linked_to_delete_event.append(sample)

In [17]:
"800486" in duplicated_samples_not_linked_to_delete_event

True

In [18]:
def get_df_replay(lkp):
    if lkp:
        target_lkp_key_lma_sample = f"AND CAST(s.lkp_key_lma_sample AS STRING) = '{lkp}'" # Cast the column
    else:
        target_lkp_key_lma_sample = ""

    with open(f"SQL_replay/get_duplicated_samples.sql", "r") as sql_file:
        sql_template = sql_file.read()

    sql = sql_template.replace("{target_lkp_key_lma_sample}", target_lkp_key_lma_sample)
    query_job = client.query(sql)
    df_replay = query_job.to_dataframe().drop_duplicates()
    return df_replay 

# "800486" in duplicated_samples_not_linked_to_delete_event
# "788123" in duplicated_samples_linked_to_delete_event
df_replay = get_df_replay('788123')

In [19]:
df_replay.head()

Unnamed: 0,id_msr_sample,lkp_key_lma_sample,ver_lma_sample,id_lma_sample,ak_key_msr_sample,source_code,sys_app_origin
0,52fcbb7f-2cf3-47b8-a59c-5582bc492e0d,788123,5,85f28735-e1bc-4b50-a125-c5077e42684d,28325,ANALYSE_AULNAY,MSR_TI
1,7fab28a1-a80d-49f8-bceb-2f0e196bcf79,788123,5,85f28735-e1bc-4b50-a125-c5077e42684d,38148,ANALYSE_AULNAY,MSR_TI
2,20b45bd3-334e-41dc-9fc1-39b68778f54c,788123,5,85f28735-e1bc-4b50-a125-c5077e42684d,39524,ANALYSE_AULNAY,MSR_TI
3,6691f80b-92b9-4402-8cbe-02fa4b06444c,788123,5,85f28735-e1bc-4b50-a125-c5077e42684d,48792,ANALYSE_AULNAY,MSR_TI
4,0b61fdb0-1a0d-4deb-b087-806ec7222ab2,788123,5,85f28735-e1bc-4b50-a125-c5077e42684d,49220,ANALYSE_AULNAY,MSR_TI


#### 1.2.2 Test

In [20]:
want_to_group_also_by_version = True
version = 't.ver_lma_test,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_test.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_tests = query_job.to_dataframe().drop_duplicates()
nb_duplicated_test = duplicated_tests['lkp_lma_test'].nunique()
print(nb_duplicated_test, 'lkp_lma_test with several ak_key_msr_test')

10581 lkp_lma_test with several ak_key_msr_test


In [21]:
duplicated_test_not_deleted = duplicated_tests[~duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_not_deleted['lkp_lma_test'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_test_not_deleted.head()

9396 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
352,6,510570,3
367,6,493529,3
368,6,346407,3
369,5,527047,3
400,6,502460,3


In [22]:
duplicated_test_w_deleted_sample = duplicated_tests[duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_w_deleted_sample['lkp_lma_test'].nunique()} lma_test with several msr_test with a deleted sample event")
duplicated_test_w_deleted_sample.head()

1185 lma_test with several msr_test with a deleted sample event


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
0,4,1491428,5
1,7,1523483,5
2,4,1662697,5
3,4,1662698,5
4,4,1662699,5


In [23]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / nb_duplicated_test)
print(f'duplicated test are due to delete events less than {pc} % of the time')

duplicated test are due to delete events less than 11 % of the time


In [24]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

88.3% of deleted sample are duplicated


#### 1.2.3 Results

In [25]:
want_to_group_also_by_version = True
version = 'r.ver_lma_results,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_results.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_resuts = query_job.to_dataframe().drop_duplicates()
nb_duplicated_result = duplicated_resuts['lkp_lma_results'].nunique()
print(nb_duplicated_result, 'lkp_lma_results with several ak_key_msr_result')

178701 lkp_lma_results with several ak_key_msr_result


In [26]:
duplicated_resuts_not_deleted = duplicated_resuts[~duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_not_deleted['lkp_lma_results'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_resuts_not_deleted.head()

155720 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
0,11,51308514,6
212,11,51308513,4
5217,10,47875994,3
5218,10,47875763,3
5219,10,47875796,3


In [27]:
duplicated_resuts_w_deleted_sample = duplicated_resuts[duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_w_deleted_sample['lkp_lma_results'].nunique()} lma_res with several msr_res with a deleted sample event")
duplicated_resuts_w_deleted_sample.head()

22981 lma_res with several msr_res with a deleted sample event


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
1,156,55503098,5
2,166,55503127,5
3,150,55503187,5
4,154,55503147,5
5,173,55503107,5


In [28]:
pc = round(100 * len(duplicated_resuts_w_deleted_sample) / nb_duplicated_result)
print(f'duplicated results are due to delete events less than {pc} % of the time')

duplicated results are due to delete events less than 17 % of the time


In [29]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

88.3% of deleted sample are duplicated


### 1.3. Comparison with lims output

In [30]:
lims_output_sample = pd.read_excel("Data/RM PCP LIMS - samples.xlsx")

In [31]:
lims_output_sample

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
0,22212,Refcomm,849910,55194284,B0000192703,B0000192703-00001231001,00001231001,,,2025-02-04T17:21:56.500Z,2025-02-04T17:21:56.500Z
1,22211,Refcomm,840091,D13736572,DGA2270177,DGA2270177-21LNC1021SQ89,21LNC1021SQ89,,,2025-02-04T16:32:41.296Z,2025-02-04T16:32:41.296Z
2,22210,Refcomm,849766,C34035984,DGA2270073,DGA2270073-0001968392,0001968392,,,2025-02-04T16:18:33.136Z,2025-02-04T16:18:33.136Z
3,22209,Refcomm,849773,53586881,B0000191247,B0000191247-0001897027,0001897027,,,2025-02-04T16:18:29.796Z,2025-02-04T16:18:29.796Z
4,22208,Refcomm,849767,C24504104,DGA2240002,DGA2240002-DGC21N0010,DGC21N0010,,,2025-02-04T16:18:22.673Z,2025-02-04T16:18:22.673Z
...,...,...,...,...,...,...,...,...,...,...,...
9312,12786,Refcomm,820359,63795945,B0000186839,B0000186839-20200729-100,20200729-100,,,2024-05-31T10:10:05.670Z,2024-05-31T10:10:05.670Z
9313,12785,Refcomm,820362,436072,B0000192345,B0000192345-23A25108-00,23A25108-00,,,2024-05-31T10:10:02.200Z,2024-05-31T10:10:02.200Z
9314,12784,Refcomm,820371,C24687885,DGA23N0080,DGA23N0080-0027451126,0027451126,,,2024-05-31T10:09:56.790Z,2024-05-31T10:09:56.790Z
9315,12783,Refcomm,824712,C20042883,DGA23D0069,DGA23D0069-DGK2350178,DGK2350178,,,2024-05-31T10:09:53.863Z,2024-05-31T10:09:53.863Z


## 2. Comparison real_time vs dwh msr

Get data from dwh msr

In [32]:
with open(f"SQL/20250307_analysis/dwh_msr/get_data_from_0703.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
updated_sample_dwh = query_job.to_dataframe().drop_duplicates()
nb_updated_s_dwh = updated_sample_dwh['ak_key_msr_sample'].nunique()
print(f'nb_updated sample in dwh msr : {nb_updated_s_dwh}')
updated_sample_dwh.head()

nb_updated sample in dwh msr : 2076


Unnamed: 0,ak_key_msr_sample,source_code,sys_app_origin,ak_key_msr_test
0,6938,SOURCE_ECHA,MSR_CPC,68981
1,6938,SOURCE_ECHA,MSR_CPC,68974
2,6938,SOURCE_ECHA,MSR_CPC,68975
3,6938,SOURCE_ECHA,MSR_CPC,68976
4,6938,SOURCE_ECHA,MSR_CPC,68979


### 2.1. sample

In [33]:
with open(f"SQL/20250307_analysis/inp_realtime/get_data_from_0703.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
event_realtime = query_job.to_dataframe().drop_duplicates()
nb_sample_realtime = event_realtime['sample_id'].nunique()
print(f'nb_updated sample in dwh msr : {nb_sample_realtime}')
event_realtime.head()

nb_updated sample in dwh msr : 6195


Unnamed: 0,sample_external_id,sample_id,test_id,date,domain
0,470499,44472,50503,2025-03-26 00:00:00+00:00,MSR_TI
1,413555,43870,49898,2025-03-26 00:00:00+00:00,MSR_TI
2,862631,22381,42699,2025-04-01 00:00:00+00:00,MSR_CPC
3,744801,14623,28286,2025-04-01 00:00:00+00:00,MSR_CPC
4,807042,15203,29020,2025-03-19 00:00:00+00:00,MSR_CPC


In [34]:
missing_sample = event_realtime[~event_realtime['sample_id'].isin(updated_sample_dwh['ak_key_msr_sample'])][['sample_external_id', 'sample_id', 'domain', 'date']].drop_duplicates()
print(f'nb missing sample in dwh : {missing_sample["sample_external_id"].nunique()}')
missing_sample[['sample_id', 'sample_external_id', 'domain']].to_excel('result/missing_sample_in_dwh_msr.xlsx')
missing_sample

nb missing sample in dwh : 4524


OSError: Cannot save file into a non-existent directory: 'result'

In [41]:
missing_sample.groupby('domain').nunique().reset_index()

Unnamed: 0,domain,sample_external_id,sample_id,test_id,date
0,MSR_CPC,1847,1847,2344,1
1,MSR_TI,202,202,206,1


In [47]:
retrieved_sample = event_realtime[event_realtime['sample_id'].isin(updated_sample_dwh['ak_key_msr_sample'])][['sample_external_id', 'sample_id', 'domain', 'date']].drop_duplicates()
print(f'nb retrived sample in dwh : {retrieved_sample["sample_external_id"].nunique()}')
retrieved_sample[['sample_id', 'sample_external_id', 'domain']].to_excel('result/recovered_sample_in_dwh_msr.xlsx')
retrieved_sample

nb retrived sample in dwh : 411


Unnamed: 0,sample_external_id,sample_id,domain,date
8,588862,20724,MSR_CPC,2025-03-07 00:00:00+00:00
28,588848,20439,MSR_CPC,2025-03-07 00:00:00+00:00
30,588903,20277,MSR_CPC,2025-03-07 00:00:00+00:00
33,588971,21406,MSR_CPC,2025-03-07 00:00:00+00:00
34,588970,20281,MSR_CPC,2025-03-07 00:00:00+00:00
...,...,...,...,...
3039,851995,49452,MSR_TI,2025-03-07 00:00:00+00:00
3074,588874,21713,MSR_CPC,2025-03-07 00:00:00+00:00
3087,588942,21780,MSR_CPC,2025-03-07 00:00:00+00:00
3095,589036,20689,MSR_CPC,2025-03-07 00:00:00+00:00


In [45]:
retrieved_sample.groupby('domain').nunique().reset_index()

Unnamed: 0,domain,sample_external_id,sample_id,date
0,MSR_CPC,386,386,1
1,MSR_TI,25,25,1


In [46]:
unexpected_sample = updated_sample_dwh[~updated_sample_dwh['ak_key_msr_sample'].isin(event_realtime['sample_id'])]
unexpected_sample

Unnamed: 0,ak_key_msr_sample,source_code,sys_app_origin,ak_key_msr_test
0,12796,ANALYSE_AULNAY,MSR_CPC,24334
1,12796,ANALYSE_AULNAY,MSR_CPC,24361
2,16405,ANALYSE_AULNAY,MSR_CPC,31768
3,16523,ANALYSE_AULNAY,MSR_CPC,32186
4,16523,ANALYSE_AULNAY,MSR_CPC,32253
...,...,...,...,...
863,16392,ANALYSE_AULNAY,MSR_CPC,31700
865,12804,ANALYSE_AULNAY,MSR_CPC,24383
951,12813,ANALYSE_AULNAY,MSR_CPC,24332
952,12813,ANALYSE_AULNAY,MSR_CPC,24365


### 2.2. Test

In [48]:
with open(f"SQL/20250307_analysis/inp_realtime/get_data_from_0703.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
event_test_realtime = query_job.to_dataframe().drop_duplicates()
nb_sample_realtime = event_test_realtime['sample_id'].nunique()
print(f'nb_updated sample in dwh msr : {nb_sample_realtime}')
event_test_realtime.head()



nb_updated sample in dwh msr : 2460


Unnamed: 0,sample_external_id,sample_id,test_id,date,domain
0,581020,45920,51988,2025-03-07 00:00:00+00:00,MSR_TI
1,739880,47801,53901,2025-03-07 00:00:00+00:00,MSR_TI
2,553586,45641,51709,2025-03-07 00:00:00+00:00,MSR_TI
3,516308,21731,41798,2025-03-07 00:00:00+00:00,MSR_CPC
4,651866,20731,40315,2025-03-07 00:00:00+00:00,MSR_CPC


In [34]:
missing_test = event_test_realtime[~event_test_realtime['test_id'].isin(updated_sample_dwh['ak_key_msr_test'])]
print(f'nb missing test in dwh : {missing_test["test_id"].nunique()}')
missing_test[['sample_id', 'sample_external_id', 'test_id', 'domain']].to_excel('result/missing_test_in_dwh_msr.xlsx')
missing_test

nb missing test in dwh : 2680


Unnamed: 0,sample_external_id,sample_id,test_id,date,domain
0,581020,45920,51988,2025-03-07 00:00:00+00:00,MSR_TI
1,739880,47801,53901,2025-03-07 00:00:00+00:00,MSR_TI
2,553586,45641,51709,2025-03-07 00:00:00+00:00,MSR_TI
3,516308,21731,41798,2025-03-07 00:00:00+00:00,MSR_CPC
4,651866,20731,40315,2025-03-07 00:00:00+00:00,MSR_CPC
...,...,...,...,...,...
3127,836391,20068,39361,2025-03-07 00:00:00+00:00,MSR_CPC
3128,838099,19962,39072,2025-03-07 00:00:00+00:00,MSR_CPC
3129,837817,17255,34376,2025-03-07 00:00:00+00:00,MSR_CPC
3130,795598,15443,37693,2025-03-07 00:00:00+00:00,MSR_CPC


In [35]:
missing_test.groupby('domain').nunique().reset_index()

Unnamed: 0,domain,sample_external_id,sample_id,test_id,date
0,MSR_CPC,1892,1892,2474,1
1,MSR_TI,202,202,206,1


In [36]:
retrieved_test = event_test_realtime[event_test_realtime['test_id'].isin(updated_sample_dwh['ak_key_msr_test'])]
print(f'nb retrived test in dwh : {retrieved_test["test_id"].nunique()}')
retrieved_test[['sample_id', 'sample_external_id', 'test_id', 'domain']].to_excel('result/recovered_test_in_dwh_msr.xlsx')
retrieved_test

nb retrived test in dwh : 452


Unnamed: 0,sample_external_id,sample_id,test_id,date,domain
8,588862,20724,40308,2025-03-07 00:00:00+00:00,MSR_CPC
28,588848,20439,39910,2025-03-07 00:00:00+00:00,MSR_CPC
30,588903,20277,39748,2025-03-07 00:00:00+00:00,MSR_CPC
33,588971,21406,41295,2025-03-07 00:00:00+00:00,MSR_CPC
34,588970,20281,39752,2025-03-07 00:00:00+00:00,MSR_CPC
...,...,...,...,...,...
3095,589036,20689,40271,2025-03-07 00:00:00+00:00,MSR_CPC
3106,680840,13065,24834,2025-03-07 00:00:00+00:00,MSR_CPC
3107,805032,17044,34012,2025-03-07 00:00:00+00:00,MSR_CPC
3114,542552,15779,30037,2025-03-07 00:00:00+00:00,MSR_CPC


In [37]:
retrieved_test.groupby('domain').nunique().reset_index()

Unnamed: 0,domain,sample_external_id,sample_id,test_id,date
0,MSR_CPC,386,386,427,1
1,MSR_TI,25,25,25,1


In [49]:
unexpected_test = updated_sample_dwh[~updated_sample_dwh['ak_key_msr_test'].isin(event_realtime['test_id'])]
unexpected_test

Unnamed: 0,ak_key_msr_sample,source_code,sys_app_origin,ak_key_msr_test
0,12796,ANALYSE_AULNAY,MSR_CPC,24334
1,12796,ANALYSE_AULNAY,MSR_CPC,24361
2,16405,ANALYSE_AULNAY,MSR_CPC,31768
3,16523,ANALYSE_AULNAY,MSR_CPC,32186
4,16523,ANALYSE_AULNAY,MSR_CPC,32253
...,...,...,...,...
863,16392,ANALYSE_AULNAY,MSR_CPC,31700
865,12804,ANALYSE_AULNAY,MSR_CPC,24383
951,12813,ANALYSE_AULNAY,MSR_CPC,24332
952,12813,ANALYSE_AULNAY,MSR_CPC,24365


Comparaison log lims, MS, dwh_lma