# Comparison deleted data vs duplicated data

## 1. Data linked to a deleted sample

In [1]:
# ! pip install requirements.txt

In [25]:
import pandas as pd
import json
from google.cloud import storage
from google.cloud import bigquery
client = bigquery.Client()





In [26]:
with open(f"SQL/get_delete_events.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
delete_events = query_job.to_dataframe().drop_duplicates()
nb_events, _ = delete_events.shape
print(nb_events, 'deletion events')
delete_events.head()

510 deletion events




Unnamed: 0,json_blob
0,"{""event_id"":""a6a0d570-adf4-4668-bae9-ae3f2bc72..."
1,"{""event_id"":""727ee08a-dbe1-4ea8-a53f-2c2a5634d..."
2,"{""event_id"":""09eebb6f-9ae0-4aa4-82e0-1444adce5..."
3,"{""event_id"":""76667fad-08fe-4040-b97a-ab305b4bd..."
4,"{""event_id"":""06313a83-7865-4cef-9629-566aaeda2..."


In [27]:
deleted_samples = []
dates = []
for i in range(nb_events):
    event = json.loads(delete_events['json_blob'][i])
    deleted_samples.append(str(event['data_key']))
    dates.append(event['event_date'])

deleted_samples = list(set(deleted_samples))
nb_u_samples = len(deleted_samples)
print(f"We have {nb_events} delete events on {nb_u_samples} samples")

We have 510 delete events on 437 samples


In [28]:
print(f"oldest event : {sorted(dates)[0]}") 
print(f"newest event : {sorted(dates)[len(dates)-1]}") 

oldest event : 2024-11-11T07:08:54.142Z
newest event : 2025-02-27T14:39:32.654Z


In [29]:
with open(f"SQL/dwh_msr/get_sample_test_result.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{samples}", str(tuple(deleted_samples)))

query_job = client.query(sql)
all_data = query_job.to_dataframe().drop_duplicates()
all_data.head()



Unnamed: 0,ak_key_msr_sample,lkp_key_lma_sample,id_msr_sample,ak_key_msr_test,lkp_lma_test,id_msr_test,ak_key_msr_result,lkp_lma_results,id_msr_result
0,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72987,47874974,0f2a6273-6c7d-47c3-aebe-ac93a880f35f
1,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72990,51310940,4f8e9486-5750-457d-aa1c-47e2a7dfbacc
2,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72993,47874973,99a0e474-8e66-40ea-8274-b43f505a77ff
3,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72991,47874973,e983e2c4-41b7-430e-937e-e16de48446d6
4,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72992,47874973,e81c0a53-8860-416a-a908-a237c119fced


In [30]:
impacted_lma_sample = all_data['lkp_key_lma_sample'].drop_duplicates()
print(f"Number of deleted lma sample : {all_data['lkp_key_lma_sample'].nunique()}\n")
impacted_lma_test = all_data['lkp_lma_test'].drop_duplicates()
impacted_lma_res = all_data['lkp_lma_results'].drop_duplicates()
print(f"{all_data['lkp_lma_results'].nunique()} lkp_lma_results in {all_data['lkp_lma_test'].nunique()} lkp_lma_test are linked to a deleted sample")

Number of deleted lma sample : 473

7865 lkp_lma_results in 463 lkp_lma_test are linked to a deleted sample


## 2. Duplicated data in msr dwh

### 2.1 Sample

In [53]:
want_to_group_also_by_version = True
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_sample.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_samples = query_job.to_dataframe().drop_duplicates()
nb_duplicated_sample = duplicated_samples['lkp_key_lma_sample'].nunique()
print(nb_duplicated_sample, 'lkp_lma_sample with several ak_key_msr_sample')

11693 lkp_lma_sample with several ak_key_msr_sample


In [54]:
duplicated_sample_not_deleted = duplicated_samples[~duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_not_deleted['lkp_key_lma_sample'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_sample_not_deleted.head()

11304 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
33,2,702445,6
332,2,314716,5
333,2,277905,5
335,2,620270,5
336,2,304881,5


In [55]:
duplicated_sample_w_deleted_sample = duplicated_samples[duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_w_deleted_sample['lkp_key_lma_sample'].nunique()} lma_sample with several msr_sample with a deleted sample event")
duplicated_sample_w_deleted_sample.head()

389 lma_sample with several msr_sample with a deleted sample event


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
0,5,788123,7
1,4,783751,6
2,4,804262,6
3,4,795842,6
4,4,804260,6


In [34]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / nb_duplicated_sample, 2)
print(f'duplicated sample are due to delete events less than {pc} % of the time')

duplicated sample are due to delete events less than 3.82 % of the time


In [35]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / len(impacted_lma_sample), 2)
print(f'{pc}% of deleted sample are duplicated')

78.86% of deleted sample are duplicated


In [36]:
for sample in impacted_lma_sample :
    if sample not in duplicated_samples['lkp_key_lma_sample'].to_list():
        print('example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles :', sample  )
        break

example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles : 650720


### 2.2 Test

In [37]:
want_to_group_also_by_version = True
version = 't.ver_lma_test,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_test.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_tests = query_job.to_dataframe().drop_duplicates()
nb_duplicated_test = duplicated_tests['lkp_lma_test'].nunique()
print(nb_duplicated_test, 'lkp_lma_test with several ak_key_msr_test')



10579 lkp_lma_test with several ak_key_msr_test


In [38]:
duplicated_test_not_deleted = duplicated_tests[~duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_not_deleted['lkp_lma_test'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_test_not_deleted.head()

10205 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
8,6,536436,3
10,6,510570,3
14,5,458341,3
17,6,493532,3
22,5,539072,3


In [39]:
duplicated_test_w_deleted_sample = duplicated_tests[duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_w_deleted_sample['lkp_lma_test'].nunique()} lma_test with several msr_test with a deleted sample event")
duplicated_test_w_deleted_sample.head()

374 lma_test with several msr_test with a deleted sample event


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
0,7,1523483,4
1,4,1662699,4
2,4,1662698,4
3,4,1662697,4
4,4,1662700,4


In [40]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / nb_duplicated_test)
print(f'duplicated test are due to delete events less than {pc} % of the time')

duplicated test are due to delete events less than 4 % of the time


In [41]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.6% of deleted sample are duplicated


### 2.3 Results

In [42]:
want_to_group_also_by_version = True
version = 'r.ver_lma_results,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_results.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_resuts = query_job.to_dataframe().drop_duplicates()
nb_duplicated_result = duplicated_resuts['lkp_lma_results'].nunique()
print(nb_duplicated_result, 'lkp_lma_results with several ak_key_msr_result')



176173 lkp_lma_results with several ak_key_msr_result


In [43]:
duplicated_resuts_not_deleted = duplicated_resuts[~duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_not_deleted['lkp_lma_results'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_resuts_not_deleted.head()

168548 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
0,11,51308514,6
1,11,51308513,4
199,10,47875741,3
200,10,47875334,3
201,10,47874676,3


In [44]:
duplicated_resuts_w_deleted_sample = duplicated_resuts[duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_w_deleted_sample['lkp_lma_results'].nunique()} lma_res with several msr_res with a deleted sample event")
duplicated_resuts_w_deleted_sample.head()

7625 lma_res with several msr_res with a deleted sample event


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
2,150,55503187,4
3,154,55503147,4
4,173,55503107,4
5,166,55503127,4
6,156,55503098,4


In [45]:
pc = round(100 * len(duplicated_resuts_w_deleted_sample) / nb_duplicated_result)
print(f'duplicated results are due to delete events less than {pc} % of the time')

duplicated results are due to delete events less than 6 % of the time


In [46]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.6% of deleted sample are duplicated


## 3. Comparison with lims output

In [48]:
lims_output_sample = pd.read_excel("Data/RM PCP LIMS - samples.xlsx")

In [49]:
lims_output_sample

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
0,22212,Refcomm,849910,55194284,B0000192703,B0000192703-00001231001,00001231001,,,2025-02-04T17:21:56.500Z,2025-02-04T17:21:56.500Z
1,22211,Refcomm,840091,D13736572,DGA2270177,DGA2270177-21LNC1021SQ89,21LNC1021SQ89,,,2025-02-04T16:32:41.296Z,2025-02-04T16:32:41.296Z
2,22210,Refcomm,849766,C34035984,DGA2270073,DGA2270073-0001968392,0001968392,,,2025-02-04T16:18:33.136Z,2025-02-04T16:18:33.136Z
3,22209,Refcomm,849773,53586881,B0000191247,B0000191247-0001897027,0001897027,,,2025-02-04T16:18:29.796Z,2025-02-04T16:18:29.796Z
4,22208,Refcomm,849767,C24504104,DGA2240002,DGA2240002-DGC21N0010,DGC21N0010,,,2025-02-04T16:18:22.673Z,2025-02-04T16:18:22.673Z
...,...,...,...,...,...,...,...,...,...,...,...
9312,12786,Refcomm,820359,63795945,B0000186839,B0000186839-20200729-100,20200729-100,,,2024-05-31T10:10:05.670Z,2024-05-31T10:10:05.670Z
9313,12785,Refcomm,820362,436072,B0000192345,B0000192345-23A25108-00,23A25108-00,,,2024-05-31T10:10:02.200Z,2024-05-31T10:10:02.200Z
9314,12784,Refcomm,820371,C24687885,DGA23N0080,DGA23N0080-0027451126,0027451126,,,2024-05-31T10:09:56.790Z,2024-05-31T10:09:56.790Z
9315,12783,Refcomm,824712,C20042883,DGA23D0069,DGA23D0069-DGK2350178,DGK2350178,,,2024-05-31T10:09:53.863Z,2024-05-31T10:09:53.863Z


"id" correspond à id_msr_sample ? ak_key_msr ? sample_id ? 

Sample en doublons vs sample du lims ? Combien sont dans les 2 ? Combien sont seulement en doublons ? Seulement issu du lims ? 