# Comparison deleted data vs duplicated data

## 1. Data linked to a deleted sample

In [1]:
# ! pip install requirements.txt

In [2]:
import json
from google.cloud import storage
from google.cloud import bigquery
client = bigquery.Client()



In [3]:
with open(f"SQL/get_delete_events.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
delete_events = query_job.to_dataframe().drop_duplicates()
nb_events, _ = delete_events.shape
print(nb_events, 'deletion events')
delete_events.head()

510 deletion events




Unnamed: 0,json_blob
0,"{""event_id"":""a6a0d570-adf4-4668-bae9-ae3f2bc72..."
1,"{""event_id"":""727ee08a-dbe1-4ea8-a53f-2c2a5634d..."
2,"{""event_id"":""09eebb6f-9ae0-4aa4-82e0-1444adce5..."
3,"{""event_id"":""76667fad-08fe-4040-b97a-ab305b4bd..."
4,"{""event_id"":""06313a83-7865-4cef-9629-566aaeda2..."


In [None]:
deleted_samples = []
dates = []
for i in range(nb_events):
    event = json.loads(delete_events['json_blob'][i])
    deleted_samples.append(str(event['data_key']))
    dates.append(event['event_date'])

deleted_samples = list(set(deleted_samples))
nb_u_samples = len(deleted_samples)
print(f"We have {nb_events} delete events on {nb_u_samples} samples")

We have 510 delete events on 437 samples


In [5]:
print(f"oldest event : {sorted(dates)[0]}") 
print(f"newest event : {sorted(dates)[len(dates)-1]}") 

oldest event : 2024-11-11T07:08:54.142Z
newest event : 2025-02-27T14:39:32.654Z


In [6]:
with open(f"SQL/dwh_msr/get_sample_test_result.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{samples}", str(tuple(deleted_samples)))

query_job = client.query(sql)
all_data = query_job.to_dataframe().drop_duplicates()
all_data.head()



Unnamed: 0,ak_key_msr_sample,lkp_key_lma_sample,id_msr_sample,ak_key_msr_test,lkp_lma_test,id_msr_test,ak_key_msr_result,lkp_lma_results,id_msr_result
0,17278,784903,9aea81b3-684d-4d22-8196-b14665610759,34404,1542583,8e40dabc-1f76-4c55-bfd4-2f72488dd7fb,61479,52254382,f89185d8-bb79-428e-a04b-1cfcce620cee
1,17278,784903,9aea81b3-684d-4d22-8196-b14665610759,34404,1542583,8e40dabc-1f76-4c55-bfd4-2f72488dd7fb,61480,52254383,4e8ee194-9a9e-4729-ad3e-eaef6fd80534
2,17455,792632,2c60a457-cdd2-4665-840a-66fca1507fa6,34721,1536013,01d3b7cd-a20d-4a1c-9237-0bf625134bb0,62469,51940281,2708a00f-bf6c-41ec-8d55-2fb7d301cbf9
3,17455,792632,2c60a457-cdd2-4665-840a-66fca1507fa6,34721,1536013,01d3b7cd-a20d-4a1c-9237-0bf625134bb0,62471,51940283,24565875-fb50-45a7-ba48-b2f513b70948
4,17455,792632,2c60a457-cdd2-4665-840a-66fca1507fa6,34721,1536013,01d3b7cd-a20d-4a1c-9237-0bf625134bb0,62472,51940284,c3029a82-3f9a-481f-8f89-989e156a0d06


In [7]:
impacted_lma_sample = all_data['lkp_key_lma_sample'].drop_duplicates()
print(f"Number of deleted lma sample : {all_data['lkp_key_lma_sample'].nunique()}\n")
impacted_lma_test = all_data['lkp_lma_test'].drop_duplicates()
impacted_lma_res = all_data['lkp_lma_results'].drop_duplicates()
print(f"{all_data['lkp_lma_results'].nunique()} lkp_lma_results in {all_data['lkp_lma_test'].nunique()} lkp_lma_test are linked to a deleted sample")

Number of deleted lma sample : 473

7858 lkp_lma_results in 462 lkp_lma_test are linked to a deleted sample


## 2. Duplicated data in msr dwh

### 2.1 Sample

In [8]:
want_to_group_also_by_version = True
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_sample.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_samples = query_job.to_dataframe().drop_duplicates()
nb_duplicated_sample = duplicated_samples['lkp_key_lma_sample'].nunique()
print(nb_duplicated_sample, 'lkp_lma_sample with several ak_key_msr_sample')



9759 lkp_lma_sample with several ak_key_msr_sample


In [None]:
duplicated_sample_not_deleted = duplicated_samples[~duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_not_deleted['lkp_key_lma_sample'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_sample_not_deleted.head()

9386 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
48,4,800486,3
60,2,753763,3
61,2,716653,3
62,2,702445,3
63,2,593400,3


In [10]:
duplicated_sample_w_deleted_sample = duplicated_samples[duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_w_deleted_sample['lkp_key_lma_sample'].nunique()} lma_sample with several msr_sample with a deleted sample event")
duplicated_sample_w_deleted_sample.head()

373 lma_sample with several msr_sample with a deleted sample event


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
0,5,788123,4
1,4,783740,3
2,4,783737,3
3,4,783736,3
4,4,783738,3


In [14]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / nb_duplicated_sample, 2)
print(f'duplicated sample are due to delete events less than {pc} % of the time')

duplicated sample are due to delete events less than 3.82 % of the time


In [15]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / len(impacted_lma_sample), 2)
print(f'{pc}% of deleted sample are duplicated')

78.86% of deleted sample are duplicated


In [16]:
for sample in impacted_lma_sample :
    if sample not in duplicated_samples['lkp_key_lma_sample'].to_list():
        print('example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles :', sample  )
        break

example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles : 784903


### 2.2 Test

In [17]:
want_to_group_also_by_version = True
version = 't.ver_lma_test,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_test.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_tests = query_job.to_dataframe().drop_duplicates()
nb_duplicated_test = duplicated_tests['lkp_lma_test'].nunique()
print(nb_duplicated_test, 'lkp_lma_test with several ak_key_msr_test')



10578 lkp_lma_test with several ak_key_msr_test


In [18]:
duplicated_test_not_deleted = duplicated_tests[~duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_not_deleted['lkp_lma_test'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_test_not_deleted.head()

10205 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
98,6,540935,3
99,5,458341,3
100,6,502461,3
101,5,527045,3
102,6,536436,3


In [19]:
duplicated_test_w_deleted_sample = duplicated_tests[duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_w_deleted_sample['lkp_lma_test'].nunique()} lma_test with several msr_test with a deleted sample event")
duplicated_test_w_deleted_sample.head()

373 lma_test with several msr_test with a deleted sample event


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
0,7,1523483,4
1,4,1662697,4
2,4,1662698,4
3,4,1662699,4
4,4,1662700,4


In [20]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / nb_duplicated_test)
print(f'duplicated test are due to delete events less than {pc} % of the time')

duplicated test are due to delete events less than 4 % of the time


In [21]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.56% of deleted sample are duplicated


### 2.3 Results

In [22]:
want_to_group_also_by_version = True
version = 'r.ver_lma_results,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_results.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_resuts = query_job.to_dataframe().drop_duplicates()
nb_duplicated_result = duplicated_resuts['lkp_lma_results'].nunique()
print(nb_duplicated_result, 'lkp_lma_results with several ak_key_msr_result')



176168 lkp_lma_results with several ak_key_msr_result


In [23]:
duplicated_resuts_not_deleted = duplicated_resuts[~duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_not_deleted['lkp_lma_results'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_resuts_not_deleted.head()

168548 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
0,11,51308514,6
198,11,51308513,4
646,13,47874104,3
647,13,47872002,3
648,13,47872024,3


In [24]:
duplicated_resuts_w_deleted_sample = duplicated_resuts[duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_w_deleted_sample['lkp_lma_results'].nunique()} lma_res with several msr_res with a deleted sample event")
duplicated_resuts_w_deleted_sample.head()

7620 lma_res with several msr_res with a deleted sample event


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
1,8,57454005,4
2,7,57454006,4
3,8,57453996,4
4,7,57453989,4
5,8,57454000,4


In [25]:
pc = round(100 * len(duplicated_resuts_w_deleted_sample) / nb_duplicated_result)
print(f'duplicated results are due to delete events less than {pc} % of the time')

duplicated results are due to delete events less than 6 % of the time


In [26]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.56% of deleted sample are duplicated


## 3. Duplicated data in SDDS