In [1]:
import pandas as pd
import json
from google.cloud import storage
from google.cloud import bigquery
client = bigquery.Client()

# Comparison deleted data vs duplicated data

## 1. Data linked to a deleted sample

In [55]:
#! pip install -r requirements.txt

In [57]:
with open(f"SQL/get_delete_events.sql", "r") as sql_file:
    sql = sql_file.read()

query_job = client.query(sql)
delete_events = query_job.to_dataframe().drop_duplicates()
nb_events, _ = delete_events.shape
print(nb_events, 'deletion events')
delete_events.head()

520 deletion events




Unnamed: 0,json_blob
0,"{""event_id"":""5d2de090-d3ee-4d8c-8428-7650f6ac4..."
1,"{""event_id"":""026eac3a-e06a-4aed-b5f0-025206596..."
2,"{""event_id"":""9412e6ad-def9-41fa-a570-5b3705cff..."
3,"{""event_id"":""14c31487-e31a-416d-b2d6-6f4e34daa..."
4,"{""event_id"":""71bef347-aa41-4628-8911-8ea29dc26..."


In [58]:
deleted_samples = []
dates = []
for i in range(nb_events):
    event = json.loads(delete_events['json_blob'][i])
    deleted_samples.append(str(event['data_key']))
    dates.append(event['event_date'])

deleted_samples = list(set(deleted_samples))
nb_u_samples = len(deleted_samples)
print(f"We have {nb_events} delete events on {nb_u_samples} samples")

We have 520 delete events on 442 samples


In [59]:
print(f"oldest event : {sorted(dates)[0]}") 
print(f"newest event : {sorted(dates)[len(dates)-1]}") 

oldest event : 2024-11-11T07:08:54.142Z
newest event : 2025-03-03T11:07:03.601Z


In [60]:
with open(f"SQL/dwh_msr/get_sample_test_result.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{samples}", str(tuple(deleted_samples)))

query_job = client.query(sql)
all_data = query_job.to_dataframe().drop_duplicates()
all_data.head()



Unnamed: 0,ak_key_msr_sample,lkp_key_lma_sample,id_msr_sample,ak_key_msr_test,lkp_lma_test,id_msr_test,ak_key_msr_result,lkp_lma_results,id_msr_result
0,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72987,47874974,0f2a6273-6c7d-47c3-aebe-ac93a880f35f
1,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72990,51310940,4f8e9486-5750-457d-aa1c-47e2a7dfbacc
2,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72993,47874973,99a0e474-8e66-40ea-8274-b43f505a77ff
3,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72992,47874973,e81c0a53-8860-416a-a908-a237c119fced
4,19501,650720,44c9cdbd-5cbd-4ca4-9dc4-0379e3618b64,38278,1438059,9a55f99a-3730-4363-8f23-ac929e0cd3f0,72991,47874973,e983e2c4-41b7-430e-937e-e16de48446d6


In [61]:
impacted_lma_sample = all_data['lkp_key_lma_sample'].drop_duplicates()
print(f"Number of deleted lma sample : {all_data['lkp_key_lma_sample'].nunique()}\n")
impacted_lma_test = all_data['lkp_lma_test'].drop_duplicates()
impacted_lma_res = all_data['lkp_lma_results'].drop_duplicates()
print(f"{all_data['lkp_lma_results'].nunique()} lkp_lma_results in {all_data['lkp_lma_test'].nunique()} lkp_lma_test are linked to a deleted sample")

Number of deleted lma sample : 480

7902 lkp_lma_results in 466 lkp_lma_test are linked to a deleted sample


## 2. Duplicated data in msr dwh

### 2.1 Sample

In [10]:
want_to_group_also_by_version = True
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_sample.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_samples = query_job.to_dataframe().drop_duplicates()

# SET TARGET ID 
target_id = "results_sample_id"
nb_duplicated_sample = duplicated_samples[target_id].nunique()
print(nb_duplicated_sample, f'{target_id} with several ak_key_msr_sample')

1229 results_sample_id with several ak_key_msr_sample


In [11]:
duplicated_sample_not_deleted = duplicated_samples[~duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_not_deleted['lkp_key_lma_sample'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_sample_not_deleted.head()

KeyError: 'lkp_key_lma_sample'

In [55]:
duplicated_sample_w_deleted_sample = duplicated_samples[duplicated_samples['lkp_key_lma_sample'].isin(impacted_lma_sample)]
print(f"{duplicated_sample_w_deleted_sample['lkp_key_lma_sample'].nunique()} lma_sample with several msr_sample with a deleted sample event")
duplicated_sample_w_deleted_sample.head()

389 lma_sample with several msr_sample with a deleted sample event


Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
0,5,788123,7
1,4,783751,6
2,4,804262,6
3,4,795842,6
4,4,804260,6


In [34]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / nb_duplicated_sample, 2)
print(f'duplicated sample are due to delete events less than {pc} % of the time')

duplicated sample are due to delete events less than 3.82 % of the time


In [35]:
pc = round(100 * len(duplicated_sample_w_deleted_sample) / len(impacted_lma_sample), 2)
print(f'{pc}% of deleted sample are duplicated')

78.86% of deleted sample are duplicated


In [36]:
for sample in impacted_lma_sample :
    if sample not in duplicated_samples['lkp_key_lma_sample'].to_list():
        print('example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles :', sample  )
        break

example of a lkp_key_msr_sample linked to a deleted ak_key_msr_sample without generate duplication troubles : 650720


### 2.2 Test

In [37]:
want_to_group_also_by_version = True
version = 't.ver_lma_test,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_test.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_tests = query_job.to_dataframe().drop_duplicates()
nb_duplicated_test = duplicated_tests['lkp_lma_test'].nunique()
print(nb_duplicated_test, 'lkp_lma_test with several ak_key_msr_test')



10579 lkp_lma_test with several ak_key_msr_test


In [38]:
duplicated_test_not_deleted = duplicated_tests[~duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_not_deleted['lkp_lma_test'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_test_not_deleted.head()

10205 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
8,6,536436,3
10,6,510570,3
14,5,458341,3
17,6,493532,3
22,5,539072,3


In [39]:
duplicated_test_w_deleted_sample = duplicated_tests[duplicated_tests['lkp_lma_test'].isin(impacted_lma_test)]
print(f"{duplicated_test_w_deleted_sample['lkp_lma_test'].nunique()} lma_test with several msr_test with a deleted sample event")
duplicated_test_w_deleted_sample.head()

374 lma_test with several msr_test with a deleted sample event


Unnamed: 0,ver_lma_test,lkp_lma_test,nb_msr_test
0,7,1523483,4
1,4,1662699,4
2,4,1662698,4
3,4,1662697,4
4,4,1662700,4


In [40]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / nb_duplicated_test)
print(f'duplicated test are due to delete events less than {pc} % of the time')

duplicated test are due to delete events less than 4 % of the time


In [41]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.6% of deleted sample are duplicated


### 2.3 Results

In [42]:
want_to_group_also_by_version = True
version = 'r.ver_lma_results,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_duplicated_results.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
duplicated_resuts = query_job.to_dataframe().drop_duplicates()
nb_duplicated_result = duplicated_resuts['lkp_lma_results'].nunique()
print(nb_duplicated_result, 'lkp_lma_results with several ak_key_msr_result')



176173 lkp_lma_results with several ak_key_msr_result


In [43]:
duplicated_resuts_not_deleted = duplicated_resuts[~duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_not_deleted['lkp_lma_results'].nunique()} lma_res with several msr_res without any deleted sample")
duplicated_resuts_not_deleted.head()

168548 lma_res with several msr_res without any deleted sample


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
0,11,51308514,6
1,11,51308513,4
199,10,47875741,3
200,10,47875334,3
201,10,47874676,3


In [44]:
duplicated_resuts_w_deleted_sample = duplicated_resuts[duplicated_resuts['lkp_lma_results'].isin(impacted_lma_res)]
print(f"{duplicated_resuts_w_deleted_sample['lkp_lma_results'].nunique()} lma_res with several msr_res with a deleted sample event")
duplicated_resuts_w_deleted_sample.head()

7625 lma_res with several msr_res with a deleted sample event


Unnamed: 0,ver_lma_results,lkp_lma_results,nb_msr_res
2,150,55503187,4
3,154,55503147,4
4,173,55503107,4
5,166,55503127,4
6,156,55503098,4


In [45]:
pc = round(100 * len(duplicated_resuts_w_deleted_sample) / nb_duplicated_result)
print(f'duplicated results are due to delete events less than {pc} % of the time')

duplicated results are due to delete events less than 6 % of the time


In [46]:
pc = round(100 * duplicated_test_w_deleted_sample['lkp_lma_test'].nunique() / len(impacted_lma_test), 2)
print(f'{pc}% of deleted sample are duplicated')

80.6% of deleted sample are duplicated


## 3. Comparison with lims output

In [2]:
lims_output_sample = pd.read_excel("Data/RM PCP LIMS - samples.xlsx")

In [4]:
lims_output_sample

Unnamed: 0,id,sample_type,external_id,sample_code,fabrication_code,batch_code_number,batch_code_supplier,sample_ec_code,sample_smiles_code,created_at,updated_at
0,22212,Refcomm,849910,55194284,B0000192703,B0000192703-00001231001,00001231001,,,2025-02-04T17:21:56.500Z,2025-02-04T17:21:56.500Z
1,22211,Refcomm,840091,D13736572,DGA2270177,DGA2270177-21LNC1021SQ89,21LNC1021SQ89,,,2025-02-04T16:32:41.296Z,2025-02-04T16:32:41.296Z
2,22210,Refcomm,849766,C34035984,DGA2270073,DGA2270073-0001968392,0001968392,,,2025-02-04T16:18:33.136Z,2025-02-04T16:18:33.136Z
3,22209,Refcomm,849773,53586881,B0000191247,B0000191247-0001897027,0001897027,,,2025-02-04T16:18:29.796Z,2025-02-04T16:18:29.796Z
4,22208,Refcomm,849767,C24504104,DGA2240002,DGA2240002-DGC21N0010,DGC21N0010,,,2025-02-04T16:18:22.673Z,2025-02-04T16:18:22.673Z
...,...,...,...,...,...,...,...,...,...,...,...
9312,12786,Refcomm,820359,63795945,B0000186839,B0000186839-20200729-100,20200729-100,,,2024-05-31T10:10:05.670Z,2024-05-31T10:10:05.670Z
9313,12785,Refcomm,820362,436072,B0000192345,B0000192345-23A25108-00,23A25108-00,,,2024-05-31T10:10:02.200Z,2024-05-31T10:10:02.200Z
9314,12784,Refcomm,820371,C24687885,DGA23N0080,DGA23N0080-0027451126,0027451126,,,2024-05-31T10:09:56.790Z,2024-05-31T10:09:56.790Z
9315,12783,Refcomm,824712,C20042883,DGA23D0069,DGA23D0069-DGK2350178,DGK2350178,,,2024-05-31T10:09:53.863Z,2024-05-31T10:09:53.863Z


### Identification de "id"

"id" correspond à id_msr_sample ? ak_key_msr ? sample_id ? => **ak_key_msr**

In [5]:
import db_dtypes

In [6]:
want_to_group_also_by_version = False
version = 'ver_lma_sample,' if want_to_group_also_by_version else ''

with open(f"SQL/dwh_msr/get_all_sample_IDs.sql", "r") as sql_file:
    sql_template = sql_file.read()

sql = sql_template.replace("{version}", version)
query_job = client.query(sql)
samples_ids = query_job.to_dataframe().drop_duplicates()

In [7]:
samples_ids.head()

Unnamed: 0,id_lma_sample,lkp_key_lma_sample,id_msr_sample,ak_key_msr_sample
0,ffff2e09-c00c-4f1f-a43d-be8cfe3f06ea,225345,f0cc9fb4-8c4b-4373-b7dd-e75618e8b930,33705
1,ffff2e09-c00c-4f1f-a43d-be8cfe3f06ea,225345,b8e27c25-9596-4aa7-a36e-38a2f48173db,41412
2,fffe8063-469a-42ee-aafb-36142bcf0586,32945,b20c26f4-b0e8-4740-8a84-89bcd0e06f85,41198
3,fffe8063-469a-42ee-aafb-36142bcf0586,32945,dd6ee7d9-5cf0-4eee-a20d-fe83e56de394,35780
4,fffaa8fb-60e1-406f-a60e-a2605f72b623,699372,08fc7700-6df5-4ade-8aa0-5e6ce16031cb,16151


In [8]:
unique_excel_ids = list(lims_output_sample["id"].unique())
unique_lkp_lma = list(samples_ids["lkp_key_lma_sample"].unique())
unique_ak_key_msr = list(samples_ids["ak_key_msr_sample"].unique())

In [9]:
candidate = unique_ak_key_msr

print("MIN")
print(min(candidate))
print(min(unique_excel_ids))
print("MAX")
print(max(candidate))
print(max(unique_excel_ids))

MIN
12782
12782
MAX
50359
22212


In [10]:
unique_ak_key_msr = [int(elt) for elt in unique_ak_key_msr]
unique_excel_ids = [int(elt) for elt in unique_excel_ids]
unique_lkp_lma = [int(elt) for elt in unique_lkp_lma]

In [11]:
ak_key = True
candidate = unique_ak_key_msr if ak_key else unique_lkp_lma
c= 0 
for elt in unique_excel_ids:
    if elt in candidate:
        c+= 1
print(f"Taux de présence de 'id' dans le candidat {c}/{len(unique_excel_ids)}")

Taux de présence de 'id' dans le candidat 9306/9317


### Doublons samples lkmp_key_lma liés à ak_key présentes dans XL ( Jointure propre Results x Test x Sample)

Sample en doublons vs sample du lims ? Combien sont dans les 2 ? Combien sont seulement en doublons ? Seulement issu du lims ? 

In [12]:
# GROUP BY VERSION ? 
want_to_group_also_by_version = True

In [13]:
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''
with open(f"SQL/dwh_msr/get_duplicated_sample.sql", "r") as sql_file:
    sql_template = sql_file.read()

# Replace placeholders with actual values
sql = sql_template.format(version= version)
query_job = client.query(sql)
duplicated_samples = query_job.to_dataframe().drop_duplicates()

if len(duplicated_samples)>0:
    nb_duplicated_sample = duplicated_samples["lkp_key_lma_sample"].nunique()
    print(nb_duplicated_sample, f'lkp key lma sample with several id msr sample')
else:
    print(0, f'lkp with several id msr')

10 lkp key lma sample with several id msr sample


In [14]:
duplicated_samples.head()

Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,nb_msr_sample
0,4,862598,3
1,1,783977,2
2,2,783976,2
3,6,824788,2
4,6,824787,2


Now select ak_keys corresponding to the listed lkp keys 

In [15]:
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''
with open(f"SQL/dwh_msr/get_target_ak_keys.sql", "r") as sql_file:
    sql_template = sql_file.read()

# Replace placeholders with actual values
sql = sql_template.format(version= version)
query_job = client.query(sql)
duplicated_ak_keys_df = query_job.to_dataframe().drop_duplicates()

In [16]:
duplicated_ak_keys_df.head()

Unnamed: 0,ver_lma_sample,lkp_key_lma_sample,ak_key_msr_sample,updated_at
0,1,783977,17357,2024-07-31 09:07:03.476000+00:00
1,2,783976,17358,2024-07-31 09:07:08.456000+00:00
2,6,824788,17120,2024-06-24 23:03:56.676000+00:00
3,6,824787,17121,2024-06-24 23:03:58.560000+00:00
4,6,824787,19903,2024-08-27 09:17:57.156000+00:00


Update this part after 

In [17]:
duplicated_ak_keys = [ int(elt) for elt in duplicated_ak_keys_df["ak_key_msr_sample"].unique()]
unique_excel_ids = list(lims_output_sample["id"].unique())

In [18]:
print(f"{len(duplicated_ak_keys)} ak_key_msr_sample in DWH MSR matching lkp_key_lma_sample with several id msr")
print(f"{len(unique_excel_ids)} IDs (corresponding to msr_ak_key) in Excel extract")

21 ak_key_msr_sample in DWH MSR matching lkp_key_lma_sample with several id msr
9317 IDs (corresponding to msr_ak_key) in Excel extract


In [19]:
# Elements communs 
elements_communs = 0 
c = 0

for elt in duplicated_ak_keys:
    if elt in unique_excel_ids:
        elements_communs +=1 
    else:
        if c==0:
            print(f"{elt} est une ak_key_msr_sample pas dans le Excel alors que son lkp_key_lma_sample associé a plusieurs id_msr_sample \n")
            c+=1


print("EXTRAITS EXCEL SAMPLE LIMS vs DOUBLONS SAMPLE DWH MSR")
print("La colonne 'id' de l'extrait Excel correspond à 'ak_key_msr_sample'")
print(f"{elements_communs} communs entre les 'id' Excel sample LIMS et les ak_key_msr_sample associées aux lkp_key_lma_sample avec plusieurs id_msr_sample DWH MSR ('doublons')")
print(f"{len(unique_excel_ids)-elements_communs} 'id' uniquement dans l'Excel sample LIMS")
print(f"{len(duplicated_ak_keys)-elements_communs} 'ak_key_msr_sample' uniquement dans les doublons sample DWH MSR")


17357 est une ak_key_msr_sample pas dans le Excel alors que son lkp_key_lma_sample associé a plusieurs id_msr_sample 

EXTRAITS EXCEL SAMPLE LIMS vs DOUBLONS SAMPLE DWH MSR
La colonne 'id' de l'extrait Excel correspond à 'ak_key_msr_sample'
9 communs entre les 'id' Excel sample LIMS et les ak_key_msr_sample associées aux lkp_key_lma_sample avec plusieurs id_msr_sample DWH MSR ('doublons')
9308 'id' uniquement dans l'Excel sample LIMS
12 'ak_key_msr_sample' uniquement dans les doublons sample DWH MSR


### Doublons samples lkmp_key_lma liés à ak_key présentes dans XL ( Jointure directe Results x Sample via r.sample_id)

In [20]:
want_to_group_also_by_version = True

In [21]:
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''
with open(f"SQL/dwh_msr/get_duplicated_samples_direct_join.sql", "r") as sql_file:
    sql_template = sql_file.read()

# Replace placeholders with actual values
sql = sql_template.format(version= version)
query_job = client.query(sql)
samples_duplicated_via_sample_id = query_job.to_dataframe().drop_duplicates()

In [22]:
if len(samples_duplicated_via_sample_id)>0:
    nb_duplicated_sample = samples_duplicated_via_sample_id["lkp_key_lma_sample"].nunique()
    print(nb_duplicated_sample, f'lkp key lma sample with several id msr sample when direct join via r.sample_id')
else:
    print(0, f'lkp with several id msr')

10 lkp key lma sample with several id msr sample when direct join via r.sample_id


In [23]:
version = 's.ver_lma_sample,' if want_to_group_also_by_version else ''
with open(f"SQL/dwh_msr/get_target_ak_keys_direct_join.sql", "r") as sql_file:
    sql_template = sql_file.read()

# Replace placeholders with actual values
sql = sql_template.format(version= version)
query_job = client.query(sql)
target_ak_keys_direct_join = query_job.to_dataframe().drop_duplicates()

In [24]:
duplicated_ak_keys_direct_join = [ int(elt) for elt in target_ak_keys_direct_join["ak_key_msr_sample"].unique()]
unique_excel_ids = list(lims_output_sample["id"].unique())

In [28]:
print(f"{len(duplicated_ak_keys_direct_join)} ak_key_msr_sample in DWH MSR (direct join results-sample) matching lkp_key_lma_sample with several id msr")
print(f"{len(unique_excel_ids)} IDs (corresponding to msr_ak_key) in Excel extract")

21 ak_key_msr_sample in DWH MSR (direct join results-sample) matching lkp_key_lma_sample with several id msr
9317 IDs (corresponding to msr_ak_key) in Excel extract


In [29]:
# Elements communs 
elements_communs = 0 
c = 0

for elt in duplicated_ak_keys_direct_join:
    if elt in unique_excel_ids:
        elements_communs +=1 
    else:
        if c==0:
            print(f"{elt} est une ak_key_msr_sample pas dans le Excel alors que son lkp_key_lma_sample est associé a plusieurs id_msr_sample \n")
            c+=1


print("EXTRAITS EXCEL SAMPLE LIMS vs DOUBLONS SAMPLE DWH MSR (DIRECT JOIN results-sample)")
print("La colonne 'id' de l'extrait Excel correspond à 'ak_key_msr_sample'")
print(f"{elements_communs} communs entre les 'id' Excel sample LIMS et les ak_key_msr_sample associées aux lkp_key_lma_sample avec plusieurs id_msr_sample DWH MSR ('doublons')")
print(f"{len(unique_excel_ids)-elements_communs} 'id' uniquement dans l'Excel sample LIMS")
print(f"{len(duplicated_ak_keys)-elements_communs} 'ak_key_msr_sample' uniquement dans les doublons sample DWH MSR")

17357 est une ak_key_msr_sample pas dans le Excel alors que son lkp_key_lma_sample est associé a plusieurs id_msr_sample 

EXTRAITS EXCEL SAMPLE LIMS vs DOUBLONS SAMPLE DWH MSR (DIRECT JOIN results-sample)
La colonne 'id' de l'extrait Excel correspond à 'ak_key_msr_sample'
9 communs entre les 'id' Excel sample LIMS et les ak_key_msr_sample associées aux lkp_key_lma_sample avec plusieurs id_msr_sample DWH MSR ('doublons')
9308 'id' uniquement dans l'Excel sample LIMS
12 'ak_key_msr_sample' uniquement dans les doublons sample DWH MSR


Statuer sur la différence de résultats observés entre la jointure propre et le direct join

### Logs inspection

In [26]:
# Initialize an empty list to store the numbers
numbers_after_update = []

# Open and read the file
with open('Data/logs.log', 'r') as file:
    for line in file:
        # Split the line into words
        words = line.split()
        """
        # Check if 'UPDATE' is in the line and followed by a number
        if 'UPDATE' in words:
            index = words.index('UPDATE')
            if index + 1 < len(words) and words[index + 1].isdigit():
                # Append the number to the list
                numbers_after_update.append(int(words[index + 1]))
        """
        # Check if 'publish' is in the line and followed by a number
        if 'publish' in words:
            index = words.index('publish')
            if index + 1 < len(words) and words[index + 1].isdigit():
                # Append the number to the list
                numbers_after_update.append(int(words[index + 1]))

# Drop duplicates in list with set conversion 
numbers_after_update = list(set(numbers_after_update))

FileNotFoundError: [Errno 2] No such file or directory: 'Data/logs.log'

In [42]:
with open(f"SQL/dwh_msr/get_recent_msr_events.sql", "r") as sql_file:
    sql_template = sql_file.read()

# Replace placeholders with actual values
query_job = client.query(sql_template)
recent_msr_inputs = query_job.to_dataframe().drop_duplicates()



In [43]:
recent_msr_inputs["lkp_lma_test"].iloc[0]

'1513061'

In [44]:
inputs = [int(elt) for elt in recent_msr_inputs["lkp_lma_test"]]

In [47]:
c=0
lkp_lma_errors = []

for elt in numbers_after_update:
    if elt in inputs:
        c+=1
    else:
        lkp_lma_errors.append(elt)

print(f"{c}/{len(numbers_after_update)} lkp_lma_test from logs['publish'] present in MSR DWH")

372/1664 lkp_lma_test from logs['publish'] present in MSR DWH


In [52]:
len(lkp_lma_errors)

1292

In [50]:
# Open the file in write mode
with open('Data/publish_errors.txt', 'w') as file:
    # Write each error on a new line
    for error in lkp_lma_errors:
        file.write(str(error) + '\n')

In [53]:
# Open the file in write mode
with open('Data/published_lkp_lma_key.txt', 'w') as file:
    # Write each elt on a new line
    for elt in numbers_after_update:
        file.write(str(elt) + '\n')