In [1]:
# import packages for transformations
import sqlite3
import pandas as pd
from pathlib import Path


# Check connection to markdown


In [2]:
# 1. Working directory of the kernel (Path.cwd() is where the notebook was
#    started, which is not necessarily where the notebook file lives)
current_dir = Path.cwd()

# 2. The project root is taken to be one level above the working directory
project_root = current_dir.parent

# 3. Location of the SQLite database file inside the project
db_path = project_root / "database" / "dataverse_complete_zuhdil.db"

# 4. Open the connection to the database
try:
    # sqlite3.connect() creates a new, empty database when the file does not
    # exist, so a wrong path would still be reported as a success and only
    # fail later with "no such table". Check for the file explicitly first.
    if not db_path.is_file():
        raise FileNotFoundError(f"No database file at {db_path}")
    conn = sqlite3.connect(str(db_path))
    print("✅ Connection Successful!")
    print(f"Connected to: {db_path}")
except (sqlite3.Error, OSError) as e:
    # Only connection/file problems are reported here; anything else is a
    # programming error and should surface with its traceback.
    print(f"❌ Still failing. Looking at: {db_path}")
    print(f"Error: {e}")



✅ Connection Successful!
Connected to: /Users/lottesavelberg/Documents/Akvo/IGH/igh-data-transform/src/igh_data_transform/database/dataverse_complete_zuhdil.db


In [3]:
# Re-open the connection that the rest of the notebook uses.
# Close the handle left over from the connectivity check first; rebinding
# `conn` without closing it would leave a second connection to the same file
# open until it is garbage collected.
try:
    conn.close()
except NameError:
    # No earlier connection exists (the check cell failed or was not run)
    pass
conn = sqlite3.connect(str(db_path))

### Load the Candidates table

In [4]:
# 1. Name of the candidates table and a full read of it into a DataFrame
table_name = "vin_candidates"
df = pd.read_sql_query(sql=f"SELECT * FROM {table_name}", con=conn)

# 2. Confirm the load; the first rows are shown as the cell output
print(f"Table' {table_name} was successfully loaded!")
df.head()

Table' vin_candidates was successfully loaded!


Unnamed: 0,row_id,vin_name,new_sbereviewstatus,new_potentialforacceleratedorconditionalregulator,vin_stringentregulatoryauthoritysraapprovalda,crc8b_updatedforipps20,vin_targettoxinclass,vin_includeinevgendatabase,new_platform,new_personslivingwithhiv,...,vin_approvingauthority,_createdonbehalfby_value,_vin_clinicalusestatus_value,new_includeinpipeline2021,_vin_routeofadministration_value,new_rdstage,json_response,sync_time,valid_from,valid_to
0,1,DPP Fever Panel II Asia IgM,,,,862890000.0,,909670000.0,Lateral Flow Assay,,...,,,,,,,"{""@odata.etag"": ""W/\""110768105\"""", ""vin_meshhe...",2025-12-19T03:14:27.843601+00:00,2025-12-17T17:48:00Z,
1,2,TRURAPID MPXV Ag Test,,,,862890000.0,,909670000.0,Lateral Flow Assay,,...,,,,,,,"{""@odata.etag"": ""W/\""110768107\"""", ""vin_meshhe...",2025-12-19T03:14:28.359554+00:00,2025-12-17T17:48:00Z,
2,3,NABIT Mpox Test,,,,862890000.0,,909670000.0,qRT-PCR,,...,,,,,,,"{""@odata.etag"": ""W/\""110768091\"""", ""vin_meshhe...",2025-12-19T03:14:28.418569+00:00,2025-12-17T17:48:00Z,
3,4,MPV Ag Rapid Test,,,,862890000.0,,909670000.0,Lateral Flow Assay,,...,,,,,,,"{""@odata.etag"": ""W/\""110768093\"""", ""vin_meshhe...",2025-12-19T03:14:28.480738+00:00,2025-12-17T17:48:00Z,
4,5,SGTi-flex Mpox Ag,,,,862890000.0,,909670000.0,ELISA/EIA,,...,,,,,,,"{""@odata.etag"": ""W/\""110768109\"""", ""vin_meshhe...",2025-12-19T03:14:28.512262+00:00,2025-12-17T17:48:00Z,


### Execute transformation

In [5]:
# 1. Profile every column: distinct values, missing cells and dtype
stats = [
    {
        'column_name': col,
        'unique_values': df[col].nunique(),
        'empty_cells': df[col].isnull().sum(),
        'data_type': df[col].dtypes,
    }
    for col in df.columns
]

# 2. Turn the profile into a data frame and
# 3. order it by number of unique values (highest first) so that candidate
#    index columns show up at the top
df_stats = pd.DataFrame(stats).sort_values(by='unique_values', ascending=False)

# 4. Show the summary
print("Column Quality Summary:")
display(df_stats)

Column Quality Summary:


Unnamed: 0,column_name,unique_values,empty_cells,data_type
0,row_id,9579,0,int64
244,sync_time,9579,0,object
243,json_response,9579,0,object
43,versionnumber,9579,0,int64
190,vin_candidateid,9383,0,object
...,...,...,...,...
13,crc8b_sourceofipps20update,0,9579,object
187,new_vin_snakespeciesagainst,0,9579,object
143,vin_2019notes,0,9579,object
44,new_knownfunders2021,0,9579,object


In [6]:
# Use versionnumber as the row index.
# The guard keeps the cell re-runnable: after the first run 'versionnumber'
# is the index and no longer a column, so an unguarded set_index would raise
# KeyError on a second execution.
if 'versionnumber' in df.columns:
    # verify_integrity=True raises ValueError if the labels are not unique,
    # so a non-unique key is caught here rather than corrupting later lookups.
    df = df.set_index('versionnumber', verify_integrity=True)

In [7]:
# Remove the columns that contain no data at all
# 1. Collect the names of the columns in which every cell is null
empty_cols = [name for name, is_empty in df.isna().all().items() if is_empty]

# 2. Report which columns are empty
if empty_cols:
    print(f"There are {len(empty_cols)} empty columns")
    print("\n".join(f" - {col}" for col in empty_cols))
else:
    print("No columns are empty")

# 3. Keep the raw frame untouched and store the reduced copy separately
df_transformed = df.drop(columns=empty_cols)


There are 19 empty columns
 - crc8b_sourceofipps20update
 - new_knownfunders2021
 - overriddencreatedon
 - _owningteam_value
 - new_developers2021
 - new_includeinpipeline2025
 - utcconversiontimezonecode
 - _new_archetype_simple_value
 - new_developers2025
 - vin_2019notes
 - new_knownfunders2025
 - new_rdstage2021
 - new_vin_snakespeciesagainst
 - _modifiedonbehalfby_value
 - vin_sbereviewcompleted
 - vin_typehidden
 - _createdonbehalfby_value
 - _vin_routeofadministration_value
 - new_rdstage


In [8]:
# Print the complete list of column names so each one can be reviewed in turn
print(df.columns.to_list())

['row_id', 'vin_name', 'new_sbereviewstatus', 'new_potentialforacceleratedorconditionalregulator', 'vin_stringentregulatoryauthoritysraapprovalda', 'crc8b_updatedforipps20', 'vin_targettoxinclass', 'vin_includeinevgendatabase', 'new_platform', 'new_personslivingwithhiv', 'vin_id', 'vin_knownfundersaggregated', 'crc8b_ghtcroireviewstatus', 'crc8b_sourceofipps20update', 'vin_2019candidateidnumber', 'new_medicated', 'crc8b_ndpipelinereviewstatus', 'vin_numberofcountrieswithproductapproval_date', 'new_vin_whosnakespeciesriskcatagainst', 'vin_casnumber', 'new_exportgroup', 'crc8b_ndpipelinereviewdate', 'vin_2019stagepcr', 'vin_approvalstatus', 'new_pipscomments', 'vin_venomspecificity', 'new_adjuvantrequirement', 'new_pressuretype', 'new_ctregistrylink', 'modifiedon', 'vin_approvedforuseinpregnantorlactatingwomen', 'new_technologyprinciple', 'new_ctenddate', 'vin_2019developers', 'vin_usfdaapprovaldate', 'vin_alternativenames', 'new_whreviewdate', '_modifiedby_value', 'vin_reviewpersonaggre

In [9]:
def inspect_column(df, column_name):
    """
    Print a short quality report for one column of a DataFrame.

    The report contains the number of unique values, the dtype, the number of
    filled and empty cells (with the share of empty cells) and then either
    every distinct non-null value, sorted, when there are fewer than 20 of
    them, or the 10 most frequent values otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Name of the column to inspect. A message is printed and nothing else
        happens when the column does not exist.

    Returns
    -------
    None
        All results are printed.
    """
    # 1. Check if column exists
    if column_name not in df.columns:
        print(f"❌ Column '{column_name}' not found.")
        return

    print(f"--- Statistics for: {column_name} ---")

    # 2. Basic stats & data type
    column = df[column_name]
    unique_vals = column.nunique()
    empty_cells = column.isnull().sum()
    filled_cells = column.count()
    dtype = column.dtype

    print(f"Unique values: {unique_vals} | Data type: {dtype} | filled cells: {filled_cells}" )

    # 3. Percentage of missing data. An empty frame has no rows to divide by;
    #    report 0% instead of dividing by zero (which printed "nan%").
    total_rows = len(df)
    null_pct = (empty_cells / total_rows) * 100 if total_rows else 0.0
    print(f"Empty cells: {empty_cells} ({null_pct:.2f}%)")

    # 4. Show all unique values when there are few of them (categorical data)
    if unique_vals < 20:
        print("\n--- All unique values (<20) ---")
        unique_list = column.dropna().unique()
        try:
            ordered = sorted(unique_list)
        except TypeError:
            # Object columns can mix types (e.g. numbers and strings) that
            # cannot be compared with each other; group by type name and
            # order by text form so the report is still produced.
            ordered = sorted(
                unique_list,
                key=lambda value: (type(value).__name__, str(value)),
            )
        print(ordered)
    else:
        # value_counts() orders by frequency, so these are the 10 most
        # frequent values
        print("\n--- First 10 values ---")
        print(column.value_counts().head(10))

    print("-" * 40 + "\n")

In [10]:
# Inspect a single column of the raw frame
inspect_column(df, column_name="new_knownfunders2025")

--- Statistics for: new_knownfunders2025 ---
Unique values: 0 | Data type: object | filled cells: 0
Empty cells: 9579 (100.00%)

--- All unique values (<20) ---
[]
----------------------------------------



In [12]:
# Remove the columns that are not needed downstream and give the remaining
# ones shorter names without the source-system prefixes.
# row_id is removed as well (original note: too many index values in the
# dataset).
# NOTE: the drop raises KeyError when a listed column is missing, so running
# this cell a second time on the already reduced frame fails.
df_transformed = (
    df_transformed
    .drop(columns=[
        "row_id",
        "new_potentialforacceleratedorconditionalregulator",
        "crc8b_updatedforipps20",
        "vin_stringentregulatoryauthoritysraapprovalda",
        "vin_includeinevgendatabase",
        "new_platform",
        "new_personslivingwithhiv",
        "vin_id",
        "crc8b_ghtcroireviewstatus",
        "vin_2019candidateidnumber",
        "new_medicated",
        "crc8b_ndpipelinereviewstatus",
        "vin_numberofcountrieswithproductapproval_date",
        "vin_casnumber",
        "new_exportgroup",
        "crc8b_ndpipelinereviewdate",
        "vin_2019stagepcr",
        "new_pipscomments",
        "new_adjuvantrequirement",
        "new_whreviewdate",
        "_modifiedby_value",
        "vin_reviewpersonaggregated",
        "vin_iggformatanimalderived",
        "new_durationofaction",
        "new_reviewstatus",
        "vin_fdapregnancylabelingpregnancyrisksummary",
        "_vin_captype_value",
        "new_regionofregistration",
        "new_estimateddateofregulatoryfiling",
        "vin_regionspecificityaggregated",
        "vin_sbereviewdate",
        "new_2023includeinevgendatabase",
        "vin_currentrdstage",
        "vin_evgenreviewcompleted",
        "timezoneruleversionnumber",
        "new_testformat",
        "new_aim1clinicalusestatus",
        "new_updatedforaim20",
        "new_includeinaim1",
        "_createdby_value",
        "vin_evgenreviewdate",
        "crc8b_includeinipps20",
        "new_thermostabilityandstorage",
        "new_atcclassification",
        "vin_meshheadings",
        "vin_directactionontoxins",
        "vin_duplicateentrycapformorethanonedisease",
        "importsequencenumber",
        "crc8b_srhreviewdate",
        "createdon",
        "statuscode",
        "crc8b_updatedforndpipeline",
        "vin_2019archetype",
        "vin_nationalregulatoryauthorityapprovaldate",
        "new_tppreviewrequired",
        "vin_otherindications",
        "vin_includeinwellcomesbedatabase",
        "new_includeinportal2025",
        "new_aim1archetype",
        "new_reviewdateipps30",
        "new_chimstudyyesno",
        "vin_productiontechniqueandorimmunizationstrat",
        "new_ipps30reviewstatus",
        "vin_reviewnotes",
        "new_profilestatus",
        "vin_adisid",
        "new_mamedicinesubtype",
        "crc8b_includeinghtcroi",
        "new_includeinwhpipeline",
        "vin_2019pcrpipelineinclusion",
        "vin_adisurl",
    ])
    # Rename: keys that are not present in the frame are ignored by rename()
    .rename(columns={
        "vin_name": "candidate_name",
        "new_sbereviewstatus": "sbereviewstatus",
        "vin_targettoxinclass": "targettoxinclass",
        "vin_knownfundersaggregated": "knownfundersaggregated",
        "new_vin_whosnakespeciesriskcatagainst": "whosnakespeciesriskcatagainst",
        "vin_approvalstatus": "approvalstatus",
        "vin_venomspecificity": "venomspecificity",
        "new_pressuretype": "pressuretype",
        "new_ctregistrylink": "ctregistrylink",
        "vin_approvedforuseinpregnantorlactatingwomen": "approvedforuseinpregnantorlactatingwomen",
        "new_technologyprinciple": "technologyprinciple",
        "new_ctenddate": "ctenddate",
        "vin_2019developers": "2019developers",
        "vin_usfdaapprovaldate": "usfdaapprovaldate",
        "vin_alternativenames": "alternativenames",
        "vin_product": "product",
        "vin_researchedinpregnantwomenorlactatingwomen": "researchedinpregnantwomenorlactatingwomen",
        "vin_target": "target",
        "vin_emaapprovalstatus": "emaapprovalstatus",
        "vin_inactivedevelopmentreason": "inactivedevelopmentreason",
        "new_2024knownfunders": "2024knownfunders",
        "vin_emaapprovaldate": "emaapprovaldate",
        "vin_previouslyidentifiedcandidate": "previouslyidentifiedcandidate",
        "new_2023knownfundersaggregated": "2023knownfunders",
        "crc8b_srhindication": "WH_indication",
        "vin_developersaggregated": "developersaggregated",
        "vin_japanesemhlwapprovalstatus": "japanesemhlwapprovalstatus",
        "vin_typeofpreclinicalresults": "typeofpreclinicalresults",
        "vin_stringentregulatoryauthorityapproval": "SRA_approvalstatus",
        "vin_whoprequalificationdate": "whoprequalificationdate",
        "vin_technologytype": "technologytype",
        "new_snakespeciesagainst": "snakespeciesagainst",
        "vin_numberofcountrieswithproductapproval": "numberofcountrieswithproductapproval",
        "new_developers2025": "developers2025",
        "vin_indication": "indication",
        "vin_snakespecies": "snakespecies",
        "vin_specifictargettoxinclass": "specifictargettoxinclass",
        "new_ctstartdate": "ctstartdate",
        "vin_chemicalname": "chemicalname",
        "vin_specimentype": "specimentype",
        "new_indicationtype": "indicationtype",
        "vin_mechanismofaction": "mechanismofaction",
        "vin_recentupdates": "recentupdates",
        "new_knownfunders2025": "knownfunders2025",
    })
)



In [13]:
# Harmonise the pressure type categories: the labels with a trailing space
# are mapped to their clean form, and 'Not applicable ' becomes 'N/A'
df_transformed['pressuretype'] = df_transformed['pressuretype'].replace(
    to_replace={
        'Negative pressure ': 'Negative pressure',
        'Positive pressure ': 'Positive pressure',
        'Not applicable ': 'N/A',
    }
)

In [14]:
# Harmonise the product categories
# First list every raw product label together with the number of rows using it
status_counts = df['vin_product'].value_counts()
print(status_counts)

# Then fold singular spellings and sub-categories into the main categories
df_transformed['product'] = df_transformed['product'].replace(
    to_replace={
        'Dietary supplement': 'Dietary supplements',
        'Diagnostic': 'Diagnostics',
        'Drug': 'Drugs',
        'Functional foods': 'Dietary supplements',
        'Microbial interventions': 'Microbicides',
        'Chemical vector control products': 'VCP',
        'Biological vector control products': 'VCP',
        'Vector control products Reservoir targeted vaccines': 'VCP',
        'Vector control products': 'VCP',
        'Reservoir targeted vaccines': 'Vaccines',
    }
)



vin_product
Diagnostics                                            3853
Drugs                                                  2424
Vaccines                                               1626
Biologics                                               889
Dietary supplements                                     245
Chemical vector control products                        107
Devices                                                  97
Microbicides                                             27
Biological vector control products                       11
Diagnostic                                                6
Vector control products Reservoir targeted vaccines       4
Drug                                                      2
Vector control products                                   2
Functional foods                                          2
Reservoir targeted vaccines                               2
Dietary supplement                                        2
Microbial interventions     

In [15]:
# Explore the technology type categories of the raw frame
# Distinct non-null technology types, sorted in place
tech_list = df['vin_technologytype'].dropna().unique()
tech_list.sort()

# Row count per technology type, ordered by label (computed but not shown
# in this cell)
status_counts = df['vin_technologytype'].value_counts().sort_index()

# Number of distinct technology types
print(len(tech_list))


597


In [16]:
# Preview the first five rows of the transformed frame
df_transformed.head(n=5)

Unnamed: 0_level_0,candidate_name,sbereviewstatus,targettoxinclass,knownfundersaggregated,whosnakespeciesriskcatagainst,approvalstatus,venomspecificity,pressuretype,ctregistrylink,modifiedon,...,vin_iggformatrecombinant,new_ctregistrylink3,vin_includeinp2imodel,vin_approvingauthority,_vin_clinicalusestatus_value,new_includeinpipeline2021,json_response,sync_time,valid_from,valid_to
versionnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110768105,DPP Fever Panel II Asia IgM,,,,,,,,,2025-12-17T17:48:00Z,...,,,,,,,"{""@odata.etag"": ""W/\""110768105\"""", ""vin_meshhe...",2025-12-19T03:14:27.843601+00:00,2025-12-17T17:48:00Z,
110768107,TRURAPID MPXV Ag Test,,,,,,,,,2025-12-17T17:48:00Z,...,,,,,,,"{""@odata.etag"": ""W/\""110768107\"""", ""vin_meshhe...",2025-12-19T03:14:28.359554+00:00,2025-12-17T17:48:00Z,
110768091,NABIT Mpox Test,,,,,,,,,2025-12-17T17:48:00Z,...,,,,,,,"{""@odata.etag"": ""W/\""110768091\"""", ""vin_meshhe...",2025-12-19T03:14:28.418569+00:00,2025-12-17T17:48:00Z,
110768093,MPV Ag Rapid Test,,,,,,,,,2025-12-17T17:48:00Z,...,,,,,,,"{""@odata.etag"": ""W/\""110768093\"""", ""vin_meshhe...",2025-12-19T03:14:28.480738+00:00,2025-12-17T17:48:00Z,
110768109,SGTi-flex Mpox Ag,,,,,,,,,2025-12-17T17:48:00Z,...,,,,,,,"{""@odata.etag"": ""W/\""110768109\"""", ""vin_meshhe...",2025-12-19T03:14:28.512262+00:00,2025-12-17T17:48:00Z,
