In [2]:
# import packages for transformations
import sqlite3
from cmath import nan

import pandas as pd
from pathlib import Path


### Tables changed in this document

vin_diseases


### Set connection to DB

In [4]:
# 1. Working directory of the running kernel (Path.cwd() is NOT the notebook
#    file's own location; it is wherever the kernel was started).
current_dir = Path.cwd()

# 2. The project root is taken to be the parent of the working directory
project_root = current_dir.parent

# 3. Location of the SQLite file inside the project's database folder
db_path = project_root / "database" / "dataverse_complete.db"

# 4. Open the connection to the database.
#    sqlite3.connect() silently creates a new, empty database when the file is
#    missing, so a wrong path would otherwise be reported as a success and only
#    fail later with "no such table". Check for the file explicitly first.
if not db_path.is_file():
    raise FileNotFoundError(f"Database file not found: {db_path}")

try:
    conn = sqlite3.connect(str(db_path))
except sqlite3.Error as e:
    print(f"❌ Still failing. Looking at: {db_path}")
    print(f"Error: {e}")
    # Re-raise: every following cell needs `conn`, so continuing would only
    # postpone the failure to a less informative NameError.
    raise
else:
    print("✅ Connection Successful!")
    print(f"Connected to: {db_path}")

✅ Connection Successful!
Connected to: /Users/lottesavelberg/Documents/Akvo/IGH/igh-data-transform/src/igh_data_transform/database/dataverse_complete.db


In [5]:
# Re-open the connection to the same database file; `conn` is rebound to the new handle
conn = sqlite3.connect(db_path)

In [20]:
# Read the complete vin_diseases table into a DataFrame
table_name = "vin_diseases"
df = pd.read_sql_query(sql=f"SELECT * FROM {table_name}", con=conn)

# Confirm the load and preview the first rows
print(f"Table' {table_name} was successfully loaded!")
df.head()

Table' vin_diseases was successfully loaded!


Unnamed: 0,row_id,_vin_subproduct_value,vin_disease,createdon,modifiedon,_organizationid_value,crc8b_addedclinicalvalue,crc8b_tppppc,crc8b_addedclinicalvaluedescription,crc8b_p2iproductlaunch,...,_modifiedby_value,vin_diseaseid,_vin_maindisease_value,new_incl_nd,new_globalhealtharea,overriddencreatedon,json_response,sync_time,valid_from,valid_to
0,1,,Hantaan virus,2025-08-06T00:34:18Z,2025-08-22T08:24:38Z,98f8a63d-c50c-43c6-b8da-41598ca3e709,,0.0,,0.0,...,de354ebe-bdbd-eb11-bacc-00224818186e,6474e313-5d72-f011-b4cd-000d3a6a2e50,,0.0,100000001,,"{""_createdby_value"": ""d535fe41-1777-eb11-a812-...",2026-01-09T12:01:26.168252+00:00,2025-08-22T08:24:38Z,
1,2,,Hantaan virus,2025-08-06T02:49:14Z,2025-08-22T08:24:38Z,98f8a63d-c50c-43c6-b8da-41598ca3e709,,0.0,,0.0,...,de354ebe-bdbd-eb11-bacc-00224818186e,eb3d81ed-6f72-f011-b4cd-000d3a6a2e50,4720ff32-bcb0-ec11-983f-002248155108,0.0,100000001,,"{""_createdby_value"": ""d535fe41-1777-eb11-a812-...",2026-01-09T12:01:26.208328+00:00,2025-08-22T08:24:38Z,
2,3,,Hantaan virus,2025-08-06T02:50:25Z,2025-08-22T08:24:38Z,98f8a63d-c50c-43c6-b8da-41598ca3e709,,0.0,,0.0,...,de354ebe-bdbd-eb11-bacc-00224818186e,9e699f18-7072-f011-b4cd-000d3a6a2e50,4720ff32-bcb0-ec11-983f-002248155108,0.0,100000001,,"{""_createdby_value"": ""d535fe41-1777-eb11-a812-...",2026-01-09T12:01:26.221607+00:00,2025-08-22T08:24:38Z,
3,4,,Influenza A,2025-08-06T03:15:22Z,2025-08-22T08:24:38Z,98f8a63d-c50c-43c6-b8da-41598ca3e709,,0.0,,0.0,...,de354ebe-bdbd-eb11-bacc-00224818186e,c5ad3b90-7372-f011-b4cd-000d3a6a2e50,68f088d0-7272-f011-b4cd-000d3acbf47d,0.0,100000001,,"{""_createdby_value"": ""d535fe41-1777-eb11-a812-...",2026-01-09T12:01:26.228798+00:00,2025-08-22T08:24:38Z,
4,5,,Yaws,2024-11-27T01:25:00Z,2025-04-08T05:37:23Z,98f8a63d-c50c-43c6-b8da-41598ca3e709,,0.0,,0.0,...,695049d1-af7e-ea11-a811-000d3ae05f74,b2f74968-5eac-ef11-b8e9-000d3a6a3bfa,,1.0,100000000,,"{""_createdby_value"": ""d535fe41-1777-eb11-a812-...",2026-01-09T12:01:26.237227+00:00,2025-04-08T05:37:23Z,


### Describe columns in vin_diseases

In [21]:
# Profile every column: distinct values, missing cells and dtype
stats = [
    {
        'column_name': col,
        'unique_values': df[col].nunique(),
        'empty_cells': df[col].isna().sum(),
        'data_type': df[col].dtype,
    }
    for col in df.columns
]

# Tabulate the profile with the highest-cardinality columns first, so that
# candidate index/key columns appear at the top
df_stats = pd.DataFrame(stats).sort_values(by='unique_values', ascending=False)

# Show the summary
print("Column Quality Summary:")
display(df_stats)

Column Quality Summary:


Unnamed: 0,column_name,unique_values,empty_cells,data_type
0,row_id,535,0,int64
10,versionnumber,535,0,int64
39,sync_time,535,0,object
38,json_response,535,0,object
33,vin_diseaseid,535,0,object
12,vin_name,524,0,object
21,vin_diseasecode,521,7,object
3,createdon,221,0,object
2,vin_disease,110,1,object
4,modifiedon,93,0,object


In [22]:
# Size of the loaded table as (rows, columns)
df.shape


(535, 42)

In [23]:
# Remove columns that contain no data at all.
# Step 1: collect the names of columns in which every cell is null
empty_cols = df.columns[df.isna().all(axis=0)].tolist()

# Step 2: report what was found
if empty_cols:
    print(f"There are {len(empty_cols)} empty columns")
    for col in empty_cols:
        print(f" - {col}")
else:
    print("No columns are empty")

# Step 3: 'valid_to' is kept even when it is entirely empty; everything else goes
cols_to_drop = [name for name in empty_cols if name != 'valid_to']

# Step 4: build the working frame without the empty columns (df itself is left untouched)
df_transformed = df.drop(cols_to_drop, axis="columns")

There are 7 empty columns
 - _vin_subproduct_value
 - timezoneruleversionnumber
 - _createdonbehalfby_value
 - utcconversiontimezonecode
 - _modifiedonbehalfby_value
 - overriddencreatedon
 - valid_to


### Inspect columns one by one and update accordingly

In [24]:
def inspect_column(df, column_name):
    """
    Print a short profile of a single column of ``df``.

    The report contains the number of distinct values, the dtype, the number
    of filled and empty cells (the empty share is given as a percentage of all
    rows) and then either every distinct non-null value in sorted order (when
    there are fewer than 20 of them) or the 10 most frequent values with
    their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Name of the column to profile.

    Returns
    -------
    None
        Everything is printed. If the column does not exist, a notice is
        printed and nothing else happens.
    """
    # Guard: nothing to report for a column that is not in the frame
    if column_name not in df.columns:
        print(f"❌ Column '{column_name}' not found.")
        return

    print(f"--- Statistics for: {column_name} ---")

    # Headline numbers for the column
    series = df[column_name]
    n_unique = series.nunique()
    n_missing = series.isnull().sum()
    n_filled = series.count()

    print(f"Unique values: {n_unique} | Data type: {series.dtype} | filled cells: {n_filled}")

    # Share of missing cells relative to the total number of rows
    missing_share = (n_missing / len(df)) * 100
    print(f"Empty cells: {n_missing} ({missing_share:.2f}%)")

    # Many distinct values: show only the 10 most frequent ones.
    # Few distinct values (categorical data): list them all, sorted.
    if n_unique >= 20:
        print("\n--- First 10 values ---")
        print(series.value_counts().iloc[:10])
    else:
        print("\n--- All unique values (<20) ---")
        print(sorted(series.dropna().unique()))

    print("-" * 40 + "\n")

In [25]:
 # Column names left in df_transformed after the all-empty columns were dropped
 df_transformed.columns.tolist()

['row_id',
 'vin_disease',
 'createdon',
 'modifiedon',
 '_organizationid_value',
 'crc8b_addedclinicalvalue',
 'crc8b_tppppc',
 'crc8b_addedclinicalvaluedescription',
 'crc8b_p2iproductlaunch',
 'versionnumber',
 'statuscode',
 'vin_name',
 'statecode',
 'crc8b_realisticlaunch',
 'vin_type',
 'new_secondary_diseae_choice_text',
 '_createdby_value',
 'new_globalhealthareaportal',
 'vin_diseasecode',
 '_vin_product_value',
 'new_disease_simple',
 'importsequencenumber',
 'new_incl_eid',
 'new_diseasefilter',
 'new_disease_sort',
 'new_secondary_disease_filter',
 'new_disease_choice_text',
 '_modifiedby_value',
 'vin_diseaseid',
 '_vin_maindisease_value',
 'new_incl_nd',
 'new_globalhealtharea',
 'json_response',
 'sync_time',
 'valid_from',
 'valid_to']

In [46]:
# Profile the global health area column.
# The column is still called 'new_globalhealtharea' at this point when the
# notebook is run top to bottom; it only becomes 'globalhealtharea' after the
# rename cell further down. Accept either name so the cell works on a fresh
# "Restart & Run All" as well as after the rename has been executed.
inspect_column(
    df_transformed,
    'globalhealtharea' if 'globalhealtharea' in df_transformed.columns else 'new_globalhealtharea',
)

--- Statistics for: globalhealtharea ---
Unique values: 3 | Data type: object | filled cells: 448
Empty cells: 87 (16.26%)

--- All unique values (<20) ---
['100000000', '100000001', '100000002']
----------------------------------------



### Irrelevant columns that can be dropped

In [43]:
# Columns judged irrelevant for the transformed table.
# This rebinds `cols_to_drop`, which earlier held the list of all-empty columns.
# NOTE(review): the list is only defined here; no drop call using it is visible
# in this part of the notebook — confirm it is applied to df_transformed later.
cols_to_drop = [
    'row_id',
    'createdon',
    'modifiedon',
    '_organizationid_value',
    'crc8b_addedclinicalvalue',
    'crc8b_tppppc',
    'crc8b_addedclinicalvaluedescription',
    'crc8b_p2iproductlaunch',
    'statuscode',
    'statecode',
    '_createdby_value',
    'new_globalhealthareaportal',
    'importsequencenumber',
    'new_incl_eid',
    '_modifiedby_value',
    'new_incl_nd',
    'json_response',
    'sync_time',
]

### Updated column names

In [45]:
# Strip the source-system prefixes (vin_, new_, leading underscore) from the
# column names. Names absent from the frame are ignored by rename().
df_transformed = df_transformed.rename(
    columns={
        'vin_disease': 'disease',
        'vin_name': 'name',
        'vin_type': 'type',
        'new_secondary_diseae_choice_text': 'secondary_diseae_choice_text',
        'vin_diseasecode': 'diseasecode',
        '_vin_product_value': 'product_value',
        'new_disease_simple': 'disease_simple',
        'new_diseasefilter': 'diseasefilter',
        'new_disease_sort': 'diseasesort',
        'new_secondary_disease_filter': 'secondary_disease_filter',
        'new_disease_choice_text': 'diseasechoice_text',
        'vin_diseaseid': 'diseaseid',
        '_vin_maindisease_value': 'maindisease_value',
        'new_globalhealtharea': 'globalhealtharea',
    }
)

### Columns without optionset that require updating

### Columns with optionset that require updating

In [47]:
# Load the option-set lookup table for new_globalhealtharea (code -> label).
# Note: this rebinds `df`, which until now held the raw vin_diseases table.
table_name = "_optionset_new_globalhealtharea"
df = pd.read_sql_query(sql=f"SELECT * FROM {table_name}", con=conn)

# Confirm the load and preview up to 10 rows
print(f"Table' {table_name} was successfully loaded!")
df.head(n=10)

Table' _optionset_new_globalhealtharea was successfully loaded!


Unnamed: 0,code,label,first_seen
0,100000000,Neglected disease,2026-01-09T12:01:26.242680+00:00
1,100000001,Emerging infectious disease,2026-01-09T12:01:26.178445+00:00
2,100000002,Sexual & reproductive health,2026-01-09T12:01:28.916762+00:00


In [50]:
# Relabel the 'Sexual & reproductive health' option as "Womens Health" in the
# global health area option-set table; all other labels are left unchanged.
df['label'] = df['label'].replace('Sexual & reproductive health', "Womens Health")