In [1]:
import numpy as np
import pandas as pd

In [11]:
product1 = pd.read_csv("../data/processed/en_product1.csv")
product4 = pd.read_csv("../data/processed/en_product4.csv")
product6 = pd.read_csv("../data/processed/en_product6.csv")
product7 = pd.read_csv("../data/processed/en_product7.csv")
hp = pd.read_csv("../data/processed/hp.csv") 
phenotype = pd.read_csv("../data/processed/phenotype.csv") 

  phenotype = pd.read_csv("../data/processed/phenotype.csv")  # parsed phenotype.hpoa


In [12]:
datasets = {
    "product1": product1,
    "product4": product4,
    "product6": product6,
    "product7": product7,
    "hp": hp,
    "phenotype": phenotype
}

In [18]:
for name, df in datasets.items():
    print(f"\n--- {name} ---")
    print(f"Shape: {df.shape}") 
    print("Columns:", df.columns.tolist())
    print(df.head(3))  
    print(df.info())
    print("Missing values per column:\n", df.isnull().sum())
    print("="*50)


--- product1 ---
Shape: (11239, 9)
Columns: ['id', 'OrphaCode', 'Name', 'ExpertLink', 'Synonyms', 'DisorderType', 'DisorderGroup', 'ExternalReferences', 'Definition']
      id  OrphaCode                                               Name  \
0  17601     166024  Multiple epiphyseal dysplasia-macrocephaly-fac...   
1      2         58                                  Alexander disease   
2  17603     166032  Multiple epiphyseal dysplasia-miniepiphyses sy...   

                                          ExpertLink  \
0  http://www.orpha.net/consor/cgi-bin/OC_Exp.php...   
1  http://www.orpha.net/consor/cgi-bin/OC_Exp.php...   
2  http://www.orpha.net/consor/cgi-bin/OC_Exp.php...   

                                        Synonyms DisorderType DisorderGroup  \
0  Multiple epiphyseal dysplasia, Al-Gazali type      Disease      Disorder   
1                                            AxD      Disease      Disorder   
2                                            NaN      Disease      Disord

In [20]:

# Overlap checks: HPO ids present in phenotype vs hp; OrphaCodes between product1 and product4/phenotype
results = {}
if 'hp' in datasets and 'phenotype' in datasets:
    hp_set = set(datasets['hp']['id'].dropna().unique())
    ph_set = set(datasets['phenotype']['hpo_id'].dropna().unique())
    results['hp_terms_in_phenotype'] = len(hp_set & ph_set)
    results['hp_terms_only_in_hp'] = len(hp_set - ph_set)
    results['hp_terms_only_in_phenotype'] = len(ph_set - hp_set)
if 'product1' in datasets and 'product4' in datasets:
    p1 = set(datasets['product1']['OrphaCode'].dropna().unique())
    p4 = set(datasets['product4']['OrphaCode'].dropna().unique())
    results['orpha_in_p1_p4_overlap'] = len(p1 & p4)
if 'product1' in datasets and 'phenotype' in datasets:
    p1 = set(datasets['product1']['OrphaCode'].dropna().unique())
    ph = set(datasets['phenotype']['database_id'].dropna().unique())
    results['orpha_in_p1_phenotype_overlap'] = len(p1 & ph)
results


{'hp_terms_in_phenotype': 11428,
 'hp_terms_only_in_hp': 8229,
 'hp_terms_only_in_phenotype': 0,
 'orpha_in_p1_p4_overlap': 4314,
 'orpha_in_p1_phenotype_overlap': 0}

In [21]:

summary_rows = []
for name, df in datasets.items():
    nrows, ncols = df.shape
    missing_total = int(df.isnull().sum().sum())
    dup = int(df.duplicated().sum())
    text_cols = [c for c in df.columns if any(k in c.lower() for k in ('name','term','def','definition','synonym','description','text'))]
    small_cardinal = {}
    for c in df.columns:
        nunique = df[c].nunique(dropna=True)
        if nunique <= 20:
            small_cardinal[c] = int(nunique)
    summary_rows.append({
        'dataset': name,
        'rows': nrows,
        'cols': ncols,
        'missing_values_total': missing_total,
        'duplicate_rows': dup,
        'text_columns_candidates': text_cols,
        'small_cardinalities_sample': small_cardinal,
        'columns': df.columns.tolist()
    })
summary_rows

[{'dataset': 'product1',
  'rows': 11239,
  'cols': 9,
  'missing_values_total': 6240,
  'duplicate_rows': 0,
  'text_columns_candidates': ['Name', 'Synonyms', 'Definition'],
  'small_cardinalities_sample': {'DisorderType': 11, 'DisorderGroup': 3},
  'columns': ['id',
   'OrphaCode',
   'Name',
   'ExpertLink',
   'Synonyms',
   'DisorderType',
   'DisorderGroup',
   'ExternalReferences',
   'Definition']},
 {'dataset': 'product4',
  'rows': 115627,
  'cols': 9,
  'missing_values_total': 0,
  'duplicate_rows': 1,
  'text_columns_candidates': ['DisorderName', 'HPOTerm'],
  'small_cardinalities_sample': {'DisorderType': 10,
   'DisorderGroup': 3,
   'Frequency': 6,
   'DiagnosticCriteria': 1},
  'columns': ['OrphaCode',
   'DisorderName',
   'ExpertLink',
   'DisorderType',
   'DisorderGroup',
   'HPOId',
   'HPOTerm',
   'Frequency',
   'DiagnosticCriteria']},
 {'dataset': 'product6',
  'rows': 8300,
  'cols': 23,
  'missing_values_total': 4845,
  'duplicate_rows': 0,
  'text_columns_ca

In [22]:
# Count missing values column-wise
missing_by_col = product7.isnull().sum().sort_values(ascending=False)
print(missing_by_col)


TargetName           9753
TargetOrphaCode      9753
OrphaCode               0
ExpertLink              0
DisorderName            0
TotalAssociations       0
DisorderId              0
IsCycle                 0
RootId                  0
TargetId                0
AssociationType         0
dtype: int64


In [32]:
for name,df in datasets.items():
    print("Columns:", df.columns.tolist())

Columns: ['id', 'OrphaCode', 'Name', 'ExpertLink', 'Synonyms', 'DisorderType', 'DisorderGroup', 'ExternalReferences', 'Definition']
Columns: ['OrphaCode', 'DisorderName', 'ExpertLink', 'DisorderType', 'DisorderGroup', 'HPOId', 'HPOTerm', 'Frequency', 'DiagnosticCriteria']
Columns: ['DisorderID', 'OrphaCode', 'DisorderName', 'ExpertLink', 'DisorderType', 'DisorderGroup', 'SourceOfValidation', 'AssociationType', 'AssociationStatus', 'GeneID', 'GeneName', 'GeneSymbol', 'GeneSynonyms', 'GeneType', 'GeneLocus', 'LocusKey', 'Ref_HGNC', 'Ref_Ensembl', 'Ref_OMIM', 'Ref_SwissProt', 'Ref_Genatlas', 'Ref_ClinVar', 'Ref_Reactome']
Columns: ['OrphaCode', 'DisorderName', 'ExpertLink', 'TotalAssociations', 'DisorderId', 'RootId', 'IsCycle', 'TargetId', 'TargetOrphaCode', 'TargetName', 'AssociationType']
Columns: ['id', 'name', 'definition', 'comment', 'synonyms', 'synonym_types', 'xrefs', 'alt_ids', 'is_a', 'created_date', 'obsolete']
Columns: ['database_id', 'disease_name', 'qualifier', 'hpo_id', 'r