# 03 — Populate a sample A-box (Subjects, Visits, Observations)

This notebook builds a **small A-box** for a single subject (the first `PATNO` found in a PPMI-like demo longitudinal table), using:

- the bridge ontology: `ontologies/ppmi_bridge.ttl`
- the mapping file: `mapping/ppmi_pdon_pmdo_mapping.csv`

It creates:
- `ppmi:Subject` individuals (by `PATNO`)
- `ppmi:Visit` individuals (by `PATNO` + `EVENT_ID`)
- `ppmi:Observation` individuals for mapped variables
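- `ppmi:DiagnosisObservation` individuals for `PRIMDIAG` codes
- `ppmi:ImagingObservation` individuals for DaTscan variables whose names start with `MIA_`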

**Output**: `output/ppmi_abox_sample.ttl`


In [None]:
!pip -q install rdflib pandas owlrl

## 1) Project root (Drive recommended)

In [None]:
from pathlib import Path
# Toggle: True keeps the project on Google Drive, False uses /content directly.
USE_DRIVE = True
if not USE_DRIVE:
  PROJECT_DIR = Path('/content/ppmi-ontology-alignment')
else:
  from google.colab import drive
  drive.mount('/content/drive')
  PROJECT_DIR = Path('/content/drive/MyDrive/ppmi-ontology-alignment')

# Working sub-directories, created up front so later cells can read/write them.
ONT_DIR, MAP_DIR, DATA_DIR, OUT_DIR = (
  PROJECT_DIR / sub for sub in ('ontologies', 'mapping', 'data', 'output')
)
for folder in (ONT_DIR, MAP_DIR, DATA_DIR, OUT_DIR):
  folder.mkdir(parents=True, exist_ok=True)

# Input and output files used by the rest of the notebook.
BRIDGE_PATH = ONT_DIR / 'ppmi_bridge.ttl'
MAP_PATH = MAP_DIR / 'ppmi_pdon_pmdo_mapping.csv'
DEMO_PATH = DATA_DIR / 'ppmi_demo.tsv'
ABOX_OUT = OUT_DIR / 'ppmi_abox_sample.ttl'

# Report which of the expected inputs are already in place.
print('PROJECT_DIR:', PROJECT_DIR)
for label, path in (
  ('Bridge exists :', BRIDGE_PATH),
  ('Mapping exists:', MAP_PATH),
  ('Demo exists   :', DEMO_PATH),
):
  print(label, path.exists(), path)


## 2) Load mapping CSV

In [None]:
import pandas as pd
from google.colab import files
import shutil

# Colab fallback: ask for the mapping CSV when it is not on disk yet.
if not MAP_PATH.exists():
  print('Upload ppmi_pdon_pmdo_mapping.csv')
  uploaded = files.upload()
  for uploaded_name in uploaded:
    if uploaded_name == 'ppmi_pdon_pmdo_mapping.csv':
      target = MAP_PATH
    else:
      # Any other uploaded file is kept next to the mapping under its own name.
      target = MAP_DIR / uploaded_name
    shutil.move(f'/content/{uploaded_name}', str(target))

mapping = pd.read_csv(MAP_PATH)
# Both key columns are coerced to stripped text (missing cells become the text 'nan').
for key_column in ('Variable', 'TargetIRI'):
  mapping[key_column] = mapping[key_column].astype(str).str.strip()
mapping.head(10)


## 3) Provide / upload demo longitudinal data

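Expected format: **TSV** with one row per (PATNO, EVENT_ID). The `PATNO` and `EVENT_ID` columns are required; only the rows of the first `PATNO` in the file are converted.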

Save as `data/ppmi_demo.tsv` (tab-separated).

In [None]:
from google.colab import files
import shutil

# Colab fallback: ask for the demo table when it is not on disk yet.
if not DEMO_PATH.exists():
  print('Upload ppmi_demo.tsv (tab-separated)')
  uploaded = files.upload()
  for uploaded_name in uploaded:
    # The expected file lands on DEMO_PATH; anything else keeps its own name in DATA_DIR.
    target = DATA_DIR / uploaded_name if uploaded_name != 'ppmi_demo.tsv' else DEMO_PATH
    shutil.move(f'/content/{uploaded_name}', str(target))
  print('Saved to:', DEMO_PATH)

demo = pd.read_csv(DEMO_PATH, sep='\t')
demo.head(3)


## 4) Build A-box (RDF)

In [None]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD
import re

# Namespace used in this notebook for both the ppmi: vocabulary terms and the minted individuals.
PPMI = Namespace('http://example.org/ppmi-ontology-alignment#')

def _iri_segment(value) -> str:
  """Percent-encode ``value`` so it is safe as one path segment of a minted IRI.

  Letters, digits and ``_ . - ~`` are left untouched, so plain identifiers
  such as ``3001`` or ``BL`` produce exactly the same IRIs as before; spaces,
  slashes and other reserved characters are escaped instead of yielding an
  invalid IRI.
  """
  from urllib.parse import quote
  return quote(str(value), safe='')

def mint_subject(patno: str) -> URIRef:
  """Return the IRI of the subject individual for ``patno``."""
  return PPMI[f'subject/{_iri_segment(patno)}']

def mint_visit(patno: str, event_id: str) -> URIRef:
  """Return the IRI of the visit individual for (``patno``, ``event_id``)."""
  return PPMI[f'visit/{_iri_segment(patno)}/{_iri_segment(event_id)}']

def mint_obs(patno: str, event_id: str, var: str) -> URIRef:
  """Return the IRI of the observation of ``var`` at (``patno``, ``event_id``).

  Runs of characters in ``var`` outside ``[A-Za-z0-9_.-]`` are collapsed to a
  single underscore (unchanged behaviour); ``patno`` and ``event_id`` are
  percent-encoded like in the other mint helpers.
  """
  safe = re.sub(r'[^A-Za-z0-9_\-\.]+','_', var)
  return PPMI[f'obs/{_iri_segment(patno)}/{_iri_segment(event_id)}/{safe}']

def parse_gYearMonth(s: str):
  """Turn a ``M/YYYY`` or ``MM/YYYY`` string into ``YYYY-MM`` text.

  Surrounding whitespace is ignored. Returns ``None`` when the text does not
  have that shape or the month is outside 1..12.
  """
  hit = re.match(r'^(\d{1,2})/(\d{4})$', s.strip())
  if hit is None:
    return None
  month, year = (int(part) for part in hit.groups())
  if not 1 <= month <= 12:
    return None
  return '{:04d}-{:02d}'.format(year, month)

def literal_best(x):
  """Convert a raw table cell into the most specific typed RDF literal.

  Returns ``None`` for missing values (anything ``pd.isna`` flags) and for
  text that is empty after stripping. Otherwise the stripped text decides:

  - optionally signed integer                  -> ``xsd:integer``
  - plain decimal (``1.5``, ``.5``, ``2.``)    -> ``xsd:decimal``
  - number with an exponent (``1e-05``)        -> ``xsd:double``
  - anything else                              -> ``xsd:string``
  """
  import pandas as pd
  if pd.isna(x):
    return None
  s = str(x).strip()
  if s == '':
    return None
  if re.fullmatch(r'[-+]?\d+', s):
    return Literal(int(s), datatype=XSD.integer)
  if re.fullmatch(r'[-+]?(?:\d+\.\d*|\.\d+)', s, flags=re.ASCII):
    # Hand the text itself to rdflib rather than a binary float: str(float)
    # can come out as '1e-05', which is not a valid xsd:decimal lexical form,
    # and long fractions would lose digits on the way.
    return Literal(s, datatype=XSD.decimal)
  if re.fullmatch(r'[-+]?(?:\d+\.?\d*|\.\d+)[eE][-+]?\d+', s):
    # Exponent notation (also what str() yields for very small/large floats)
    # is not allowed for xsd:decimal, so these are typed as xsd:double.
    return Literal(float(s), datatype=XSD.double)
  return Literal(s, datatype=XSD.string)

# Lookup: variable -> target IRI
# map_rows is an independent copy of the `mapping` frame; the lookups in this cell read from it.
map_rows = mapping.copy()
def _clean(v):
  if v is None:
    return ''
  s = str(v).strip()
  return '' if s.lower()=='nan' else s

# variable name -> target IRI; rows with a blank side are skipped and, when a
# variable appears on several rows, the last one wins.
var2target = {}
for raw_variable, raw_target in zip(map_rows['Variable'], map_rows['TargetIRI']):
  variable_name, target_iri = _clean(raw_variable), _clean(raw_target)
  if variable_name and target_iri:
    var2target[variable_name] = target_iri

# Variables whose Category is DATSCAN (case-insensitive).
category_upper = map_rows['Category'].astype(str).str.upper()
datscan_vars = set(map_rows.loc[category_upper == 'DATSCAN', 'Variable'].astype(str))

# Variables that become observations: those in one of the listed mapping buckets.
bucket_upper = map_rows['MappingBucket'].astype(str).str.upper()
in_obs_bucket = bucket_upper.isin(['PMDO','MDO','PDON','PPMI/PMDO'])
obs_vars = sorted(set(map_rows.loc[in_obs_bucket, 'Variable'].astype(str)))

# Empty graph for the A-box, with the prefixes bound for serialisation.
g = Graph()
for prefix, namespace in (
  ('ppmi', PPMI),
  ('rdf', RDF),
  ('rdfs', RDFS),
  ('owl', OWL),
  ('xsd', XSD),
):
  g.bind(prefix, namespace)

# Classes assigned via rdf:type.
C_Subject, C_Visit = PPMI['Subject'], PPMI['Visit']
C_Obs, C_DxObs, C_ImgObs = (
  PPMI['Observation'],
  PPMI['DiagnosisObservation'],
  PPMI['ImagingObservation'],
)

# Properties used in this cell with IRI objects.
P_hasVisit, P_hasObs = PPMI['hasVisit'], PPMI['hasObservation']
P_observes = PPMI['observesConcept']
P_refersPdon = PPMI['refersToPdonConcept']
P_relRegion = PPMI['relatesToRegion']

# Properties used in this cell with literal objects (P_hasDecode is declared but not used here).
P_hasValue, P_hasCode, P_hasDecode = PPMI['hasValue'], PPMI['hasCode'], PPMI['hasDecode']
P_visitDate, P_visitYear, P_ageAtVisit = (
  PPMI['visitDate'],
  PPMI['visitYear'],
  PPMI['ageAtVisit'],
)

# Columns copied onto the subject individual when present in the table.
SUBJECT_META = ['COHORT','subgroup','SEX','EDUCYRS','fampd_bin']

def add_subject_meta(subj, row):
  """Copy every SUBJECT_META column present in ``row`` onto ``subj``.

  Each non-empty value is added to the graph ``g`` as a typed literal under
  the predicate ``ppmi:subjectMeta/<column>``; missing columns and empty
  values are skipped.
  """
  for column in (name for name in SUBJECT_META if name in row.index):
    value = literal_best(row[column])
    if value is None:
      continue
    g.add((subj, PPMI[f'subjectMeta/{column}'], value))

def add_visit_meta(visit, row):
  """Attach date, year and age literals from ``row`` to ``visit`` in graph ``g``.

  ``visit_date`` becomes an ``xsd:gYearMonth`` when ``parse_gYearMonth``
  accepts it, otherwise any non-empty text is kept as ``xsd:string``.
  ``YEAR`` and ``age_at_visit`` are typed by ``literal_best``. Columns that
  are absent, and empty values, add nothing.
  """
  if 'visit_date' in row.index:
    raw_date = row['visit_date']
    date_text = '' if pd.isna(raw_date) else str(raw_date).strip()
    year_month = parse_gYearMonth(date_text)
    if year_month:
      g.add((visit, P_visitDate, Literal(year_month, datatype=XSD.gYearMonth)))
    elif date_text:
      g.add((visit, P_visitDate, Literal(date_text, datatype=XSD.string)))

  for column, predicate in (('YEAR', P_visitYear), ('age_at_visit', P_ageAtVisit)):
    if column not in row.index:
      continue
    value = literal_best(row[column])
    if value is not None:
      g.add((visit, predicate, value))

def _norm_code(raw) -> str:
  """Canonical text for a diagnosis code, so that '1', '1.0' and 1.0 compare equal.

  Plain numeric text is routed through ``float``; anything else is returned
  stripped and otherwise unchanged.
  """
  text = str(raw).strip()
  if re.fullmatch(r'[-+]?\d+(?:\.\d+)?', text):
    return repr(float(text))
  return text

# PRIMDIAG code -> target IRIs of the mapping rows carrying that code. Built
# once (instead of re-filtering the mapping for every row) and normalised on
# both sides, so it no longer matters whether pandas read the mapping's Code
# column as integers ('1') or floats ('1.0'). Left empty when the mapping has
# no Code column, in which case no PDON links are added.
primdiag_targets = {}
if 'Code' in map_rows.columns:
  primdiag_rows = map_rows[map_rows['Variable'] == 'PRIMDIAG']
  for raw_code, raw_tgt in zip(primdiag_rows['Code'], primdiag_rows['TargetIRI']):
    primdiag_targets.setdefault(_norm_code(raw_code), []).append(_clean(raw_tgt))

if demo.empty:
  raise ValueError('Demo table is empty: there is no PATNO to build a sample A-box from.')

# The sample A-box is restricted to the first PATNO found in the demo table.
SAMPLE_PATNO = str(demo['PATNO'].iloc[0])
sample_df = demo[demo['PATNO'].astype(str) == SAMPLE_PATNO].copy()
print('Sample PATNO:', SAMPLE_PATNO, 'rows:', len(sample_df))

for _, row in sample_df.iterrows():
  patno = str(row['PATNO']).strip()
  event = str(row['EVENT_ID']).strip()
  subj = mint_subject(patno)
  visit = mint_visit(patno, event)

  # The graph is a set of triples, so re-adding the subject on every row is harmless.
  g.add((subj, RDF.type, C_Subject))
  g.add((visit, RDF.type, C_Visit))
  g.add((subj, P_hasVisit, visit))
  add_subject_meta(subj, row)
  add_visit_meta(visit, row)

  for var in obs_vars:
    if var not in row.index:
      continue
    val_lit = literal_best(row[var])
    if val_lit is None:
      # Missing/empty cell: no observation individual is minted.
      continue

    obs = mint_obs(patno, event, var)
    g.add((visit, P_hasObs, obs))

    if var == 'PRIMDIAG':
      # Diagnosis codes carry hasCode (not hasValue) and link to a PDON
      # concept only when exactly one mapping row matches the code.
      g.add((obs, RDF.type, C_DxObs))
      g.add((obs, P_hasCode, val_lit))
      candidates = primdiag_targets.get(_norm_code(row[var]), [])
      if len(candidates) == 1 and candidates[0]:
        g.add((obs, P_refersPdon, URIRef(candidates[0])))
      continue

    if var in datscan_vars and var.startswith('MIA_'):
      g.add((obs, RDF.type, C_ImgObs))
      tgt = var2target.get(var)
      if tgt:
        g.add((obs, P_relRegion, URIRef(tgt)))
    else:
      g.add((obs, RDF.type, C_Obs))

    g.add((obs, P_hasValue, val_lit))
    tgt = var2target.get(var)
    if tgt:
      g.add((obs, P_observes, URIRef(tgt)))

print('A-box triples:', len(g))


## 5) Serialise A-box to Turtle

In [None]:
# Write the populated graph to ABOX_OUT as Turtle.
abox_destination = str(ABOX_OUT)
g.serialize(format='turtle', destination=abox_destination)
print('Wrote:', ABOX_OUT)


## 6) Quick check (SPARQL)

In [None]:
# Smoke test: list up to 50 observations attached to a visit, with their
# value and observed concept when present.
q = '''
PREFIX ppmi: <http://example.org/ppmi-ontology-alignment#>
SELECT ?obs ?val ?concept
WHERE {
  ?visit ppmi:hasObservation ?obs .
  OPTIONAL { ?obs ppmi:hasValue ?val }
  OPTIONAL { ?obs ppmi:observesConcept ?concept }
}
LIMIT 50
'''
bindings = g.query(q)
for binding in bindings:
  print(binding)
