# Person7 — Relation Extraction, Normalization & Summarization

Single-file Jupyter notebook implementing the full Person7 pipeline for your case study.

This notebook is self-contained and includes sample data so you can run it immediately.

What it contains (in order):
1. Setup & sample data creation
2. Utilities and helper functions
3. Relation extraction (rule-based SLM baseline)
4. Entity normalization (simple dictionary + fuzzy matching via difflib)
5. Extractive summarization baseline
6. Small manual annotation set + evaluation (precision/recall/F1)
7. Simple plots (top drugs by ADE count, relation distribution)
8. Notes on how to plug into your existing pipeline and optionally call an LLM

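Run all cells in order. The sample files are only created when they do not already exist, so to process real data place your Person2 output at `results/extractions.json` (and your own dictionaries under `data/dictionaries/`); existing files are left intact.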


In [None]:
import os, json, re, difflib, math
from collections import Counter, defaultdict
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Make sure every folder the notebook reads from or writes to exists.
for _required_dir in ('data/dictionaries', 'data/cleaned', 'results'):
    Path(_required_dir).mkdir(parents=True, exist_ok=True)

print('Working dir:', os.getcwd())
print('Directories ready.')


In [None]:
from pathlib import Path
import json
import pandas as pd

# Bootstrap a small extraction file, but never overwrite one that is already there.
sample_extractions_path = Path('results/extractions.json')
if sample_extractions_path.exists():
    print('Found existing results/extractions.json - leaving it intact.')
else:
    sample_data = [
        {
            'id': 'doc1_sent1',
            'sentence': 'Patient developed nausea after taking paracetamol.',
            'drugs': ['paracetamol'],
            'events': ['nausea'],
            'source': 'FAERS',
        },
        {
            'id': 'doc1_sent2',
            'sentence': 'After starting ibuprofen the patient reported stomach pain and vomiting.',
            'drugs': ['ibuprofen'],
            'events': ['stomach pain', 'vomiting'],
            'source': 'PubMed',
        },
        {
            'id': 'doc2_sent1',
            'sentence': 'Headache was reported, possibly related to metformin use.',
            'drugs': ['metformin'],
            'events': ['headache'],
            'source': 'FAERS',
        },
        {
            'id': 'doc3_sent2',
            'sentence': 'No drug was mentioned but patient had severe dizziness.',
            'drugs': [],
            'events': ['dizziness'],
            'source': 'PubMed',
        },
        {
            'id': 'doc4_sent1',
            'sentence': 'Patient experienced rash following antibiotic therapy (amoxicillin).',
            'drugs': ['amoxicillin'],
            'events': ['rash'],
            'source': 'FAERS',
        },
        {
            'id': 'doc5_sent1',
            'sentence': 'Vomiting occurred but causality with drug is unclear.',
            'drugs': ['paracetamol'],
            'events': ['vomiting'],
            'source': 'PubMed',
        },
    ]
    with open(sample_extractions_path, 'w', encoding='utf-8') as _out:
        json.dump(sample_data, _out, indent=2, ensure_ascii=False)
    print('Created sample results/extractions.json')

# Bootstrap a tiny drug dictionary (name, id, atc) unless one already exists.
drugs_csv = Path('data/dictionaries/drugs.csv')
if drugs_csv.exists():
    print('Found existing data/dictionaries/drugs.csv')
else:
    drugs_dict = [
        dict(name=_name, id=_drug_id, atc=_atc)
        for _name, _drug_id, _atc in (
            ('paracetamol', 'DB00316', 'N02BE01'),
            ('ibuprofen', 'DB01050', 'M01AE01'),
            ('metformin', 'DB00331', 'A10BA02'),
            ('amoxicillin', 'DB01060', 'J01CA04'),
        )
    ]
    _drugs_frame = pd.DataFrame(drugs_dict)
    _drugs_frame.to_csv(drugs_csv, index=False)
    print('Created sample data/dictionaries/drugs.csv')

# Bootstrap a tiny event-term dictionary (term, code) unless one already exists.
meddra_csv = Path('data/dictionaries/meddra.csv')
if meddra_csv.exists():
    print('Found existing data/dictionaries/meddra.csv')
else:
    meddra_list = [
        dict(term=_term, code=_code)
        for _term, _code in (
            ('nausea', '10028813'),
            ('stomach pain', '10012345'),
            ('vomiting', '10047700'),
            ('headache', '10019211'),
            ('dizziness', '10013384'),
            ('rash', '10039906'),
        )
    ]
    _meddra_frame = pd.DataFrame(meddra_list)
    _meddra_frame.to_csv(meddra_csv, index=False)
    print('Created sample data/dictionaries/meddra.csv')


In [None]:
import csv
from typing import List, Tuple, Dict, Any
import json, re, difflib, os

def load_json(path: str):
    """Read the UTF-8 text file at ``path`` and return its parsed JSON content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(obj: Any, path: str):
    """Write ``obj`` to ``path`` as indented UTF-8 JSON, creating parent folders.

    Args:
        obj: Any JSON-serializable object.
        path: Destination file path. May be a bare filename, in which case the
            file is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so only create a parent
    # directory when the path actually has one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(obj, f, indent=2, ensure_ascii=False)

def load_dict_csv(path: str, key_col: str = 'name') -> Dict[str, dict]:
    """Index the rows of a CSV file by the stripped, lower-cased ``key_col`` value.

    Each value is the full row as produced by ``csv.DictReader``. When two rows
    share the same key, the later row replaces the earlier one.
    """
    with open(path, newline='', encoding='utf-8') as handle:
        return {
            row[key_col].strip().lower(): row
            for row in csv.DictReader(handle)
        }

def simple_normalize(text: str) -> str:
    """Lower-case ``text`` and drop every character except a-z, 0-9 and space.

    Leading/trailing whitespace is stripped afterwards. Falsy input (``None``
    or an empty string) yields an empty string.
    """
    if text:
        lowered = text.lower()
        kept = re.sub(r'[^a-z0-9 ]', '', lowered)
        return kept.strip()
    return ''

def fuzzy_match_difflib(term: str, candidates: List[str], cutoff: float = 0.6) -> Tuple[str, float]:
    """Find the candidate most similar to ``term`` using difflib.

    Returns ``(candidate, similarity_ratio)`` for the best candidate whose
    similarity reaches ``cutoff``. Returns ``(None, 0.0)`` when ``term`` or
    ``candidates`` is empty or nothing reaches the cutoff.
    """
    no_match = (None, 0.0)
    if not (term and candidates):
        return no_match
    best = difflib.get_close_matches(term, candidates, n=1, cutoff=cutoff)
    if best:
        winner = best[0]
        return winner, difflib.SequenceMatcher(None, term, winner).ratio()
    return no_match


In [None]:
# Relation extraction (rule-based baseline)

# Cue words/phrases that suggest a drug-event relation is being asserted.
CAUSAL_WORDS = [
    'caused', 'causing', 'cause', 'associated with', 'associated', 'led to',
    'resulted in', 'reported', 'due to', 'after', 'following', 'induced', 'secondary to',
    'related to', 'triggered',
]

def sentence_contains_cue(sentence: str) -> bool:
    """Return True when ``sentence`` contains any entry of ``CAUSAL_WORDS``.

    Matching is case-insensitive. A cue must start at a word boundary, so
    'cause' still matches 'causes' but no longer fires inside 'because', and
    'related to' no longer fires inside 'unrelated to'. Falsy input (``None``
    or an empty string) returns False.
    """
    if not sentence:
        return False
    s = sentence.lower()
    # CAUSAL_WORDS is read on every call so that cues added later are honoured.
    return any(re.search(r'\b' + re.escape(cue), s) for cue in CAUSAL_WORDS)

def relation_rule_based(entry: dict, max_token_gap: int = 6) -> List[dict]:
    """Label every (drug, event) pair of one extraction entry with simple heuristics.

    Rules, in order:
      * the entry has events but no drugs: one 'not_related' record per event
        (drug is None, score 0.0);
      * the sentence contains a causal cue: 'adverse_effect', score 0.9;
      * the first token of the drug and of the event both occur in the sentence
        at most ``max_token_gap`` tokens apart: 'adverse_effect', score 0.6;
      * a drug or event name has no usable token after normalization:
        'uncertain', score 0.1;
      * otherwise: 'not_related', score 0.2.

    Args:
        entry: Dict read via the keys 'id', 'sentence', 'drugs' and 'events'.
            Missing or None values are treated as empty.
        max_token_gap: Largest token distance still counted as proximity.

    Returns:
        A list of relation dicts with the keys drug, event, sentence, method,
        relation, score and id.
    """
    sentence = entry.get('sentence') or ''
    drugs = entry.get('drugs') or []
    events = entry.get('events') or []
    entry_id = entry.get('id')

    def build(drug, event, label, score):
        # Key order is kept stable because it determines DataFrame column order.
        return {'drug': drug, 'event': event, 'sentence': sentence, 'method': 'rule_based',
                'relation': label, 'score': score, 'id': entry_id}

    if not drugs:
        return [build(None, e, 'not_related', 0.0) for e in events]

    # Both values depend only on the sentence, so compute them once per entry.
    has_cue = sentence_contains_cue(sentence)
    first_pos = {}
    # NOTE: the pattern must be r'\w+'; r'\\w+' would match a literal backslash
    # followed by 'w' characters and leave the token list empty.
    for i, tok in enumerate(re.findall(r'\w+', sentence.lower())):
        first_pos.setdefault(tok, i)

    relations = []
    for d in drugs:
        d_tok = _first_norm_token(d)
        for e in events:
            if has_cue:
                relations.append(build(d, e, 'adverse_effect', 0.9))
                continue
            e_tok = _first_norm_token(e)
            if d_tok is None or e_tok is None:
                relations.append(build(d, e, 'uncertain', 0.1))
                continue
            dpos = first_pos.get(d_tok)
            epos = first_pos.get(e_tok)
            if dpos is not None and epos is not None and abs(dpos - epos) <= max_token_gap:
                relations.append(build(d, e, 'adverse_effect', 0.6))
            else:
                relations.append(build(d, e, 'not_related', 0.2))
    return relations


def _first_norm_token(name):
    """Return the first token of the normalized entity name, or None if there is none."""
    try:
        parts = simple_normalize(name).split()
    except (AttributeError, TypeError):
        # Non-string values cannot be normalized; the caller labels them 'uncertain'.
        return None
    return parts[0] if parts else None

# Run relation extraction on results/extractions.json
# Apply the rule-based extractor to every entry and flatten the per-entry lists.
extractions = load_json('results/extractions.json')
all_relations = [
    rel
    for item in extractions
    for rel in relation_rule_based(item)
]

save_json(all_relations, 'results/relations.json')
print(f'Extracted {len(all_relations)} relations. Saved to results/relations.json')
import pandas as pd
df_rel = pd.DataFrame(all_relations)
# Bare expression: shown as the cell output when run in a notebook.
df_rel.head(10)


In [None]:
# Normalization: map drugs -> drug_id/atc, events -> meddra code using simple fuzzy matching
# Lookup tables keyed by lower-cased name/term, plus the key lists used as
# fuzzy-matching candidates.
drugs_map = load_dict_csv('data/dictionaries/drugs.csv', key_col='name')
drug_candidates = list(drugs_map)
meddra_map = load_dict_csv('data/dictionaries/meddra.csv', key_col='term')
event_candidates = list(meddra_map)

def normalize_relations(relations: List[dict]):
    """Attach dictionary-normalized drug and event fields to each relation.

    Every input dict is copied and extended with drug_normalized_name, drug_id,
    drug_atc, drug_match_score, event_normalized_term, event_code and
    event_match_score. When no dictionary entry is matched, the normalized
    name/term falls back to the cleaned input text (or None if that is empty)
    and the id/atc/code fields are None.
    """
    def lookup(raw, candidates):
        # Returns (cleaned text, matched dictionary key or None, match ratio).
        cleaned = simple_normalize(raw) if raw else ''
        if not cleaned:
            return cleaned, None, 0.0
        hit, ratio = fuzzy_match_difflib(cleaned, candidates, cutoff=0.6)
        return cleaned, hit, ratio

    results = []
    for rel in relations:
        drn, drug_match, dscore = lookup(rel.get('drug'), drug_candidates)
        evn, event_match, escore = lookup(rel.get('event'), event_candidates)

        # An empty row makes the .get() calls below yield None for unmatched items.
        drug_row = drugs_map[drug_match] if drug_match else {}
        event_row = meddra_map[event_match] if event_match else {}

        record = dict(rel)
        record.update({
            'drug_normalized_name': drug_match if drug_match else (drn or None),
            'drug_id': drug_row.get('id'),
            'drug_atc': drug_row.get('atc'),
            'drug_match_score': round(dscore, 3),
        })
        record.update({
            'event_normalized_term': event_match if event_match else (evn or None),
            'event_code': event_row.get('code'),
            'event_match_score': round(escore, 3),
        })
        results.append(record)
    return results

# Normalize every extracted relation and persist the result.
normalized = normalize_relations(all_relations)
save_json(normalized, 'results/normalized.json')
print(f'Saved {len(normalized)} normalized relations to results/normalized.json')
import pandas as pd
# df_norm is reused by the plotting cell further down.
df_norm = pd.DataFrame(normalized)
# Bare expression: shown as the cell output when run in a notebook.
df_norm.head(12)


In [None]:
# Summarization (extractive baseline): simple first-2-sentences approach
def extractive_summary(text: str, n_sent=2):
    """Return the first ``n_sent`` sentences of ``text`` joined by single spaces.

    Sentences are split on whitespace that follows '.', '!' or '?'. This is a
    naive splitter, so abbreviations such as 'e.g. x' also count as sentence
    ends. Empty, whitespace-only or None input yields an empty string.
    """
    if not text or not text.strip():
        return ''
    # NOTE: the pattern must use \s; r'\\s+' would match a literal backslash
    # followed by 's' and never split anything.
    sents = re.split(r'(?<=[.!?])\s+', text.strip())
    return ' '.join(sents[:n_sent])

# Build one extractive summary record per extraction entry.
summaries = [
    {
        'id': item.get('id'),
        'summary_extractive': extractive_summary(item.get('sentence', ''), n_sent=2),
    }
    for item in extractions
]

save_json(summaries, 'results/summaries.json')
print(f'Saved {len(summaries)} extractive summaries to results/summaries.json')
import pandas as pd
# Bare expression: shown as the cell output when run in a notebook.
pd.DataFrame(summaries).head(8)


In [None]:
# Small manual annotation set for evaluation.
# Gold labels, one row per (id, drug, event) pair.
_annotation_fields = ('id', 'drug', 'event', 'label')
manual_annotations = [
    dict(zip(_annotation_fields, _row))
    for _row in (
        ('doc1_sent1', 'paracetamol', 'nausea', 'adverse_effect'),
        ('doc1_sent2', 'ibuprofen', 'stomach pain', 'adverse_effect'),
        ('doc1_sent2', 'ibuprofen', 'vomiting', 'adverse_effect'),
        ('doc2_sent1', 'metformin', 'headache', 'adverse_effect'),
        ('doc3_sent2', None, 'dizziness', 'not_related'),
        ('doc4_sent1', 'amoxicillin', 'rash', 'adverse_effect'),
        ('doc5_sent1', 'paracetamol', 'vomiting', 'uncertain'),
    )
]

import pandas as pd
df_pred = pd.DataFrame(normalized)
_eval_columns = ['id', 'drug', 'event', 'relation', 'score']
df_pred_eval = df_pred.loc[:, _eval_columns].copy()
df_truth = pd.DataFrame(manual_annotations)
_has_drug = df_truth['drug'].notnull()
df_truth['drug'] = df_truth['drug'].where(_has_drug, None)

# Left join keeps every gold row; gold pairs without a prediction get 'not_found'.
merged = df_truth.merge(
    df_pred_eval,
    on=['id', 'drug', 'event'],
    how='left',
    suffixes=('_truth', '_pred'),
)
merged['relation'] = merged['relation'].fillna('not_found')

def compute_metrics(df, positive_label='adverse_effect'):
    """Compute precision, recall and F1 for ``positive_label``.

    ``df`` must provide a 'label' column (gold) and a 'relation' column
    (prediction). The result holds tp/fp/fn as ints and precision/recall/f1
    rounded to three decimals; a ratio with a zero denominator is 0.0.
    """
    gold, pred = df['label'], df['relation']
    pred_is_pos = pred == positive_label
    tp = ((gold == positive_label) & pred_is_pos).sum()
    fp = ((gold != positive_label) & pred_is_pos).sum()
    fn = ((gold == positive_label) & (pred != positive_label)).sum()

    def ratio(num, den):
        # Guard against division by zero when a class is absent.
        return num / den if den > 0 else 0.0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1 = ratio(2 * precision * recall, precision + recall)
    return {
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'precision': round(precision, 3),
        'recall': round(recall, 3),
        'f1': round(f1, 3),
    }

# Score the merged gold/prediction table with the default positive label.
metrics = compute_metrics(merged)
print('Evaluation metrics (rule-based):')
# Bare expression: shown as the cell output when run in a notebook.
metrics


In [None]:
# Plots: top drugs by ADE count and relation distribution
import matplotlib.pyplot as plt

# Work on a copy so the plotting code cannot alter df_norm.
df_norm_local = df_norm.copy()
_is_ade = df_norm_local['relation'] == 'adverse_effect'
df_ade = df_norm_local.loc[_is_ade].copy()
top_drugs = df_ade['drug_normalized_name'].value_counts().head(10)

# Bar chart: ten most frequent normalized drugs among adverse_effect relations.
plt.figure(figsize=(8, 4))
top_drugs.plot.bar()
plt.title('Top drugs by adverse_effect count (rule-based)')
plt.xlabel('Drug (normalized)')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Pie chart: share of each predicted relation label.
rel_counts = df_norm_local['relation'].value_counts()
plt.figure(figsize=(6, 4))
rel_counts.plot.pie(autopct='%1.1f%%', startangle=140)
plt.ylabel('')
plt.title('Relation label distribution (rule-based)')
plt.tight_layout()
plt.show()


In [None]:
# Final notes & how to adapt this notebook for your real pipeline

# Closing hints, printed one per line.
notes_lines = [
    '- Replace results/extractions.json with the Person2 output (keep fields: id, sentence, drugs[], events[])',
    '- Replace data/dictionaries/drugs.csv and data/dictionaries/meddra.csv with your fuller dictionaries',
    '- Optionally improve normalization using fuzzywuzzy or a proper UMLS lookup (if available)',
    '- To add LLM-based relation disambiguation: implement a function that calls your LLM (OpenAI or local), send a few-shot prompt, and parse JSON response',
    '- For borderline relations (score<0.8) you can call the LLM to refine the label',
    '- To expose Person7 as an API endpoint: embed main functions into a FastAPI endpoint (e.g., /person7/relations) and call from Person4 backend',
    'This notebook saved: results/relations.json, results/normalized.json, results/summaries.json',
    'Upload these files to GitHub as part of your case study.',
]
print(*notes_lines, sep='\n')
