In [None]:
import pandas
import re
import requests
import xml.etree.ElementTree as ET

# Input catalog export. The output file names are derived from this name.
csv_name = "AnVILCatalogWorkspacesPublicMetadata-2022-02-19.csv"
# Captures the digits of a dbGaP study accession such as "phs000123".
# Raw string so "\d" reaches the regex engine rather than being parsed as an
# (invalid) string escape sequence.
phs_id_re = re.compile(r"phs(\d+)")

# Table of the consent-code symbols understood by generate_descriptions.
#
# Fields of each entry:
#   "name"   - the symbol as it appears in a data use restriction code
#   "class"  - "primary" or "secondary"
#   "param"  - "none" or "required"; "optional" would also be accepted, but no
#              symbol currently appears to have an optional parameter
#   "short"  - short text (only used for primary symbols, so MDS, for which no
#              short text was given, has it left blank)
#   "long" / "long_with_param" - full sentence; "[]" marks where the parameter
#              is substituted
#
# The "(CC)" notation is not handled because it is unclear how it works.
symbol_defs_list = [
    {
        "name": "NRES", "class": "primary", "param": "none",
        "short": "No restrictions",
        "long": "No restrictions on data use.",
    },
    {
        "name": "GRU", "class": "primary", "param": "none",
        "short": "General research use and clinical care",
        "long": "For health/medical/biomedical purposes, including the study of population origins or ancestry.",
    },
    {
        "name": "HMB", "class": "primary", "param": "none",
        "short": "Health/medical/biomedical research and clinical care",
        "long": "Use of the data is limited to health/medical/biomedical purposes; does not include the study of population origins or ancestry.",
    },
    {
        "name": "DS", "class": "primary", "param": "required",
        "short": "Disease-specific research and clinical care",
        "long_with_param": "Use of the data must be related to disease \"[]\".",
    },
    {
        "name": "POA", "class": "primary", "param": "none",
        "short": "Population origins/ancestry research",
        "long": "Use of the data is limited to the study of population origins or ancestry.",
    },
    {
        "name": "RS", "class": "secondary", "param": "required",
        "short": "Other research-specific restrictions",
        "long_with_param": "Use of the data is limited to studies of research type \"[]\".",
    },
    {
        "name": "RUO", "class": "secondary", "param": "none",
        "short": "Research use only",
        "long": "Use of data is limited to research purposes (e.g., does not include its use in clinical care).",
    },
    {
        "name": "NMDS", "class": "secondary", "param": "none",
        "short": "No \"general methods\" research",
        "long": "Use of the data includes methods development research (e.g., development of software or algorithms) ONLY within the bounds of other data use limitations.",
    },
    {
        "name": "GSO", "class": "secondary", "param": "none",
        "short": "Genetic studies only",
        "long": "Use of the data is limited to genetic studies only (i.e., no \"phenotype-only\" research).",
    },
    {
        "name": "NPU", "class": "secondary", "param": "none",
        "short": "Not-for-profit use only",
        "long": "Use of the data is limited to not-for-profit organizations.",
    },
    {
        "name": "PUB", "class": "secondary", "param": "none",
        "short": "Publication required",
        "long": "Requestor agrees to make results of studies using the data available to the larger scientific community.",
    },
    {
        "name": "COL", "class": "secondary", "param": "none",
        "short": "Collaboration required",
        "long": "Requestor must agree to collaboration with the primary study investigator(s).",
    },
    {
        "name": "IRB", "class": "secondary", "param": "none",
        "short": "Ethics approval required",
        "long": "Requestor must provide documentation of local IRB/REC approval.",
    },
    {
        "name": "GS", "class": "secondary", "param": "required",
        "short": "Geographical restrictions",
        "long_with_param": "Use of the data is limited to within geographic region \"[]\".",
    },
    {
        "name": "MOR", "class": "secondary", "param": "required",
        "short": "Publication moratorium/embargo",
        "long_with_param": "Requestor agrees not to publish results of studies until the date [].",
    },
    {
        "name": "TS", "class": "secondary", "param": "required",
        "short": "Time limits on use",
        "long_with_param": "Use of data is approved for [] months.",
    },
    {
        "name": "US", "class": "secondary", "param": "none",
        "short": "User-specific restrictions",
        "long": "Use of data is limited to use by approved users.",
    },
    {
        "name": "PS", "class": "secondary", "param": "none",
        "short": "Project-specific restrictions",
        "long": "Use of data is limited to use within an approved project.",
    },
    {
        "name": "IS", "class": "secondary", "param": "none",
        "short": "Institution-specific restrictions",
        "long": "Use of data is limited to use within an approved institution.",
    },
    {
        "name": "MDS", "class": "secondary", "param": "none",
        "short": "",
        "long": "Use of the data includes methods development research (e.g., development of software or algorithms)",
    },
]

# Index of the same entries keyed by symbol name, for direct lookup.
symbol_defs = {entry["name"]: entry for entry in symbol_defs_list}


def make_csv_with_descriptions():
  """Annotate the catalog CSV with dbGaP and generated consent descriptions.

  Reads the file named by csv_name, computes six extra columns for every row
  with get_row_descriptions, and writes two CSV files whose names are
  csv_name with a suffix inserted before the first "." (or appended when the
  name has no "."):

    * "-WithConsents": every row, including the new columns.
    * "-Errors": only the rows whose "generatedError" is non-empty, reduced
      to the columns needed to investigate them.
  """
  # keep_default_na=False keeps cells such as "NA" as literal strings, which
  # get_row_descriptions compares against.
  df = pandas.read_csv(csv_name, header=0, keep_default_na=False)

  # Caches shared across rows, filled in by get_row_descriptions and keyed by
  # dbGaP study number, so each study is only fetched once.
  descriptions = {}
  code_lists = {}

  new_columns = ["dbgapConsents", "consentFullText", "consentShortText", "generatedFullText", "generatedShortText", "generatedError"]
  df[new_columns] = [
    get_row_descriptions(phs_id, code, descriptions, code_lists)
    for phs_id, code in zip(df["tagsSheet:tag:tags:dbGaP"], df["library:dataUseRestriction"])
  ]

  # Matches the empty position just before the first "." or, failing that,
  # the end of the name. `count` is passed by keyword because passing it
  # positionally to re.sub is deprecated.
  suffix_position = "(?=\\.)|$"
  df.to_csv(re.sub(suffix_position, "-WithConsents", csv_name, count=1), index=False)

  error_columns = ["name", "tagsSheet:tag:tags:dbGaP", "library:dataUseRestriction", "generatedFullText", "generatedError"]
  errors_df = df.loc[df["generatedError"] != "", error_columns]
  errors_df.to_csv(re.sub(suffix_position, "-Errors", csv_name, count=1), index=False)

def get_row_descriptions(phs_id, consent_code, descriptions, code_lists):
  """Compute the six description fields for one catalog row.

  Args:
    phs_id: Text searched for a dbGaP accession ("phs" followed by digits).
    consent_code: The row's data use restriction code. "NA", "" and
      "Unspecified" are treated as "no code".
    descriptions: Cache mapping study number to that study's
      {consent code: (full text, short text)} dict; updated in place.
    code_lists: Cache mapping study number to the study's joined list of
      consent codes; updated in place.

  Returns:
    A 6-tuple: (study consent codes, dbGaP full text, dbGaP short text,
    generated full text, generated short text, generated error). The dbGaP
    fields are "" when there is no accession or no code, and the text fields
    are "Missing" when the study does not list the code. All fields are ""
    when there is no code.
  """
  accession = phs_id_re.search(phs_id)

  if consent_code in ("NA", "", "Unspecified"):
    # Nothing to look up and nothing to generate from.
    return ("",) * 6

  if accession is None:
    dbgap_fields = ("", "", "")
  else:
    study_number = accession.group(1)
    if study_number not in descriptions:
      # First time this study is seen: fetch it and remember the result.
      fetched_descs, fetched_codes = get_descriptions_from_study(study_number)
      descriptions[study_number] = fetched_descs
      code_lists[study_number] = fetched_codes
    study_texts = descriptions[study_number].get(consent_code, ("Missing", "Missing"))
    dbgap_fields = (code_lists[study_number],) + study_texts

  return dbgap_fields + generate_descriptions(consent_code)

def generate_descriptions(code):
  """Generate full and short consent descriptions from a restriction code.

  The code is split on "-", "_" and "," (plus any whitespace following a
  comma). Each piece found in symbol_defs is a primary or secondary symbol;
  any other piece is treated as a parameter. A code is expected to consist of
  one primary symbol followed by secondary symbols, where a symbol that takes
  a parameter is immediately followed by it.

  Args:
    code: The data use restriction code, e.g. "DS-CVD-IRB-COL-MDS".

  Returns:
    A tuple (long, short, error). long holds one sentence per symbol,
    separated by newlines. short is the primary symbol's short text followed,
    in parentheses, by its parameter and the names of the secondary symbols
    (a secondary symbol's parameter is appended as "NAME-PARAM"). error is ""
    on success; otherwise it describes the first problem found, and long and
    short hold whatever had been generated up to that point.
  """
  # Raw string so "\s" is a regex class rather than an invalid string escape.
  # Pieces that are not known symbols become placeholder "parameter" entries.
  symbols = [symbol_defs.get(symbol, {"name": symbol, "class": "parameter"}) for symbol in re.split(r"-|_|,\s*", code)]

  short = ""
  long = ""
  error = ""
  i = 0
  # The first symbol must be primary; every symbol after it must be secondary.
  current_class = "primary"
  short_has_parens = False

  while i < len(symbols):
    symbol = symbols[i]
    name = symbol["name"]

    if symbol["class"] != current_class:
      if symbol["class"] == "parameter":
        # A parameter here was not consumed by the preceding symbol, so it is
        # either an unrecognized symbol or a parameter nothing asked for.
        error = f'Unknown symbol "{name}"'
      else:
        symbol_class = symbol["class"]
        error = f'Invalid position for {symbol_class} symbol "{name}"'
      break

    if current_class == "primary":
      short += symbol["short"]
    else:
      # Secondary symbols are listed by name inside the parentheses; skip the
      # separator when this is the first item after the opening paren.
      if short[-1] != "(":
        short += ", "
      short += name
      long += "\n"

    has_next = len(symbols) > i + 1
    takes_next_as_param = (
      has_next
      and symbols[i + 1]["class"] == "parameter"
      and symbol["param"] != "none"
    )

    if not takes_next_as_param:
      if symbol["param"] == "required":
        error = f'Missing required parameter to "{name}"'
        break
      if current_class == "primary" and has_next:
        # Open the parenthesized list that the following symbols will fill.
        short += " ("
        short_has_parens = True
      long += symbol["long"]
    else:
      param_name = symbols[i + 1]["name"]
      if current_class == "primary":
        short += " ("
        short_has_parens = True
      else:
        short += "-"
      short += param_name
      long += symbol["long_with_param"].replace("[]", param_name)
      # Step over the parameter that was just consumed.
      i += 1

    i += 1

    if current_class == "primary":
      current_class = "secondary"

  if short_has_parens:
    short += ")"

  return (long, short, error)

def get_descriptions_from_study(dbgap_id):
  """Fetch the consent group descriptions of a dbGaP study.

  Requests the study's XML from the dbGaP web service and reads the consent
  groups of the <Study> element with the highest "v" attribute.

  Args:
    dbgap_id: The numeric part of the study accession, as a string.

  Returns:
    A tuple (study_descs, all_codes). study_descs maps each consent group
    name to (use restriction text, title); the text is "Missing" when the
    group has no use restriction text, and a name that occurs more than once
    maps to ("Duplicate", "Duplicate"). all_codes is every consent group
    name, in document order, joined with ", ". When the response contains no
    <Study> element the result is ({}, "").

  Raises:
    requests.RequestException: If the request fails, times out, or the
      service answers with an HTTP error status.
    xml.etree.ElementTree.ParseError: If the response is not valid XML.
  """
  # A timeout keeps one unresponsive request from hanging the whole run.
  response = requests.get(
    "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + dbgap_id,
    timeout=30,
  )
  # Fail with a clear HTTP error instead of trying to parse an error page.
  response.raise_for_status()
  root = ET.fromstring(response.text)

  latest_study = None
  latest_version = -1
  for study in root.findall("./Study"):
    study_version = int(study.get("v"))
    if study_version > latest_version:
      latest_study = study
      latest_version = study_version

  if latest_study is None:
    # No study in the response: report no consent groups rather than crashing
    # on None below.
    return ({}, "")

  study_descs = {}
  all_codes = []
  for group in latest_study.findall("./Policy/ConsentGroup"):
    restriction = group.find("Use-Restriction")
    if restriction is None or restriction.text is None:
      long_text = "Missing"
    else:
      long_text = restriction.text.strip()
    code = group.attrib["name"]
    all_codes.append(code)
    if code in study_descs:
      # The same name twice is ambiguous, so neither description is trusted.
      study_descs[code] = ("Duplicate", "Duplicate")
    else:
      study_descs[code] = (long_text, group.attrib["title"])
  return (study_descs, ", ".join(all_codes))


# Manual spot check of the generator on a code with a primary symbol, its
# parameter, and several secondary symbols (uncomment to run):
#print(generate_descriptions("DS-CVD-IRB-COL-MDS"))

# Entry point: runs the full CSV annotation whenever this code is executed.
make_csv_with_descriptions()