In [1]:
import pandas as pd
import re
import numpy as np


class Form:
    """Parser for a plain-text patient record organised into named sections.

    The parsing patterns below look for ``CONDITIONS:`` and ``OBSERVATIONS:``
    headers and for a rule of 80 dashes that closes a section. The file is
    re-read on every ``process_*`` call.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text record to parse.
    """

    # Column layouts; also used for the empty frames returned when a section
    # cannot be found.
    _CONDITION_COLUMNS = ["start", "end", "description"]
    _OBSERVATION_COLUMNS = ["date", "content"]

    # The closing rule is matched with a multiline ``^`` so that a section with
    # no entries stops at its own rule instead of running on into the next
    # section of the record.
    _CONDITIONS_SECTION_RE = re.compile(
        r"CONDITIONS:\n(.*?)^-{80}", re.DOTALL | re.MULTILINE
    )
    _OBSERVATIONS_SECTION_RE = re.compile(
        r"OBSERVATIONS:(.*?)(?=IMMUNIZATIONS:)", re.DOTALL
    )
    # "<start> - <end or blank> : <description>"
    _CONDITION_ENTRY_RE = re.compile(
        r"\s*(\d{4}-\d{2}-\d{2}) -\s*(\d{4}-\d{2}-\d{2}|)\s*:\s*(.*)"
    )
    # An observation starts with a date at the beginning of a line; dates that
    # appear inside an observation's text must not start a new entry.
    _OBSERVATION_START_RE = re.compile(r"^[ \t]*(\d{4}-\d{2}-\d{2})", re.MULTILINE)
    # Dash rule sitting between the last observation and the next header.
    _TRAILING_RULE_RE = re.compile(r"\n-{80,}\s*\Z")

    def __init__(self, file_path):
        self.file_path = file_path

    def _read_text(self):
        """Return the full contents of the record file."""
        with open(self.file_path, "r") as file:
            return file.read()

    def process_conditions_section(self):
        """Parse the CONDITIONS section into a DataFrame.

        Returns
        -------
        pandas.DataFrame
            One row per parsed entry with columns ``start``, ``end`` and
            ``description``. ``end`` is ``None`` when the entry has no end
            date. Lines that do not match the entry pattern are skipped. The
            frame is empty (with the same columns) when the section is absent
            or has no entries.
        """
        section = self._CONDITIONS_SECTION_RE.search(self._read_text())
        if section is None:
            return pd.DataFrame(columns=self._CONDITION_COLUMNS)

        records = []
        for entry in section.group(1).strip().split("\n"):
            match = self._CONDITION_ENTRY_RE.match(entry)
            if match:
                start, end, description = match.groups()
                # An empty end-date capture means the entry has no end date.
                records.append((start, end or None, description))

        return pd.DataFrame(records, columns=self._CONDITION_COLUMNS)

    def process_observations_section(self):
        """Parse the OBSERVATIONS section into a DataFrame.

        An entry begins at a line that starts with a ``YYYY-MM-DD`` date and
        extends (across continuation lines) up to the next such line.

        Returns
        -------
        pandas.DataFrame
            One row per entry with columns ``date`` and ``content``; the
            separating ``:`` and surrounding whitespace are stripped from
            ``content``. The frame is empty (with the same columns) when the
            section is absent or has no entries.
        """
        section = self._OBSERVATIONS_SECTION_RE.search(self._read_text())
        if section is None:
            return pd.DataFrame(columns=self._OBSERVATION_COLUMNS)

        # The capture runs up to the next header, so it ends with the dash rule
        # that closes this section; drop it so it does not leak into the last
        # observation's content.
        body = self._TRAILING_RULE_RE.sub("", section.group(1))

        starts = list(self._OBSERVATION_START_RE.finditer(body))
        ends = [marker.start() for marker in starts[1:]] + [len(body)]

        records = []
        for marker, end in zip(starts, ends):
            content = body[marker.end():end].strip()
            if content.startswith(":"):
                content = content[1:].strip()
            records.append((marker.group(1), content))

        return pd.DataFrame(records, columns=self._OBSERVATION_COLUMNS)

In [2]:
# Point the parser at the sample record; the relative path is resolved against
# the notebook's working directory when the file is opened.
form = Form("shortened_example.txt")

In [3]:
# Parse the OBSERVATIONS section into a frame with `date` and `content` columns,
# then preview up to the first 30 rows.
obs_df = form.process_observations_section()
obs_df.head(30)

Unnamed: 0,date,content
0,2014-04-24,Cause of Death [US Standard Certificate of Dea...
1,2014-03-27,Patient Health Questionnaire 2 item (PHQ-2) to...
2,2014-03-27,Total score [HARK] 0.0 {...
3,2014-03-27,Fall risk level [Morse Fall Scale] High ...
4,2014-03-27,Fall risk total [Morse Fall Scale] 101.0...
5,2014-03-27,Generalized anxiety disorder 7 item (GAD-7) to...
6,2014-03-27,Protocol for Responding to and Assessing Patie...
7,2014-03-27,Tobacco smoking status Never...
8,2014-03-27,"Carbon dioxide, total [Moles/volume] in Blood ..."
9,2014-03-27,Chloride [Moles/volume] in Blood 109.7...


In [4]:
# Parse the CONDITIONS section into a frame with `start`, `end` and `description`
# columns (`end` is None when no end date is given), then preview up to 30 rows.
cond_df = form.process_conditions_section()
cond_df.head(30)

Unnamed: 0,start,end,description
0,2014-04-03,,Viral sinusitis (disorder)
1,2014-03-27,,Part-time employment (finding)
2,2014-03-27,2014-03-27,Medication review due (situation)
3,2013-05-23,,Stress (finding)
4,2013-05-23,2014-03-27,Not in labor force (finding)
5,2013-05-23,2013-05-23,Medication review due (situation)
6,2012-05-17,2012-05-31,Gingivitis (disorder)
7,2012-05-17,2013-05-23,Reports of violence in the environment (finding)
8,2012-05-17,2012-05-17,Medication review due (situation)
9,2011-06-23,2011-07-06,Acute bronchitis (disorder)


In [6]:
# Conditions with no recorded end date.
cond_df.loc[cond_df["end"].isna()]

Unnamed: 0,start,end,description
0,2014-04-03,,Viral sinusitis (disorder)
1,2014-03-27,,Part-time employment (finding)
3,2013-05-23,,Stress (finding)
19,2009-04-30,,Alzheimer's disease (disorder)


In [None]:
# Full, untruncated text of the observation at position 29.
obs_df["content"].iloc[29]

"Protocol for Responding to and Assessing Patients' Assets, Risks, and Experiences [PRAPARE]\n           - Within the last year, have you been afraid of your partner or ex-partner? No \n           - Do you feel physically and emotionally safe where you currently live? Yes \n           - Are you a refugee?                       No \n           - In the past year, have you spent more than 2 nights in a row in a jail, prison, detention center, or juvenile correctional facility? No \n           - Stress level                             Not at all \n           - How often do you see or talk to people that you care about and feel close to (For example: talking to friends on the phone, visiting friends or family, going to church or club meetings)? 3 to 5 times a week \n           - Has lack of transportation kept you from medical appointments, meetings, work, or from getting things needed for daily living No \n           - In the past year, have you or any family members you live with been u

In [None]:
# Load the CDSi schedule supporting-data CSV and preview its first rows.
# NOTE(review): the constant is named CONDITIONS_PATH, but the file name refers
# to "Coded Observations" — confirm the name reflects the intended contents.
CONDITIONS_PATH = "category_generator/data/CDSi ScheduleSupportingData- Coded Observations-508_v4.60_withRSV.csv"
codes_df = pd.read_csv(CONDITIONS_PATH)
codes_df.head()

Unnamed: 0,Observation Code,Observation Title,Indication Text Description,Contraindication Text Description,Clarifying Text,SNOMED (Code),CVX (Code),PHIN VS (Code),Relevance to RSV for 60-74y?,CDSi Observation Notes relevant to RSV for 60-74y,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,1.0,Patient seeks protection,Administer to persons seeking protection.,,,,,,,,...,,,,,,,,,,
1,2.0,Undergoing elective splenectomy,Administer to persons who are undergoing elect...,,Vaccination 14 or more days before splenectomy...,,,,,,...,,,,,,,,,,
2,3.0,Immunocompromised,,Do not vaccinate if the patient is immunocompr...,,Patient Immunocompromised (370388006),,Immunodeficiency due to any cause (VXC27),,,...,,,,,,,,,,
3,4.0,Recipient of a hematopoietic stem cell transplant,Administer to recipients of a hematopoietic st...,,,Hemopoietic stem cell transplant [procedure] (...,,,Indication,,...,,,,,,,,,,
4,5.0,Hepatitis C virus infection,Administer to persons with a hepatitis C virus...,,,Viral hepatitis type C [disorder] (50711007),,,,,...,,,,,,,,,,


In [None]:
# Number of missing values in each column.
codes_df.isna().sum()

Observation Code                                     732
Observation Title                                    732
Indication Text Description                          834
Contraindication Text Description                    883
Clarifying Text                                      941
SNOMED (Code)                                        852
CVX (Code)                                           968
PHIN VS (Code)                                       967
Relevance to RSV for 60-74y?                         961
CDSi Observation Notes relevant to RSV for 60-74y    989
Unnamed: 10                                          999
Unnamed: 11                                          999
Unnamed: 12                                          999
Unnamed: 13                                          999
Unnamed: 14                                          999
Unnamed: 15                                          999
Unnamed: 16                                          999
Unnamed: 17                    

In [None]:
# Remove the "Unnamed: 10" .. "Unnamed: 25" columns, which appear to be entirely
# empty according to the missing-value counts.
# Rebinding with errors="ignore" (rather than inplace=True) keeps this cell safe
# to re-run: a second execution finds the columns already gone and is a no-op
# instead of raising KeyError.
cols_to_drop = [f"Unnamed: {i}" for i in range(10, 25 + 1)]
codes_df = codes_df.drop(columns=cols_to_drop, errors="ignore")

In [None]:
# Display the frame after the "Unnamed: N" columns have been dropped.
codes_df

Unnamed: 0,Observation Code,Observation Title,Indication Text Description,Contraindication Text Description,Clarifying Text,SNOMED (Code),CVX (Code),PHIN VS (Code),Relevance to RSV for 60-74y?,CDSi Observation Notes relevant to RSV for 60-74y
0,1.0,Patient seeks protection,Administer to persons seeking protection.,,,,,,,
1,2.0,Undergoing elective splenectomy,Administer to persons who are undergoing elect...,,Vaccination 14 or more days before splenectomy...,,,,,
2,3.0,Immunocompromised,,Do not vaccinate if the patient is immunocompr...,,Patient Immunocompromised (370388006),,Immunodeficiency due to any cause (VXC27),,
3,4.0,Recipient of a hematopoietic stem cell transplant,Administer to recipients of a hematopoietic st...,,,Hemopoietic stem cell transplant [procedure] (...,,,Indication,
4,5.0,Hepatitis C virus infection,Administer to persons with a hepatitis C virus...,,,Viral hepatitis type C [disorder] (50711007),,,,
...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,
995,,,,,,,,,,
996,,,,,,,,,,
997,,,,,,,,,,


In [None]:
# Keep only the rows that precede the first completely blank (all-NaN) row.
# idxmax() on an all-False mask returns the first label, which would silently
# yield an empty frame when no blank row exists; it also returns a label while
# iloc expects a position. Compute the position explicitly and fall back to
# len(codes_df) so that a frame without blank rows is kept whole.
blank_row_mask = codes_df.isna().all(axis=1).to_numpy()
first_nan_row = int(blank_row_mask.argmax()) if blank_row_mask.any() else len(codes_df)
conditions_df = codes_df.iloc[:first_nan_row]

In [None]:
# NOTE(review): this displays the untrimmed `codes_df` (blank trailing rows
# included); the trimmed frame computed in the previous cell is `conditions_df`
# — confirm which one was meant to be shown here.
codes_df

Unnamed: 0,Observation Code,Observation Title,Indication Text Description,Contraindication Text Description,Clarifying Text,SNOMED (Code),CVX (Code),PHIN VS (Code),Relevance to RSV for 60-74y?,CDSi Observation Notes relevant to RSV for 60-74y
0,1.0,Patient seeks protection,Administer to persons seeking protection.,,,,,,,
1,2.0,Undergoing elective splenectomy,Administer to persons who are undergoing elect...,,Vaccination 14 or more days before splenectomy...,,,,,
2,3.0,Immunocompromised,,Do not vaccinate if the patient is immunocompr...,,Patient Immunocompromised (370388006),,Immunodeficiency due to any cause (VXC27),,
3,4.0,Recipient of a hematopoietic stem cell transplant,Administer to recipients of a hematopoietic st...,,,Hemopoietic stem cell transplant [procedure] (...,,,Indication,
4,5.0,Hepatitis C virus infection,Administer to persons with a hepatitis C virus...,,,Viral hepatitis type C [disorder] (50711007),,,,
...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,
995,,,,,,,,,,
996,,,,,,,,,,
997,,,,,,,,,,


In [None]:
# Print the full indication text of the row at position 4 (the frame display
# truncates long strings), followed by the whole row.
print(codes_df.iloc[4]["Indication Text Description"])
print(codes_df.iloc[4])

Administer to persons with a hepatitis C virus infection.
Observation Code                                                                                   5.0
Observation Title                                                          Hepatitis C virus infection
Indication Text Description                          Administer to persons with a hepatitis C virus...
Contraindication Text Description                                                                  NaN
Clarifying Text                                                                                    NaN
SNOMED (Code)                                             Viral hepatitis type C [disorder] (50711007)
CVX (Code)                                                                                         NaN
PHIN VS (Code)                                                                                     NaN
Relevance to RSV for 60-74y?                                                                       NaN
CDSi Observatio

In [None]:
# Missing-value counts per column after dropping the "Unnamed: N" columns.
# NOTE(review): this still inspects `codes_df`, so the blank trailing rows are
# counted; use `conditions_df` if the trimmed frame was intended.
codes_df.isna().sum()

Observation Code                                     732
Observation Title                                    732
Indication Text Description                          834
Contraindication Text Description                    883
Clarifying Text                                      941
SNOMED (Code)                                        852
CVX (Code)                                           968
PHIN VS (Code)                                       967
Relevance to RSV for 60-74y?                         961
CDSi Observation Notes relevant to RSV for 60-74y    989
dtype: int64