In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-delimited sheet into `data`.
data = pd.read_csv("./data/data.txt", sep="\t")
# Drop rows, then columns, that hold no values at all.
data = data.dropna(axis=0, how="all").dropna(axis=1, how="all")
# Trim surrounding whitespace from every string cell; non-string cells
# (e.g. NaN) pass through unchanged.  Column-wise Series.map is used instead
# of DataFrame.applymap, which is deprecated since pandas 2.1.
data = data.apply(
    lambda column: column.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
)
data

Unnamed: 0,SUBJECTIVE SYMPTOM,ASSOCIATED SYMPTOMS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,INVESTIGATIONS DONE,Unnamed: 14,...,PROVISIONAL DIAGNOSIS,Unnamed: 25,Unnamed: 26,ADVISED INVESTIGATIONS,Unnamed: 32,Unnamed: 33,MANAGEMENT,Unnamed: 36,Unnamed: 37,SURGICAL MANAGEMENT
0,nasal,sneezing,asthma,stress,enhancing factors,itchy throat,red eyes,,blood tests,blood IgE,...,allergy,,,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing,allergy,enhancing factors,reducing factors,,,,blood tests,blood IgE,...,allergy,,,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose,skin allergy,anxiety,reducing factors,treatment history,,,blood tests,blood IgE,...,allergy,vasomotor,,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,sneezing,runny nose,blockage in nose,,,,,blood tests,blood IgE,...,allergy,nasal polyposis,,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
4,nasal,runny nose,blockage in nose,,,,,,blood tests,,...,vasomotor,deviated nasal septum,nasal polyposis,avoid stress/anxiety,nasal spray - decongestant,,medical and surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,male breast,swelling in breast area in males,pain and tenderness in male breast,,,,,,,,...,gynecomastia,,,FNAC and ultrasound breast,harmone studies,,,,,surgery
84,chest pain,discomfort in left side of chest,"referred pain in jaw, neck, back","referred pain left shoulder, arm",fainting sweating,shortness of breath,,,,,...,heart attack,,,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
85,chest pain,sharp chest pain while breating,pain stops on holding breath,,,,,,,,...,pleurisy,,,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
86,chest pain,pain in chest few inches away from midline,pain in chest wall on stress and anxiety,pain in chest wall on viral infection,local tenderness on chest wall,pain in floating ribs area,,,,,...,costochondritis,,,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [3]:
# Give every "Unnamed" column the name of the closest named column to its
# left, normalised to lower case with spaces replaced by "_".  Columns that
# belong to the same header group therefore end up sharing one (duplicated)
# label.
columns = []
# Explicitly reset on every run so the result never depends on a value left
# over in the kernel from a previous execution.
last_named_column = None
for column in data.columns:
    if "Unnamed" not in str(column):
        last_named_column = str(column).lower().replace(" ", "_")
    # An unnamed column that appears before any named one has nothing to
    # inherit from; keep its original label instead of failing.
    columns.append(column if last_named_column is None else last_named_column)
data.columns = columns

data

Unnamed: 0,subjective_symptom,associated_symptoms,associated_symptoms.1,associated_symptoms.2,associated_symptoms.3,associated_symptoms.4,associated_symptoms.5,associated_symptoms.6,investigations_done,investigations_done.1,...,provisional_diagnosis,provisional_diagnosis.1,provisional_diagnosis.2,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing,asthma,stress,enhancing factors,itchy throat,red eyes,,blood tests,blood IgE,...,allergy,,,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing,allergy,enhancing factors,reducing factors,,,,blood tests,blood IgE,...,allergy,,,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose,skin allergy,anxiety,reducing factors,treatment history,,,blood tests,blood IgE,...,allergy,vasomotor,,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,sneezing,runny nose,blockage in nose,,,,,blood tests,blood IgE,...,allergy,nasal polyposis,,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
4,nasal,runny nose,blockage in nose,,,,,,blood tests,,...,vasomotor,deviated nasal septum,nasal polyposis,avoid stress/anxiety,nasal spray - decongestant,,medical and surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,male breast,swelling in breast area in males,pain and tenderness in male breast,,,,,,,,...,gynecomastia,,,FNAC and ultrasound breast,harmone studies,,,,,surgery
84,chest pain,discomfort in left side of chest,"referred pain in jaw, neck, back","referred pain left shoulder, arm",fainting sweating,shortness of breath,,,,,...,heart attack,,,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
85,chest pain,sharp chest pain while breating,pain stops on holding breath,,,,,,,,...,pleurisy,,,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
86,chest pain,pain in chest few inches away from midline,pain in chest wall on stress and anxiety,pain in chest wall on viral infection,local tenderness on chest wall,pain in floating ribs area,,,,,...,costochondritis,,,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [4]:
# Collapse the repeated "associated_symptoms" columns into a single column
# whose value is the row's non-missing entries joined with "|".
# Selecting a duplicated label yields a DataFrame; after the merge it yields
# a Series, so the guard makes this cell safe to re-run.
if isinstance(data["associated_symptoms"], pd.DataFrame):
    # Position of the first column of the group, so the merged column stays
    # where the group started.
    position = list(data.columns).index("associated_symptoms")
    # One pass over the rows; a row with no values becomes an empty string.
    merged = [
        "|".join(value for value in values if pd.notna(value))
        for values in data["associated_symptoms"].to_numpy()
    ]
    data = data.drop(columns="associated_symptoms")
    data.insert(position, column="associated_symptoms", value=merged)

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,investigations_done.1,gender,age,age.1,age.2,age.3,provisional_diagnosis,provisional_diagnosis.1,provisional_diagnosis.2,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests,blood IgE,both,all ages,,,,allergy,,,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests,blood IgE,both,all ages,,,,allergy,,,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests,blood IgE,both,all ages,,,,allergy,vasomotor,,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,sneezing|runny nose|blockage in nose,blood tests,blood IgE,both,all ages,,,,allergy,nasal polyposis,,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
4,nasal,runny nose|blockage in nose,blood tests,,both,all ages,,,,vasomotor,deviated nasal septum,nasal polyposis,avoid stress/anxiety,nasal spray - decongestant,,medical and surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,male breast,swelling in breast area in males|pain and tend...,,,male,above 50 years,,,,gynecomastia,,,FNAC and ultrasound breast,harmone studies,,,,,surgery
84,chest pain,discomfort in left side of chest|referred pain...,,,both,all ages,,,,heart attack,,,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
85,chest pain,sharp chest pain while breating|pain stops on ...,,,both,all ages,,,,pleurisy,,,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
86,chest pain,pain in chest few inches away from midline|pai...,,,both,all ages,,,,costochondritis,,,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [5]:
# Collapse the repeated "investigations_done" columns into a single column
# whose value is the row's non-missing entries joined with "|".
# Selecting a duplicated label yields a DataFrame; after the merge it yields
# a Series, so the guard makes this cell safe to re-run.
if isinstance(data["investigations_done"], pd.DataFrame):
    # Position of the first column of the group, so the merged column stays
    # where the group started.
    position = list(data.columns).index("investigations_done")
    # One pass over the rows; a row with no values becomes an empty string.
    merged = [
        "|".join(value for value in values if pd.notna(value))
        for values in data["investigations_done"].to_numpy()
    ]
    data = data.drop(columns="investigations_done")
    data.insert(position, column="investigations_done", value=merged)

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,age.1,age.2,age.3,provisional_diagnosis,provisional_diagnosis.1,provisional_diagnosis.2,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,both,all ages,,,,allergy,,,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests|blood IgE,both,all ages,,,,allergy,,,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests|blood IgE,both,all ages,,,,allergy,vasomotor,,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,sneezing|runny nose|blockage in nose,blood tests|blood IgE,both,all ages,,,,allergy,nasal polyposis,,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
4,nasal,runny nose|blockage in nose,blood tests,both,all ages,,,,vasomotor,deviated nasal septum,nasal polyposis,avoid stress/anxiety,nasal spray - decongestant,,medical and surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,male breast,swelling in breast area in males|pain and tend...,,male,above 50 years,,,,gynecomastia,,,FNAC and ultrasound breast,harmone studies,,,,,surgery
84,chest pain,discomfort in left side of chest|referred pain...,,both,all ages,,,,heart attack,,,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
85,chest pain,sharp chest pain while breating|pain stops on ...,,both,all ages,,,,pleurisy,,,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
86,chest pain,pain in chest few inches away from midline|pai...,,both,all ages,,,,costochondritis,,,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [6]:
# Turn the repeated "age" columns into a single "age" column by emitting one
# record per non-missing age value (all other cells of the record are copied).
# The guard (duplicated label -> DataFrame) makes the cell safe to re-run.
if isinstance(data["age"], pd.DataFrame):
    # Position of the first "age" column, so the single column replaces the group in place.
    position = list(data.columns).index("age")
    # Non-missing ages per row, in column order.  A row without any age keeps
    # one record with a missing age instead of raising IndexError.
    ages_per_row = [
        [value for value in values if pd.notna(value)] or [np.nan]
        for values in data["age"].to_numpy()
    ]
    repeats = np.asarray([len(ages) for ages in ages_per_row], dtype=int)
    # Repeat every source row once per age value.  DataFrame.explode is not
    # usable here because the frame still has duplicated column labels.
    row_positions = np.repeat(np.arange(len(data)), repeats)
    processed_data = (
        data.drop(columns="age").iloc[row_positions].reset_index(drop=True)
    )
    processed_data.insert(
        position,
        column="age",
        value=[age for ages in ages_per_row for age in ages],
    )

    data = processed_data

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,provisional_diagnosis.1,provisional_diagnosis.2,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,both,all ages,allergy,,,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests|blood IgE,both,all ages,allergy,,,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests|blood IgE,both,all ages,allergy,vasomotor,,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,sneezing|runny nose|blockage in nose,blood tests|blood IgE,both,all ages,allergy,nasal polyposis,,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
4,nasal,runny nose|blockage in nose,blood tests,both,all ages,vasomotor,deviated nasal septum,nasal polyposis,avoid stress/anxiety,nasal spray - decongestant,,medical and surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,male breast,swelling in breast area in males|pain and tend...,,male,above 50 years,gynecomastia,,,FNAC and ultrasound breast,harmone studies,,,,,surgery
87,chest pain,discomfort in left side of chest|referred pain...,,both,all ages,heart attack,,,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
88,chest pain,sharp chest pain while breating|pain stops on ...,,both,all ages,pleurisy,,,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
89,chest pain,pain in chest few inches away from midline|pai...,,both,all ages,costochondritis,,,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [7]:
# Turn the repeated "provisional_diagnosis" columns into a single column by
# emitting one record per non-missing diagnosis (all other cells are copied).
# The guard (duplicated label -> DataFrame) makes the cell safe to re-run.
if isinstance(data["provisional_diagnosis"], pd.DataFrame):
    # Position of the first column of the group, so the single column replaces it in place.
    position = list(data.columns).index("provisional_diagnosis")
    # Non-missing diagnoses per row, in column order.  A row without any
    # diagnosis keeps one record with a missing value instead of raising
    # IndexError.
    diagnoses_per_row = [
        [value for value in values if pd.notna(value)] or [np.nan]
        for values in data["provisional_diagnosis"].to_numpy()
    ]
    repeats = np.asarray(
        [len(diagnoses) for diagnoses in diagnoses_per_row], dtype=int
    )
    # Repeat every source row once per diagnosis.  DataFrame.explode is not
    # usable here because the frame still has duplicated column labels.
    row_positions = np.repeat(np.arange(len(data)), repeats)
    processed_data = (
        data.drop(columns="provisional_diagnosis")
        .iloc[row_positions]
        .reset_index(drop=True)
    )
    processed_data.insert(
        position,
        column="provisional_diagnosis",
        value=[
            diagnosis
            for diagnoses in diagnoses_per_row
            for diagnosis in diagnoses
        ],
    )

    data = processed_data

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,both,all ages,allergy,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests|blood IgE,both,all ages,allergy,antihistamine,nasal spray - decongestant,,,,,
2,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests|blood IgE,both,all ages,allergy,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
3,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests|blood IgE,both,all ages,vasomotor,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
4,nasal,sneezing|runny nose|blockage in nose,blood tests|blood IgE,both,all ages,allergy,antihistamine,nasal spray - steroid,,medical then surgical,,,FESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,male breast,swelling in breast area in males|pain and tend...,,male,above 50 years,gynecomastia,FNAC and ultrasound breast,harmone studies,,,,,surgery
121,chest pain,discomfort in left side of chest|referred pain...,,both,all ages,heart attack,ECG,,,limit physical activities,call nerarby hospital and ambulance,do not drive,
122,chest pain,sharp chest pain while breating|pain stops on ...,,both,all ages,pleurisy,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
123,chest pain,pain in chest few inches away from midline|pai...,,both,all ages,costochondritis,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,


In [8]:
# Expand records whose gender contains "both" into two records, one for
# "female" and one for "male" (in that order); every other record is kept
# as is.  Non-string genders (e.g. a missing value) are passed through
# instead of raising TypeError on the substring test.
genders_per_row = [
    ["female", "male"] if isinstance(gender, str) and "both" in gender else [gender]
    for gender in data["gender"]
]
repeats = np.asarray([len(genders) for genders in genders_per_row], dtype=int)
# Repeat each source row once per resulting gender, then overwrite the
# gender column with the flattened values; the index is renumbered from 0.
processed_data = data.iloc[np.repeat(np.arange(len(data)), repeats)].reset_index(
    drop=True
)
processed_data["gender"] = [
    gender for genders in genders_per_row for gender in genders
]

data = processed_data

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,all ages,allergy,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,male,all ages,allergy,antihistamine,nasal spray - steroid,,,,,
2,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests|blood IgE,female,all ages,allergy,antihistamine,nasal spray - decongestant,,,,,
3,nasal,sneezing|allergy|enhancing factors|reducing fa...,blood tests|blood IgE,male,all ages,allergy,antihistamine,nasal spray - decongestant,,,,,
4,nasal,runny nose|skin allergy|anxiety|reducing facto...,blood tests|blood IgE,female,all ages,allergy,avoid stress/anxiety,nasal spray - decongestant,,medical then surgical,,,surgery
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,chest pain,sharp chest pain while breating|pain stops on ...,,male,all ages,pleurisy,X-Ray chest,HRCT scan chest,ECG,consult pulmonologist or physician,complete check up till final diagnosis,,
230,chest pain,pain in chest few inches away from midline|pai...,,female,all ages,costochondritis,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,
231,chest pain,pain in chest few inches away from midline|pai...,,male,all ages,costochondritis,X-Ray chest,ECG,,complete check up till final diagnosis,avoid stress and anxiety,local rubefacient,
232,chest pain,dull ache chest wall|pain in chest wall increa...,,female,all ages,chest wall myalgia,X-Ray chest,ECG,,avoid strain on chest wall muscles,symptomatic and supportive,,


In [9]:
def _year_labels(first, last=None):
    """Return the per-year age labels from `first` to `last` (inclusive).

    Year 1 is spelled "1 year", every other year "<n> years".  When `last`
    is omitted the range is open-ended: it runs up to "100 years" and is
    followed by the catch-all label "above 100 years".
    """
    upper = 100 if last is None else last
    labels = ["1 year" if year == 1 else f"{year} years" for year in range(first, upper + 1)]
    if last is None:
        labels.append("above 100 years")
    return labels


# Every age label used in the processed data, youngest first.
all_ages = ["upto 4 weeks", "1-12 months"] + _year_labels(1)

# Age band -> individual age labels it expands to.  Bands are tested in this
# order with a substring match and the first hit wins.
AGE_BANDS = {
    "all ages": all_ages,
    "1-12 years": _year_labels(1, 12),
    "13-18 years": _year_labels(13, 18),
    "19-25 years": _year_labels(19, 25),
    "26-35 years": _year_labels(26, 35),
    "36-50 years": _year_labels(36, 50),
    "51-65 years": _year_labels(51, 65),
    "above 65 years": _year_labels(66),
    "above 60 years": _year_labels(61),
    "above 50 years": _year_labels(51),
}


def _expand_age(age):
    """Return the list of age labels a single `age` cell stands for.

    A string containing a known band name expands to that band's labels;
    any other value (unknown text, or a non-string such as a missing value)
    is returned unchanged as a one-element list.
    """
    if isinstance(age, str):
        for band, labels in AGE_BANDS.items():
            if band in age:
                return labels
    return [age]


# expand 'all ages' and the other age bands to one row per individual age
ages_per_row = [_expand_age(age) for age in data["age"]]
repeats = np.asarray([len(ages) for ages in ages_per_row], dtype=int)
# Repeat each source row once per resulting age label in a single step
# (instead of one pd.concat per output row), then overwrite the age column
# with the flattened labels; the index is renumbered from 0.
processed_data = data.iloc[np.repeat(np.arange(len(data)), repeats)].reset_index(
    drop=True
)
processed_data["age"] = [age for ages in ages_per_row for age in ages]

data = processed_data

data

Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,upto 4 weeks,allergy,antihistamine,nasal spray - steroid,,,,,
1,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,1-12 months,allergy,antihistamine,nasal spray - steroid,,,,,
2,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,1 year,allergy,antihistamine,nasal spray - steroid,,,,,
3,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,2 years,allergy,antihistamine,nasal spray - steroid,,,,,
4,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood IgE,female,3 years,allergy,antihistamine,nasal spray - steroid,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23161,chest pain,dull ache chest wall|pain in chest wall increa...,,male,97 years,chest wall myalgia,X-Ray chest,ECG,,avoid strain on chest wall muscles,symptomatic and supportive,,
23162,chest pain,dull ache chest wall|pain in chest wall increa...,,male,98 years,chest wall myalgia,X-Ray chest,ECG,,avoid strain on chest wall muscles,symptomatic and supportive,,
23163,chest pain,dull ache chest wall|pain in chest wall increa...,,male,99 years,chest wall myalgia,X-Ray chest,ECG,,avoid strain on chest wall muscles,symptomatic and supportive,,
23164,chest pain,dull ache chest wall|pain in chest wall increa...,,male,100 years,chest wall myalgia,X-Ray chest,ECG,,avoid strain on chest wall muscles,symptomatic and supportive,,


In [10]:
# Persist the fully expanded table as a tab-separated text file.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first column.
data.to_csv(path_or_buf="./data/data_processed.txt", sep="\t")