In [12]:
#Goals: get demographics information from BIDMC and MGB. Use complete set to get all demographics. Use BDSPPatientID
#Age
#Mean Age with SD
#Sex M and F. 
#Race Marta had (Black or African American, other, White)
#Ethnicity (Hispanic, unknown, non-hispanic)
#Top text features
#Active SDH as far as annotators
#ICD positive for SDH

import pandas as pd
import numpy as np 
from tqdm import tqdm
from thunderpack import ThunderReader  # install thunderpack if not yet: pip install thunderpack


In [13]:
#Create the demographics csvs for each hospital

# Load the CSV file
notes = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/MGB_BIDMC_Complete_Notes.csv")

# Extract the required columns
columns_of_interest = ['BDSPPatientID', 'ContactDate', 'Site']
notes_filtered = notes[columns_of_interest]

# Separate the DataFrame based on hospitals in the Site column
notes_mgb = notes_filtered[notes_filtered['Site'] == 'MGB']
notes_bidmc = notes_filtered[notes_filtered['Site'] == 'BIDMC']

# Save the separated DataFrames to new CSV files
notes_mgb.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_mgb.csv", index=False)
notes_bidmc.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_bidmc.csv", index=False)

print("CSV files have been created successfully.")



CSV files have been created successfully.


In [14]:
#Verifying the data is still aligned

# Load the original CSV file
original_notes = pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/MGB_BIDMC_Complete_Notes.csv")

# Load the new CSV files
notes_mgb = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_mgb.csv")
notes_bidmc = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_bidmc.csv")

# Extract and align columns
original_notes_filtered = original_notes[['BDSPPatientID', 'ContactDate', 'Site']]
notes_mgb_filtered = notes_mgb[['BDSPPatientID', 'ContactDate', 'Site']]
notes_bidmc_filtered = notes_bidmc[['BDSPPatientID', 'ContactDate', 'Site']]

# Verify if the new DataFrames align with the original DataFrame
def verify_alignment(original_df, new_df, label):
    original_sorted = original_df.sort_values(by=['ContactDate', 'Site']).reset_index(drop=True)
    new_sorted = new_df.sort_values(by=['ContactDate', 'Site']).reset_index(drop=True)
    
    if original_sorted.equals(new_sorted):
        print(f"{label} alignment correct.")
    else:
        print(f"{label} alignment incorrect.")
        # Find rows that are different
        original_diff = original_sorted.compare(new_sorted, result_names=("Original", "New"))
        print("Differences:")
        print(original_diff)
        # Rows in original but not in new
        missing_in_new = original_sorted[~original_sorted.isin(new_sorted).all(axis=1)]
        print("Rows in original but not in new:")
        print(missing_in_new)
        # Rows in new but not in original
        missing_in_original = new_sorted[~new_sorted.isin(original_sorted).all(axis=1)]
        print("Rows in new but not in original:")
        print(missing_in_original)

# Separate the original DataFrame for MGB and BIDMC to compare
original_notes_mgb = original_notes_filtered[original_notes_filtered['Site'] == 'MGB']
original_notes_bidmc = original_notes_filtered[original_notes_filtered['Site'] == 'BIDMC']

# Verify alignment
verify_alignment(original_notes_mgb, notes_mgb_filtered, "MGB")
verify_alignment(original_notes_bidmc, notes_bidmc_filtered, "BIDMC")


MGB alignment correct.
BIDMC alignment correct.


In [15]:
# Read the Thunderpack file for BIDMC, Pull out relevant info, and Calculate AGE
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_demographics_nax_1m_BIDMC')

# Initialize a list to hold the DataFrames
dfs = []

# Loop over all partitions to concatenate DataFrames
for key in reader.keys():
    df = reader[key]
    dfs.append(df)

# Concatenate all DataFrames into one
df_demograph_BI = pd.concat(dfs, axis=0, ignore_index=True)

# Keep relevant columns and remove duplicates
df_demograph_BI = df_demograph_BI[['BDSPPatientID', 'PatientRace', 'EthnicGroupDSC', 'SexDSC']].drop_duplicates()

# Load the notes_bidmc DataFrame
notes_bidmc = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_bidmc.csv")

# Extract relevant columns
notes_bidmc = notes_bidmc[['BDSPPatientID', 'ContactDate', 'Site']]

# Verify that all IDs in notes_bidmc are present in df_demograph_BI
missing_ids = notes_bidmc[~notes_bidmc['BDSPPatientID'].isin(df_demograph_BI['BDSPPatientID'])]
if not missing_ids.empty:
    print("Warning: Some BDSPPatientID values in notes_bidmc do not have corresponding entries in df_demograph_BI:")
    print(missing_ids[['BDSPPatientID']])

# Filter df_demograph_BI to include only those IDs present in notes_bidmc
filtered_demograph = df_demograph_BI[df_demograph_BI['BDSPPatientID'].isin(notes_bidmc['BDSPPatientID'])]

# Merge the filtered demographic DataFrame with notes_bidmc
merged_df = pd.merge(notes_bidmc, filtered_demograph, on='BDSPPatientID', how='left')

# Load the birth dates CSV file
birth_dates = pd.read_csv('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/bidmc_patient_demographics_birth_and_death_dates.csv')

# Extract relevant columns
birth_dates = birth_dates[['BDSPPatientID', 'DateOfBirth']]

# Verify that all IDs in birth_dates are present in merged_df
missing_birth_dates = birth_dates[~birth_dates['BDSPPatientID'].isin(merged_df['BDSPPatientID'])]
if not missing_birth_dates.empty:
    print("Warning: Some BDSPPatientID values in birth_dates do not have corresponding entries in merged_df:")
    print(missing_birth_dates[['BDSPPatientID']])

# Merge the birth dates DataFrame with merged_df
final_merged_df = pd.merge(merged_df, birth_dates, on='BDSPPatientID', how='left')

# Check for BDSPPatientID with missing DateOfBirth after the merge
missing_date_of_birth = final_merged_df[final_merged_df['DateOfBirth'].isna()]
if not missing_date_of_birth.empty:
    print("Warning: Some BDSPPatientID values in final_merged_df are missing DateOfBirth:")
    print(missing_date_of_birth[['BDSPPatientID']])

# Convert ContactDate and DateOfBirth to datetime objects
final_merged_df['ContactDate'] = pd.to_datetime(final_merged_df['ContactDate'], errors='coerce')
final_merged_df['DateOfBirth'] = pd.to_datetime(final_merged_df['DateOfBirth'], errors='coerce')

# Calculate Age as the difference between ContactDate and DateOfBirth
final_merged_df['Age'] = (final_merged_df['ContactDate'] - final_merged_df['DateOfBirth']).dt.days // 365

# Save the final merged DataFrame with age calculation to a new CSV file
final_merged_df.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_bidmc_with_demographics_and_birth_dates_and_age.csv", index=False)

print("Final merged DataFrame with demographics, birth dates, and age calculation saved successfully.")


         BDSPPatientID
0            150000000
1            150059448
2            150000001
3            150060112
4            150060113
...                ...
1400253      151400253
1400254      151400254
1400255      151400255
1400256      151400256
1400257      151400257

[1398758 rows x 1 columns]
Final merged DataFrame with demographics, birth dates, and age calculation saved successfully.


In [16]:
#Printing values that are missing to recover later 
# Read the CSV file into a DataFrame
df = pd.read_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_bidmc_with_demographics_and_birth_dates_and_age.csv')

# Identify rows with missing values
missing_values = df.isna().any(axis=1)

# Extract BDSPPatientID where there are missing values
patient_ids_with_missing_values = df.loc[missing_values, 'BDSPPatientID']

# Convert to list and count the number of patient IDs
patient_ids_list = patient_ids_with_missing_values.tolist()
count_patient_ids = len(patient_ids_list)

# Print the BDSPPatientID values and the count
print("Patient IDs with missing values:", patient_ids_list)
print("Total count of patient IDs with missing values:", count_patient_ids)


Patient IDs with missing values: [150011240, 151156613, 151133857, 150676236, 150010619, 150722344, 151126349, 151153026, 151119406, 151065320, 151124644, 151121293, 151120647, 150751070, 150011561, 150654988, 150670494, 150921653, 150011241, 151152749, 150009013, 151150423, 150010846, 151062252, 151130741, 150099308, 150005606, 151035318, 151036497, 151156430, 151154144, 151131036, 151146593, 151080094, 151136386, 151130102, 150595967, 150816792, 151133268, 151116152, 150785930, 150011218, 150008749, 150981926, 151101091, 150754899, 150393824, 150946700, 150947713, 151125701, 150534531, 150772328, 150011731, 150595603, 151123111, 151155621, 150570234, 150093117, 150607610, 151152805, 151111413, 150138965, 151131568, 150977822, 150650421, 151153673, 151128399, 151151212, 151082531, 151016742, 150863809, 151149625, 151135013]
Total count of patient IDs with missing values: 73


In [17]:
# Read the Thunderpack file for MGB, Pull out relevant info, and Calculate AGE
# Read the Thunderpack file for MGB
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_demographics_MGB')

# Initialize a list to hold the DataFrames
dfs = []

# Loop over all partitions to concatenate DataFrames
for key in reader.keys():
    df = reader[key]
    if isinstance(df, pd.DataFrame):
        dfs.append(df)
    else:
        print(f"Warning: Object for key '{key}' is not a DataFrame, it is a {type(df)}.")

# Check if dfs is empty
if not dfs:
    raise ValueError("No DataFrames found. Please check the ThunderReader output.")

# Concatenate all DataFrames into one
df_demograph_MGB = pd.concat(dfs, axis=0, ignore_index=True)

# Keep relevant columns and remove duplicates
df_demograph_MGB = df_demograph_MGB[['BDSPPatientID', 'PatientRace', 'EthnicGroupDSC', 'SexDSC', 'DateOfBirth']].drop_duplicates()

# Load the notes_mgb DataFrame
notes_mgb = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/ID_and_Notes_mgb.csv")  # Adjust path if needed

# Extract relevant columns
notes_mgb = notes_mgb[['BDSPPatientID', 'ContactDate', 'Site']]

# Verify that all IDs in notes_mgb are present in df_demograph_MGB
missing_ids = notes_mgb[~notes_mgb['BDSPPatientID'].isin(df_demograph_MGB['BDSPPatientID'])]
if not missing_ids.empty:
    print("Warning: Some BDSPPatientID values in notes_mgb do not have corresponding entries in df_demograph_MGB:")
    print(missing_ids[['BDSPPatientID']])

# Filter df_demograph_MGB to include only those IDs present in notes_mgb
filtered_demograph = df_demograph_MGB[df_demograph_MGB['BDSPPatientID'].isin(notes_mgb['BDSPPatientID'])]

# Merge the filtered demographic DataFrame with notes_mgb
merged_df = pd.merge(notes_mgb, filtered_demograph, on='BDSPPatientID', how='left')

# Check for BDSPPatientID with missing DateOfBirth in merged_df
missing_date_of_birth = merged_df[merged_df['DateOfBirth'].isna()]
if not missing_date_of_birth.empty:
    print("Warning: Some BDSPPatientID values in merged_df are missing DateOfBirth:")
    print(missing_date_of_birth[['BDSPPatientID']])

# Convert ContactDate and DateOfBirth to datetime objects
merged_df['ContactDate'] = pd.to_datetime(merged_df['ContactDate'], errors='coerce')
merged_df['DateOfBirth'] = pd.to_datetime(merged_df['DateOfBirth'], errors='coerce')

# Calculate Age as the difference between ContactDate and DateOfBirth
merged_df['Age'] = (merged_df['ContactDate'] - merged_df['DateOfBirth']).dt.days // 365

# Check for BDSPPatientID with missing Age after the calculation
missing_age = merged_df[merged_df['Age'].isna()]
if not missing_age.empty:
    print("Warning: Some BDSPPatientID values in merged_df have missing Age after calculation:")
    print(missing_age[['BDSPPatientID']])

# Save the final merged DataFrame with age calculation to a new CSV file
final_file_path = "/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_mgb_with_demographics_and_age.csv"
merged_df.to_csv(final_file_path, index=False)

print(f"Final merged DataFrame with demographics and age calculation saved successfully to {final_file_path}.")


Final merged DataFrame with demographics and age calculation saved successfully to /home/gregory178/Desktop/NAX project/FM_Draft_15/notes_mgb_with_demographics_and_age.csv.


In [18]:

# Read the CSV files into DataFrames
bidmc_df = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_bidmc_with_demographics_and_birth_dates_and_age.csv")
mgb_df = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_mgb_with_demographics_and_age.csv")

# Concatenate the DataFrames row-wise (default behavior)
combined_df = pd.concat([bidmc_df, mgb_df], ignore_index=True)

# Save the combined DataFrame to a new CSV file without the index
combined_df.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/combined_data.csv", index=False)


In [19]:

# Load the CSV file into a DataFrame
df = pd.read_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/combined_data.csv')

# Remove rows where 'BDSPPatientID' is 122501769
df = df[df['BDSPPatientID'] != 122501769]

# Save the DataFrame back to a CSV file
df.to_csv('/home/gregory178/Desktop/NAX project/FM_Draft_15/combined_data.csv', index=False)


In [20]:
#Print out demographic information for Both Hospitals

combined_df = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/combined_data.csv")
# Extract the 'age' column
age = combined_df['Age'].dropna()  # Drop any missing values

# Calculate mean age
mean_age = age.mean()

# Calculate standard deviation
std_dev_age = age.std()

# Calculate min and max
min_age = age.min()
max_age = age.max()

# Calculate IQR (Interquartile Range)
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1

# Print the results
print(f"Mean Age Both Hospitals: {mean_age}")
print(f"Standard Deviation of Age Both Hospitals: {std_dev_age}")
print(f"Minimum Age Both Hospitals: {min_age}")
print(f"Maximum Age Both Hospitals: {max_age}")
print(f"Interquartile Range (IQR) of Age Both Hospitals: {IQR}")
print(f"Quartile 1 of age Both Hospitals: {Q1}")
print(f"Quartile 3 of age Both Hospitals: {Q3}")


# Analyze 'SexDSC' column
sex_counts = combined_df['SexDSC'].value_counts()  # Counts
sex_percentages = combined_df['SexDSC'].value_counts(normalize=True) * 100  # Percentages
print("\nCount and Percentage of Male and Female:")
for sex in sex_counts.index:
    count = sex_counts[sex]
    percent = sex_percentages[sex]
    print(f"{sex}: {count} ({percent:.2f}%)")

# # Define the updated mapping for 'PatientRace'
def map_race(race):
    if pd.isna(race):  # Handle NaN values
        return "UNKNOWN/NOT SPECIFIED"
    race = race.strip()  # Remove any leading or trailing spaces
    if race in ["WHITE","White"]:
        return "WHITE"
    elif race in ["BLACK/AFRICAN AMERICAN","Black or African American"]:
        return "BLACK/AFRICAN AMERICAN"
    elif race in ["ASIAN","Asian"]:
        return "ASIAN"
    elif race in ["NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER", "Native Hawaiian or Other Pacific Islander"]:
        return "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER"
    elif race in ["AMERICAN INDIAN/ALASKA NATIVE", "American Indian or Alaska Native"]:
        return "AMERICAN INDIAN/ALASKA NATIVE"
    elif race in ["DECLINED TO ANSWER","Declined"]:
        return "UNKNOWN/NOT SPECIFIED"
    else:
        return "OTHER"


# Apply the mapping to the 'PatientRace' column
combined_df['PatientRace'] = combined_df['PatientRace'].map(map_race)

# Recalculate and print counts and percentages for the modified 'PatientRace' column
race_counts = combined_df['PatientRace'].value_counts()  # Counts
race_percentages = combined_df['PatientRace'].value_counts(normalize=True) * 100  # Percentages
print("\nCount and Percentage of Each Race:")
for race in race_counts.index:
    count = race_counts[race]
    percent = race_percentages[race]
    print(f"{race}: {count} ({percent:.2f}%)")

# Define the mapping for 'EthnicGroupDSC'
def map_ethnicity(ethnicity):
    if pd.isna(ethnicity):  # Handle NaN values
        return "Unknown"
    ethnicity = ethnicity.strip()  # Remove any leading or trailing spaces
    if ethnicity in ["OTHER ETHNICITY", "OTHER", "PREFER NOT TO SAY", "UNKNOWN/NOT SPECIFIED","UNABLE TO OBTAIN","DECLINED TO ANSWER", 
                     "LATIN AMERICAN", "SOUTH AMERICAN (NOT OTHERWISE SPECIFIED)","Unavailable", "Prefer not to say/Decline"]:
        return "Unknown"
    elif ethnicity in ["SALVADORIAN", "DOMINICAN", "PUERTO RICAN", "GUATEMALAN", "MEXICAN", "COLUMBIAN", "SPANISH", "ARGENTINIAN",
                       "ARGENTINE", "PERUVIAN", "VENEZUELAN", "CHILEAN", "ECUADORIAN", "COSTA RICAN", "CUBAN", "BOLIVIAN", "HONDURAN",
                       "PARAGUAYAN", "NICARAGUAN", "PANAMANIAN", "URUGUAYAN", "EQUATORIAL GUINEAN","MEXICAN, MEXICAN AMERICAN, CHICANO",
                        "SALVADORAN", "COLOMBIAN", "CARIBBEAN ISLAND", "Hispanic" ]:
        return "Hispanic"
    else:
        return "Not Hispanic"
# Apply the mapping to the 'EthnicGroupDSC' column
combined_df['EthnicGroupDSC'] = combined_df['EthnicGroupDSC'].map(map_ethnicity)

# Recalculate and print counts and percentages for the modified 'EthnicGroupDSC' column
ethnicity_counts = combined_df['EthnicGroupDSC'].value_counts()  # Counts
ethnicity_percentages = combined_df['EthnicGroupDSC'].value_counts(normalize=True) * 100  # Percentages
print("\nCount and Percentage of Each Ethnicity:")
for ethnicity in ethnicity_counts.index:
    count = ethnicity_counts[ethnicity]
    percent = ethnicity_percentages[ethnicity]
    print(f"{ethnicity}: {count} ({percent:.2f}%)")

outliers = combined_df[(combined_df['Age'] > 120)]
print("\nBDSPPatientIDs, Ages, ContactDates, and DateOfBirths for patients with age > 100 or < 5:")
for _, row in outliers.iterrows():
    print(f"BDSPPatientID: {row['BDSPPatientID']}, Age: {row['Age']}, ContactDate: {row['ContactDate']}, DateOfBirth: {row['DateOfBirth']}")


Mean Age Both Hospitals: 63.345781927309105
Standard Deviation of Age Both Hospitals: 19.715288562835475
Minimum Age Both Hospitals: 0
Maximum Age Both Hospitals: 102
Interquartile Range (IQR) of Age Both Hospitals: 25.0
Quartile 1 of age Both Hospitals: 53.0
Quartile 3 of age Both Hospitals: 78.0

Count and Percentage of Male and Female:
Male: 1619 (53.98%)
Female: 1380 (46.02%)

Count and Percentage of Each Race:
WHITE: 2189 (72.99%)
BLACK/AFRICAN AMERICAN: 297 (9.90%)
OTHER: 294 (9.80%)
ASIAN: 108 (3.60%)
UNKNOWN/NOT SPECIFIED: 104 (3.47%)
AMERICAN INDIAN/ALASKA NATIVE: 5 (0.17%)
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER: 2 (0.07%)

Count and Percentage of Each Ethnicity:
Not Hispanic: 2400 (80.03%)
Unknown: 429 (14.30%)
Hispanic: 170 (5.67%)

BDSPPatientIDs, Ages, ContactDates, and DateOfBirths for patients with age > 100 or < 5:


In [21]:
# import pandas as pd

# # Load the data
# mgb_df = pd.read_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_mgb_with_demographics_and_age.csv")

# # Extract the 'age' column
# age = mgb_df['Age'].dropna()  # Drop any missing values

# # Calculate mean age
# mean_age = age.mean()

# # Calculate standard deviation
# std_dev_age = age.std()

# # Calculate min and max
# min_age = age.min()
# max_age = age.max()

# # Calculate IQR (Interquartile Range)
# Q1 = age.quantile(0.25)
# Q3 = age.quantile(0.75)
# IQR = Q3 - Q1

# # Print the results
# print(f"Mean Age MGB: {mean_age}")
# print(f"Standard Deviation of Age MGB: {std_dev_age}")
# print(f"Minimum Age MGB: {min_age}")
# print(f"Maximum Age MGB: {max_age}")
# print(f"Interquartile Range (IQR) of Age MGB: {IQR}")
# print(f"Quartile 1 of age MGB: {Q1}")
# print(f"Quartile 3 of age MGB: {Q3}")

# # Analyze 'SexDSC' column
# sex_counts = mgb_df['SexDSC'].value_counts()  # Counts
# sex_percentages = mgb_df['SexDSC'].value_counts(normalize=True) * 100  # Percentages
# print("\nCount and Percentage of Male and Female:")
# for sex in sex_counts.index:
#     count = sex_counts[sex]
#     percent = sex_percentages[sex]
#     print(f"{sex}: {count} ({percent:.2f}%)")

# # Define the updated mapping for 'PatientRace'
# def map_race(race):
#     if pd.isna(race):  # Handle NaN values
#         return "UNKNOWN/NOT SPECIFIED"
#     race = race.strip()  # Remove any leading or trailing spaces
#     if race in ["White"]:
#         return "WHITE"
#     elif race in ["Black or African American"]:
#         return "BLACK/AFRICAN AMERICAN"
#     elif race in ["Asian"]:
#         return "ASIAN"
#     elif race in ["American Indian or Alaska Native"]:
#         return "AMERICAN INDIAN/ALASKA NATIVE"
#     elif race in ["Declined"]:
#         return "DECLINED TO ANSWER"
#     else:
#         return "OTHER"

# # Apply the mapping to the 'PatientRace' column
# mgb_df['PatientRace'] = mgb_df['PatientRace'].map(map_race)

# # Recalculate and print counts and percentages for the modified 'PatientRace' column
# race_counts = mgb_df['PatientRace'].value_counts()  # Counts
# race_percentages = mgb_df['PatientRace'].value_counts(normalize=True) * 100  # Percentages
# print("\nCount and Percentage of Each Race:")
# for race in race_counts.index:
#     count = race_counts[race]
#     percent = race_percentages[race]
#     print(f"{race}: {count} ({percent:.2f}%)")

# # Define the mapping for 'EthnicGroupDSC'
# def map_ethnicity(ethnicity):
#     if pd.isna(ethnicity):  # Handle NaN values
#         return "Unknown"
#     ethnicity = ethnicity.strip()  # Remove any leading or trailing spaces
#     if ethnicity in ["Unavailable", "Prefer not to say/Decline"]:
#         return "Unknown"
#     elif ethnicity in ["Hispanic"]:
#         return "Hispanic"
#     else:
#         return "Not Hispanic"

# # Apply the mapping to the 'EthnicGroupDSC' column
# mgb_df['EthnicGroupDSC'] = mgb_df['EthnicGroupDSC'].map(map_ethnicity)

# # Recalculate and print counts and percentages for the modified 'EthnicGroupDSC' column
# ethnicity_counts = mgb_df['EthnicGroupDSC'].value_counts()  # Counts
# ethnicity_percentages = mgb_df['EthnicGroupDSC'].value_counts(normalize=True) * 100  # Percentages
# print("\nCount and Percentage of Each Ethnicity:")
# for ethnicity in ethnicity_counts.index:
#     count = ethnicity_counts[ethnicity]
#     percent = ethnicity_percentages[ethnicity]
#     print(f"{ethnicity}: {count} ({percent:.2f}%)")

# # Filter and print BDSPPatientID, Age, ContactDate, and DateOfBirth for patients with age > 100 or < 5
# outliers = mgb_df[(mgb_df['Age'] > 100) | (mgb_df['Age'] < 5)]
# print("\nBDSPPatientIDs, Ages, ContactDates, and DateOfBirths for patients with age > 100 or < 5:")
# for _, row in outliers.iterrows():
#     print(f"BDSPPatientID: {row['BDSPPatientID']}, Age: {row['Age']}, ContactDate: {row['ContactDate']}, DateOfBirth: {row['DateOfBirth']}")

# # Save the outliers to a new CSV file
# outliers.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/notes_mgb_outliers.csv", index=False)
# print("\nOutliers have been saved to 'notes_mgb_outliers.csv'")


In [22]:
#122501769 elderly patient who died

In [23]:
#Next steps, sort columns into common groups and fill in table. 
#Fill in missing info and merge