In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/Users/celestinesheum/Desktop/MY-DAC-Curriculum/Heart_cleaned.csv')

In [3]:
# Columns categorization
illness_columns = [
    "HadHeartAttack", "HadAngina", "HadStroke", "HadAsthma", 
    "HadSkinCancer", "HadCOPD", "HadDepressiveDisorder",
    "HadKidneyDisease", "HadArthritis", "HadDiabetes"
]

non_life_threatening_columns = [
    "DeafOrHardOfHearing", "BlindOrVisionDifficulty",
    "DifficultyConcentrating", "DifficultyWalking",
    "DifficultyDressingBathing", "DifficultyErrands"
]

In [4]:
# Function to check for inconsistencies
def find_inconsistencies(data, columns, category):
    inconsistencies = []
    for col in columns:
        mismatch = data.groupby(col)["GeneralHealth"].nunique()
        mismatched_cases = mismatch[mismatch > 1]
        if not mismatched_cases.empty:
            inconsistencies.append((col, mismatched_cases.index.tolist()))
    print(f"Inconsistencies for {category}:")
    if inconsistencies:
        for condition, statuses in inconsistencies:
            print(f" - '{condition}' has different GeneralHealth statuses: {statuses}")
    else:
        print(" - No inconsistencies found.")
    print()

In [6]:
# Check inconsistencies for both categories
print("Checking for inconsistencies in illness columns:")
find_inconsistencies(df, illness_columns, "Illness Columns")

Checking for inconsistencies in illness columns:
Inconsistencies for Illness Columns:
 - 'HadHeartAttack' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadAngina' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadStroke' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadAsthma' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadSkinCancer' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadCOPD' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadDepressiveDisorder' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadKidneyDisease' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadArthritis' has different GeneralHealth statuses: ['no', 'yes']
 - 'HadDiabetes' has different GeneralHealth statuses: ['no', 'no, pre-diabetes or borderline diabetes', 'yes', 'yes, but only during pregnancy (female)']



In [8]:
print("Checking for inconsistencies in non-life-threatening columns:")
find_inconsistencies(df, non_life_threatening_columns, "Non-Life-Threatening Columns")

Checking for inconsistencies in non-life-threatening columns:
Inconsistencies for Non-Life-Threatening Columns:
 - 'DeafOrHardOfHearing' has different GeneralHealth statuses: ['no', 'yes']
 - 'BlindOrVisionDifficulty' has different GeneralHealth statuses: ['no', 'yes']
 - 'DifficultyConcentrating' has different GeneralHealth statuses: ['no', 'yes']
 - 'DifficultyWalking' has different GeneralHealth statuses: ['no', 'yes']
 - 'DifficultyDressingBathing' has different GeneralHealth statuses: ['no', 'yes']
 - 'DifficultyErrands' has different GeneralHealth statuses: ['no', 'yes']



In [10]:
# Define the relevant columns
illness_columns = [
    "HadHeartAttack", "HadAngina", "HadStroke", "HadAsthma", 
    "HadSkinCancer", "HadCOPD", "HadDepressiveDisorder",
    "HadKidneyDisease", "HadArthritis", "HadDiabetes"
]

# Create a subset with relevant columns
subset = df[["GeneralHealth"] + illness_columns]

# Group individuals by their illness combinations and check for mismatched GeneralHealth statuses
subset["IllnessCombination"] = subset[illness_columns].apply(lambda x: tuple(x), axis=1)

# Identify groups with differing GeneralHealth statuses
mismatch_groups = subset.groupby("IllnessCombination")["GeneralHealth"].nunique()
mismatched_combinations = mismatch_groups[mismatch_groups > 1]

# Extract mismatched records
if not mismatched_combinations.empty:
    mismatched_records = subset[subset["IllnessCombination"].isin(mismatched_combinations.index)]
    print("Mismatched cases:")
    print(mismatched_records)
else:
    print("No mismatched cases found.")

Mismatched cases:
       GeneralHealth HadHeartAttack HadAngina HadStroke HadAsthma  \
0          very good             no        no        no        no   
1          excellent             no        no        no        no   
2          very good             no        no        no        no   
3          excellent             no        no        no       yes   
4               fair             no        no        no        no   
...              ...            ...       ...       ...       ...   
444757          good             no        no        no       yes   
444758     excellent             no        no        no        no   
444759          poor             no        no        no        no   
444760     very good            yes        no        no       yes   
444761     very good             no        no        no        no   

       HadSkinCancer HadCOPD HadDepressiveDisorder HadKidneyDisease  \
0                 no      no                    no               no   
1          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["IllnessCombination"] = subset[illness_columns].apply(lambda x: tuple(x), axis=1)


In [13]:
# Count cases where the illness combination is identical but GeneralHealth differs
if not mismatched_combinations.empty:
    mismatched_records = subset[subset["IllnessCombination"].isin(mismatched_combinations.index)]
    num_mismatched_combinations = mismatched_combinations.shape[0]
    num_mismatched_cases = len(mismatched_records)
    
    print(f"Number of mismatched illness combinations: {num_mismatched_combinations}")
    print(f"Number of cases with mismatched GeneralHealth: {num_mismatched_cases}")
    print("Mismatched cases:")
    print(mismatched_records)
else:
    print("No mismatched cases found.")

Number of mismatched illness combinations: 1161
Number of cases with mismatched GeneralHealth: 444399
Mismatched cases:
       GeneralHealth HadHeartAttack HadAngina HadStroke HadAsthma  \
0          very good             no        no        no        no   
1          excellent             no        no        no        no   
2          very good             no        no        no        no   
3          excellent             no        no        no       yes   
4               fair             no        no        no        no   
...              ...            ...       ...       ...       ...   
444757          good             no        no        no       yes   
444758     excellent             no        no        no        no   
444759          poor             no        no        no        no   
444760     very good            yes        no        no       yes   
444761     very good             no        no        no        no   

       HadSkinCancer HadCOPD HadDepressiveDisorder 

In [18]:
num_mismatched_combinations = mismatched_combinations.shape[0]
num_mismatched_cases = len(mismatched_records)
num_mismatched_combinations

1161