# Further Analysis and Clean Up of Wiki Death Cause Data

In [26]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [27]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import spacy
from spacy.lang.en import English

In [28]:
# Load Spacy English language model
nlp = spacy.load("en_core_web_sm")

## Perform Categorization

In [29]:
def remove_phrases(text, phrases_to_remove):
    # Create a temporary variable with the lowercase text
    temp_text = text.lower()
    
    # Perform the replacement on the lowercase text
    for phrase in phrases_to_remove:
        temp_text = temp_text.replace(phrase.lower(), '')

    # Now, reconstruct the original text with phrases removed
    result_text = ""
    start = 0
    for i in range(len(text)):
        if text[i].lower() != temp_text[start:start+1]:
            continue
        else:
            result_text += text[i]
            start += 1
    
    return result_text


In [30]:
# Function to process text with Spacy, remove stop words and unwanted phrases
def spacy_process(text, phrases_to_remove):
    if not isinstance(text, str):
        return []
    text = remove_phrases(text, phrases_to_remove)
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

In [31]:
def categorize_cause_spacy(cause, key_terms, term_mappings):
    if not isinstance(cause, str):  # Check if cause is a string
        return None  # Return "unknown" for non-string inputs (like NaN)

    # First check specific terms in the mappings
    for term, category in term_mappings.items():
        if term in cause:
            return category

    # Then check general terms
    for term in key_terms:
        if term in cause:
            return term

    return "other"

In [32]:


# Function to check and replace strings containing any no_info_terms
def remove_no_info_terms(text):
    
    
    # Standardize No Information Entries and Normalize Text
    no_info_terms = ["health care",
                     "healthcare",
                     "health coverage",
                     "no relevant section found",
                     "unknown",
                     "there is no",
                     "it is not possible to determine",
                     "any specific health issue",
                     "significant health issue",
                     "care act",
                     "obamacare",
                     "aca",
                     "medicare", 
                     "medicaid",
                     "insurance",
                     "sorry provided text contain information"
                     ]
    
    # Combine all terms into a single regex pattern
    # Join the terms with '|', which acts as an 'OR' operator in regex
    regex_pattern = '|'.join(map(re.escape, no_info_terms))
    
    if isinstance(text, str) and re.search(regex_pattern, text, re.IGNORECASE):
        return np.nan
    return text

In [33]:
def clean_and_categorize_data_spacy(file_path):
    # Load the CSV file
    data = pd.read_csv(file_path)

     
    # data['Health_Issues'] = data['Health_Issues'].str.lower().apply(
    #     lambda x: np.nan if isinstance(x, str) and any(term in x for term in no_info_terms) else x)
    
    # Apply the function to the 'Health_Issues' column
    data['Health_Issues'] = data['Health_Issues'].str.lower().apply(remove_no_info_terms)

    # List of phrases to remove
    phrases_to_remove = ["cause of death: ", "death cause: ", "complications from ", 
                            "the cause of death was ",
                            "indications significant health issue mentioned ",
                            "possible determine specific health issue ",
                            "The most significant health issue mentioned in the text is ",
                            "The most significant health issue found in the text is ",
                            "The most significant health issue in the text is ",
                            "There are multiple health issues mentioned in the text, but the most significant one is "
                            ]

    # Process text and remove unwanted phrases
    data['Health_Issues'] = data['Health_Issues'].str.lower().apply(lambda x: ' '.join(spacy_process(x, phrases_to_remove)) if isinstance(x, str) else x)

    # Key terms and their mappings
    key_terms = ['cancer', 'heart', 'stroke', 'accident', 'suicide', 'murder',
                 'organ failure', 'pneumonia', 'respiratory', 'natural causes',
                 'tumor', 'diabetes', 'brain', 'poisoning', 
                 'liver', 'illness', 'als', 'kidney', 'assassination', 'tuberculosis', 
                 'overdose', 'alzheimer', 'parkinson', 'drowning', 'covid-19', 'aids', 
                 'bronchitis', 'surgery', 'fever', 'infection', 'blood', 'hemorrhage', 'asphyxiation',
                 'concussions', 'pacemaker', 'tinnitus', 'obesity', 'alcoholism' 'anorexia',
                 'opioid', 'lead contamination', 'hepatitis', 'seizures', 'aneurysm', 'pre existing conditions',
                 'reproductive', 'drug addiction', 'tobacco']
    
    term_mappings = {'injury': 'accident', 'cardiac arrest': 'heart', 'blunt trauma': 'accident', 'leukemia': 'cancer',
                     'cardiac':'heart', 'myeloma': 'cancer', 'cerebral': 'brain', 'gunshot': 'murder', 'hanged':'suicide',
                     'lymphoma': 'cancer', 'shot':'murder', 'mesothelioma':'cancer', 'stab': 'murder',
                     'cirrhosis':'liver', 'crash':'accident', 'collision':'accident', 'dementia':'alzheimer',
                     'fall':'accident', 'hanging':'suicide', 'cardiovascular':'heart', 'knife wound':'murder',
                     'unspecified': 'unknown', 'emphysema':'respiratory', 'aortic':'heart', 'thrombosis':'blood',
                     'coronary':'heart', 'lung':'respiratory', 'pulmonary': 'respiratory', 'coronavirus':'covid-19',
                     'abortion':'reproductive', 'diabetic': 'diabetes', 'eating disorders': 'anorexia',
                     'leukaemia': 'cancer', 'smoking': 'tobacco', 'pre existing medical conditions': 'pre existing conditions'}

    # Categorize the causes of death
    data['Categorized_Cause'] = data['Health_Issues'].apply(lambda x: categorize_cause_spacy(x, key_terms, term_mappings))

    return data

In [34]:

file_path = '../04_nndb_wiki_alive_lookup/wiki_alive_output.csv'
cleaned_data_spacy = clean_and_categorize_data_spacy(file_path)


### Write the File to Disk for the Seed

In [35]:
# Get rid of this one bad value i can't fix
index_to_drop = cleaned_data_spacy[cleaned_data_spacy['ID'] == 'b2332e48a1298b58fd1bf6ab1ca5d12b'].index
# Dropping the row by its index
cleaned_data_spacy = cleaned_data_spacy.drop(index_to_drop)

# Get rid of all rows with null values for better SQL join
df_seed = cleaned_data_spacy.dropna(subset=['Health_Issues'])

# Presist this analysis to a CSV
df_seed.to_csv('seed_nndb_alive_causes.csv', index=False)

In [36]:
cleaned_data_spacy.dropna(inplace=True)
filtered_unknown = cleaned_data_spacy[cleaned_data_spacy['Health_Issues'] != 'unknown']
filtered_data = filtered_unknown[filtered_unknown['Categorized_Cause'] == 'other']
print(filtered_data[['Health_Issues', 'Categorized_Cause']].sample(25))

                           Health_Issues Categorized_Cause
15446                              fraud             other
7405                     spinal stenosis             other
17743                      pre eclampsia             other
16785                access medical care             other
19525                               lyme             other
2487                      herniated disc             other
5401                 acid reflux disease             other
17402                       overcrowding             other
3441   obsessive compulsive disorder ocd             other
7318                               fraud             other
18894                            bulimia             other
314                    asperger syndrome             other
11880                          uninsured             other
16656                             sepsis             other
5999                                 hiv             other
12581           traumatic encephalopathy             oth

In [37]:
cleaned_data_spacy['Categorized_Cause'].value_counts()

Categorized_Cause
other                      173
cancer                      84
heart                       20
covid-19                    19
stroke                      16
reproductive                13
diabetes                    12
alzheimer                    7
respiratory                  7
obesity                      7
hepatitis                    6
pre existing conditions      5
kidney                       4
accident                     4
tobacco                      4
tumor                        4
pneumonia                    4
blood                        4
aids                         3
parkinson                    3
opioid                       3
aneurysm                     2
liver                        2
tinnitus                     2
seizures                     2
drug addiction               2
brain                        2
pacemaker                    2
illness                      2
concussions                  1
infection                    1
als                  

In [38]:
cleaned_data_spacy['Health_Issues'].value_counts()

Health_Issues
cancer                     26
breast cancer              22
covid-19                   17
prostate cancer            15
stroke                     14
                           ..
heart bypass surgery        1
lung issues                 1
kidney failure              1
spinal stenosis             1
guillain barré syndrome     1
Name: count, Length: 231, dtype: int64

## Examine Remaing Word Frequencies

In [39]:
def print_common_words(common_words):
    for word, freq in common_words:
        print(f"Word: '{word}', Frequency: {freq}")

In [40]:

def most_common_words(dataframe, column_name):
    text = ' '.join(dataframe[column_name].dropna())  # Join all text and handle NaN values
    doc = nlp(text)
    
    # Filter tokens that are stop words or punctuations
    words = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]

    # Count the words
    word_freq = Counter(words)

    return word_freq.most_common(30)  # You can adjust the number to get more or less frequent words

# Usage
common_words = most_common_words(filtered_data, 'Health_Issues')
print_common_words(common_words)


Word: 'health', Frequency: 24
Word: 'disorder', Frequency: 10
Word: 'care', Frequency: 8
Word: 'disease', Frequency: 7
Word: 'mental', Frequency: 7
Word: 'sclerosis', Frequency: 6
Word: 'bipolar', Frequency: 5
Word: 'multiple', Frequency: 5
Word: 'syndrome', Frequency: 4
Word: 'injuries', Frequency: 4
Word: 'preexisting', Frequency: 4
Word: 'conditions', Frequency: 4
Word: 'uninsured', Frequency: 4
Word: 'medical', Frequency: 4
Word: 'depression', Frequency: 4
Word: 'ménière', Frequency: 3
Word: 'diseases', Frequency: 3
Word: 'alcoholism', Frequency: 3
Word: 'flu', Frequency: 3
Word: 'issues', Frequency: 3
Word: 'arthritis', Frequency: 3
Word: 'reform', Frequency: 3
Word: 'lyme', Frequency: 3
Word: 'chronic', Frequency: 2
Word: 'issue', Frequency: 2
Word: 'affordable', Frequency: 2
Word: 'neurological', Frequency: 2
Word: 'hip', Frequency: 2
Word: 'nutrition', Frequency: 2
Word: 'covid', Frequency: 2


## Examine Bi-Grams

In [41]:
# Load Spacy English language model
nlp = English()
nlp.add_pipe('sentencizer')  # Add sentencizer to the pipeline

<spacy.pipeline.sentencizer.Sentencizer at 0x14780cf50>

In [42]:
def most_common_bigrams(dataframe, column_name):
    text = ' '.join(dataframe[column_name].dropna())  # Join all text and handle NaN values
    doc = nlp(text)

    # Create bigrams
    bigrams = []
    for sent in doc.sents:
        tokens = [token.text.lower() for token in sent if not token.is_stop and not token.is_punct and not token.is_space]
        bigrams.extend(zip(tokens, tokens[1:]))

    # Count the bigrams
    bigram_freq = Counter(bigrams)

    return bigram_freq.most_common(30)  # Adjust number for more/less frequent bigrams

In [43]:
common_bigrams = most_common_bigrams(filtered_data, 'Health_Issues')
print_common_words(common_bigrams)

Word: '('mental', 'health')', Frequency: 7
Word: '('health', 'care')', Frequency: 6
Word: '('multiple', 'sclerosis')', Frequency: 5
Word: '('bipolar', 'disorder')', Frequency: 4
Word: '('preexisting', 'conditions')', Frequency: 4
Word: '('ménière', 'disease')', Frequency: 3
Word: '('health', 'issue')', Frequency: 2
Word: '('care', 'health')', Frequency: 2
Word: '('health', 'substance')', Frequency: 2
Word: '('health', 'issues')', Frequency: 2
Word: '('health', 'services')', Frequency: 2
Word: '('lyme', 'disease')', Frequency: 2
Word: '('care', 'reform')', Frequency: 2
Word: '('dissociative', 'identity')', Frequency: 1
Word: '('identity', 'disorder')', Frequency: 1
Word: '('disorder', 'health')', Frequency: 1
Word: '('care', 'plan')', Frequency: 1
Word: '('plan', 'bipolar')', Frequency: 1
Word: '('disorder', 'asperger')', Frequency: 1
Word: '('asperger', 'syndrome')', Frequency: 1
Word: '('syndrome', 'attention')', Frequency: 1
Word: '('attention', 'deficit')', Frequency: 1
Word: '('def

In [44]:
filtered_data.head(50)

Unnamed: 0,ID,WIKI_PAGE,Health_Issues,Categorized_Cause
198,8a1df388700fbdc045627bcf92224841,Herschel_Walker,dissociative identity disorder,other
230,45c512fba6f069ce56ed24597c182337,Mark_Warner,health care plan,other
278,c919e3256330ae0ed64678d04f71fb14,Kanye_West,bipolar disorder,other
314,a231439b94ad79a5e36c72ddfec3c1dc,Joe_Walsh,asperger syndrome,other
316,c7d1464b97f5dce0df9b94dbe65be318,Joe_Walsh,attention deficit hyperactivity disorder,other
506,a6625d0ad36f985b889e179369362bb1,Alice_Walton,oncology,other
516,ae1357ab60574861eaf2932a4d52dcb1,Deryck_Whibley,injuries,other
541,115df7279300985b73a3c33faa4d2e40,Bruce_Willis,aphasia,other
655,7bba3fea9896052bcaa0c82e66ccd870,Brian_Wilson,surgeries,other
715,7e4d40e061568a50f3730c88ce6e0336,Dana_White,ménière disease,other


In [45]:
cleaned_data_spacy.head(50)

Unnamed: 0,ID,WIKI_PAGE,Health_Issues,Categorized_Cause
35,ea15d5d35cfbb426abd8b1461417e650,Asif_Ali_Zardari,dementia,alzheimer
134,c2c41864c52ff9f6310e537083725b63,Murray_Waas,breast cancer,cancer
176,2ba6574ad7774a0370532b809bf477f3,Rick_Wakeman,cirrhosis,liver
180,5dfdc5f3317adf707c90d0e89268cf59,Tim_Walberg,abortion,reproductive
198,8a1df388700fbdc045627bcf92224841,Herschel_Walker,dissociative identity disorder,other
230,45c512fba6f069ce56ed24597c182337,Mark_Warner,health care plan,other
278,c919e3256330ae0ed64678d04f71fb14,Kanye_West,bipolar disorder,other
280,5342bcdf4f1007399254cdf9d64ffaa4,Leslie_West,diabetes,diabetes
314,a231439b94ad79a5e36c72ddfec3c1dc,Joe_Walsh,asperger syndrome,other
316,c7d1464b97f5dce0df9b94dbe65be318,Joe_Walsh,attention deficit hyperactivity disorder,other
