# Further Analysis and Clean Up of Wiki Death Cause Data

In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from collections import Counter
import pandas as pd
import spacy
from spacy.lang.en import English

In [3]:
# Load Spacy English language model
nlp = spacy.load("en_core_web_sm")

## Perform Categorization

In [4]:
# Function to remove specified phrases from a string
def remove_phrases(text, phrases_to_remove):
    for phrase in phrases_to_remove:
        text = text.replace(phrase, '')
    return text

In [5]:
# Function to process text with Spacy, remove stop words and unwanted phrases
def spacy_process(text, phrases_to_remove):
    if not isinstance(text, str):
        return []
    text = remove_phrases(text, phrases_to_remove)
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

In [6]:
def categorize_cause_spacy(cause, key_terms, term_mappings):
    if not isinstance(cause, str):  # Check if cause is a string
        return "unknown"  # Return "unknown" for non-string inputs (like NaN)

    # First check specific terms in the mappings
    for term, category in term_mappings.items():
        if term in cause:
            return category

    # Then check general terms
    for term in key_terms:
        if term in cause:
            return term

    return "other"

In [7]:
def clean_and_categorize_data_spacy(file_path):
    # Load the CSV file
    data = pd.read_csv(file_path)

    # Standardize No Information Entries and Normalize Text
    no_info_terms = ["not mentioned", "no relevant section found", "unknown"]
    data['Cause_of_Death'] = data['Cause_of_Death'].str.lower().apply(
        lambda x: 'unknown' if isinstance(x, str) and any(term in x for term in no_info_terms) else x)

    # List of phrases to remove
    phrases_to_remove = ["cause of death: ", "death cause: ", "complications from ", "the cause of death was "]

    # Process text and remove unwanted phrases
    data['Cause_of_Death'] = data['Cause_of_Death'].apply(lambda x: ' '.join(spacy_process(x, phrases_to_remove)) if isinstance(x, str) else x)

    # Key terms and their mappings
    key_terms = ['cancer', 'heart', 'stroke', 'accident', 'suicide', 'murder',
                 'organ failure', 'pneumonia', 'respiratory', 'natural causes',
                 'tumor', 'diabetes', 'brain', 'poisoning', 
                 'liver', 'illness', 'als', 'kidney', 'assassination', 'tuberculosis', 
                 'overdose', 'alzheimer', 'parkinson', 'drowning', 'covid-19', 'aids', 
                 'bronchitis', 'surgery', 'fever', 'infection', 'blood', 'hemorrhage', 'asphyxiation']
    
    term_mappings = {'injury': 'accident', 'cardiac arrest': 'heart', 'blunt trauma': 'accident', 'leukemia': 'cancer',
                     'cardiac':'heart', 'myeloma': 'cancer', 'cerebral': 'brain', 'gunshot': 'murder', 'hanged':'suicide',
                     'lymphoma': 'cancer', 'shot':'murder', 'mesothelioma':'cancer', 'stab': 'murder',
                     'cirrhosis':'liver', 'crash':'accident', 'collision':'accident', 'dementia':'alzheimer',
                     'fall':'accident', 'hanging':'suicide', 'cardiovascular':'heart', 'knife wound':'murder',
                     'unspecified': 'unknown', 'emphysema':'respiratory', 'aortic':'heart', 'thrombosis':'blood',
                     'coronary':'heart', 'lung':'respiratory', 'pulmonary': 'respiratory'}

    # Categorize the causes of death
    data['Categorized_Cause'] = data['Cause_of_Death'].apply(lambda x: categorize_cause_spacy(x, key_terms, term_mappings))

    return data

In [8]:

file_path = '../nndb_wiki_death_lookup/wiki_died_output.csv'
cleaned_data_spacy = clean_and_categorize_data_spacy(file_path)


In [18]:
# Presist this analysis to a CSV
cleaned_data_spacy.to_csv('seed_nndb_death_causes.csv', index=False)

In [9]:
filtered_unknown = cleaned_data_spacy[cleaned_data_spacy['Cause_of_Death'] != 'unknown']
filtered_data = filtered_unknown[filtered_unknown['Categorized_Cause'] == 'other']
print(filtered_data[['Cause_of_Death', 'Categorized_Cause']].sample(25))

                                          Cause_of_Death Categorized_Cause
2237                                    inhalation gases             other
13842                                      renal failure             other
2300   insufficient information provided determine ca...             other
14526                          cause death provided text             other
15988                    epileptic seizure complications             other
1423                                           specified             other
15767  hemorrhoidal condition leading bladder retenti...             other
2523                                    synovial sarcoma             other
7363   streptococcal toxic shock syndrome caused stre...             other
15199                 cause edwin land death undisclosed             other
11569                              myocardial infarction             other
7257   statement context provide information cause death             other
13584             cause d

In [10]:
cleaned_data_spacy['Categorized_Cause'].value_counts()

Categorized_Cause
other             10986
heart              1303
cancer             1055
respiratory         437
natural causes      319
stroke              312
pneumonia           285
accident            254
murder              212
suicide             165
brain               101
alzheimer            77
illness              75
kidney               72
liver                57
overdose             50
tuberculosis         48
assassination        44
unknown              41
parkinson            40
diabetes             37
tumor                36
blood                35
surgery              34
fever                32
poisoning            30
aids                 28
infection            26
als                  16
hemorrhage           16
drowning             14
asphyxiation         12
bronchitis           12
organ failure        10
covid-19             10
Name: count, dtype: int64

## Examine Remaing Word Frequencies

In [11]:
def print_common_words(common_words):
    for word, freq in common_words:
        print(f"Word: '{word}', Frequency: {freq}")

In [12]:

def most_common_words(dataframe, column_name):
    text = ' '.join(dataframe[column_name].dropna())  # Join all text and handle NaN values
    doc = nlp(text)
    
    # Filter tokens that are stop words or punctuations
    words = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]

    # Count the words
    word_freq = Counter(words)

    return word_freq.most_common(20)  # You can adjust the number to get more or less frequent words

# Usage
common_words = most_common_words(filtered_data, 'Cause_of_Death')
print_common_words(common_words)


Word: 'death', Frequency: 293
Word: 'cause', Frequency: 268
Word: 'provided', Frequency: 195
Word: 'information', Frequency: 164
Word: 'given', Frequency: 121
Word: 'specified', Frequency: 41
Word: 'disease', Frequency: 30
Word: 'text', Frequency: 28
Word: 'acute', Frequency: 24
Word: 'complications', Frequency: 23
Word: 'undisclosed', Frequency: 22
Word: 'died', Frequency: 21
Word: 'mentioned', Frequency: 18
Word: 'age', Frequency: 15
Word: 'causes', Frequency: 15
Word: 'mention', Frequency: 14
Word: 'multiple', Frequency: 11
Word: 'health', Frequency: 11
Word: 'attack', Frequency: 11
Word: 'intoxication', Frequency: 11


## Examine Bi-Grams

In [13]:
# Load Spacy English language model
nlp = English()
nlp.add_pipe('sentencizer')  # Add sentencizer to the pipeline

<spacy.pipeline.sentencizer.Sentencizer at 0x158eef190>

In [14]:
def most_common_bigrams(dataframe, column_name):
    text = ' '.join(dataframe[column_name].dropna())  # Join all text and handle NaN values
    doc = nlp(text)

    # Create bigrams
    bigrams = []
    for sent in doc.sents:
        tokens = [token.text.lower() for token in sent if not token.is_stop and not token.is_punct and not token.is_space]
        bigrams.extend(zip(tokens, tokens[1:]))

    # Count the bigrams
    bigram_freq = Counter(bigrams)

    return bigram_freq.most_common(30)  # Adjust number for more/less frequent bigrams

In [15]:
common_bigrams = most_common_bigrams(filtered_data, 'Cause_of_Death')
print_common_words(common_bigrams)

Word: '('cause', 'death')', Frequency: 249
Word: '('death', 'provided')', Frequency: 117
Word: '('given', 'information')', Frequency: 93
Word: '('provided', 'given')', Frequency: 83
Word: '('information', 'provided')', Frequency: 39
Word: '('provided', 'cause')', Frequency: 27
Word: '('provided', 'information')', Frequency: 24
Word: '('information', 'cause')', Frequency: 23
Word: '('death', 'specified')', Frequency: 21
Word: '('given', 'text')', Frequency: 15
Word: '('undisclosed', 'causes')', Frequency: 14
Word: '('mention', 'cause')', Frequency: 13
Word: '('death', 'mentioned')', Frequency: 12
Word: '('specified', 'given')', Frequency: 12
Word: '('specified', 'provided')', Frequency: 11
Word: '('provided', 'text')', Frequency: 9
Word: '('provided', 'mention')', Frequency: 8
Word: '('death', 'cause')', Frequency: 8
Word: '('information', 'given')', Frequency: 7
Word: '('blunt', 'force')', Frequency: 7
Word: '('death', 'given')', Frequency: 6
Word: '('text', 'cause')', Frequency: 6
Wor