In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import spacy

In [None]:
# Load the spaCy English model "en_core_web_sm" once, as a module-level global.
# spacy_process() calls this `nlp` object to tokenize text, so this cell must
# run before any cell that processes data.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Strip a list of unwanted phrases out of a string
def remove_phrases(text, phrases_to_remove):
    """Return ``text`` with every phrase in ``phrases_to_remove`` deleted.

    Phrases are removed one after another, in the order given, using plain
    case-sensitive substring replacement; all occurrences of each phrase are
    removed. Because removal is sequential, deleting one phrase can create
    (or destroy) a match for a later one.

    Args:
        text: The string to clean.
        phrases_to_remove: Iterable of substrings to delete.

    Returns:
        The cleaned string.
    """
    cleaned = text
    for unwanted in phrases_to_remove:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [None]:
# Function to process text with Spacy, remove stop words and unwanted phrases
def spacy_process(text, phrases_to_remove):
    """Tokenize ``text`` with spaCy and return its lower-cased content tokens.

    Args:
        text: Raw text. Anything that is not a ``str`` (e.g. a NaN coming from
            a pandas column) yields an empty list.
        phrases_to_remove: Iterable of substrings deleted from ``text`` before
            tokenization (delegated to ``remove_phrases``).

    Returns:
        list[str]: Lower-cased token texts, excluding stop words, punctuation
        and whitespace-only tokens.
    """
    if not isinstance(text, str):
        return []
    text = remove_phrases(text, phrases_to_remove)
    doc = nlp(text)
    return [
        token.text.lower()
        for token in doc
        # spaCy emits runs of extra whitespace (double spaces, newlines) as
        # separate tokens. They are neither stop words nor punctuation, so
        # they must be filtered explicitly or they leak into the output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

In [None]:
def categorize_cause_spacy(cause, key_terms, term_mappings):
    """Map a cleaned cause-of-death string to a category label.

    Matching is by plain substring containment. Specific terms in
    ``term_mappings`` take precedence over the general ``key_terms``; within
    each collection the first match in iteration order wins.

    Args:
        cause: Cleaned, lower-cased cause-of-death text. Non-string values
            (e.g. NaN/None for missing data) are accepted.
        key_terms: Iterable of general category terms; a matching term is
            returned as the category itself.
        term_mappings: Dict mapping a specific term to the category it
            belongs to.

    Returns:
        str: The matched category, ``"unknown"`` when ``cause`` is not a
        string, or ``"other"`` when nothing matches.
    """
    # Missing values (NaN/None) cannot be searched: `term in cause` would
    # raise TypeError on them, so report them as unknown instead of crashing.
    if not isinstance(cause, str):
        return "unknown"

    # First check specific terms in the mappings
    for term, category in term_mappings.items():
        if term in cause:
            return category

    # Then check general terms
    for term in key_terms:
        if term in cause:
            return term

    return "other"

In [None]:
def clean_and_categorize_data_spacy(file_path):
    """Load a CSV, normalize its ``Cause_of_Death`` column and categorize it.

    Steps:
      1. Lower-case every cause and collapse "no information" entries
         (including missing cells) to the single marker ``'unknown'``.
      2. Remove boilerplate phrases, stop words and punctuation via
         ``spacy_process`` and re-join the remaining tokens with spaces.
      3. Assign a category with ``categorize_cause_spacy`` and store it in a
         new ``Categorized_Cause`` column.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
            The CSV must contain a ``Cause_of_Death`` column.

    Returns:
        pandas.DataFrame: The loaded data with ``Cause_of_Death`` rewritten
        and ``Categorized_Cause`` added.

    Raises:
        KeyError: If the CSV has no ``Cause_of_Death`` column.
    """
    # Load the CSV file
    data = pd.read_csv(file_path)

    # Standardize No Information Entries and Normalize Text
    no_info_terms = ["not mentioned", "no relevant section found", "unknown"]

    def _standardize(value):
        # Empty CSV cells arrive as NaN (a float). Left as-is they crash the
        # substring checks during categorization, and an all-NaN column has
        # no `.str` accessor, so treat every non-string as "no information".
        if not isinstance(value, str):
            return 'unknown'
        lowered = value.lower()
        if any(term in lowered for term in no_info_terms):
            return 'unknown'
        return lowered

    data['Cause_of_Death'] = data['Cause_of_Death'].apply(_standardize)

    # List of phrases to remove
    phrases_to_remove = ["cause of death: ", "death cause: ", "complications from ", "the cause of death was "]

    # Process text and remove unwanted phrases. The 'unknown' marker is passed
    # through untouched so tokenization can never alter or drop it.
    data['Cause_of_Death'] = data['Cause_of_Death'].apply(
        lambda x: x if x == 'unknown' else ' '.join(spacy_process(x, phrases_to_remove)))

    # Key terms and their mappings
    key_terms = ['cancer', 'heart', 'stroke', 'accident', 'suicide', 'murder',
                 'organ failure', 'pneumonia', 'respiratory', 'natural causes',
                 'tumor', 'diabetes', 'pulmonary', 'brain', 'poisoning',
                 'liver', 'illness', 'als', 'kidney', 'assassination', 'tuberculosis', 'overdose', 'alzheimer',
                 'parkinson', 'drowning', 'covid-19']

    # 'unknown' maps to itself so that no-information rows are labelled
    # 'unknown' (like 'unspecified') rather than falling through to 'other'.
    term_mappings = {'injury': 'accident', 'cardiac arrest': 'heart', 'blunt trauma': 'accident', 'leukemia': 'cancer',
                     'cardiac':'heart', 'myeloma': 'cancer', 'cerebral': 'brain', 'gunshot': 'murder', 'hanged':'suicide',
                     'lymphoma': 'cancer', 'shot':'murder', 'mesothelioma':'cancer', 'stab': 'murder',
                     'cirrhosis':'liver', 'crash':'accident', 'collision':'accident', 'dementia':'alzheimer',
                     'fall':'accident', 'hanging':'suicide', 'cardiovascular':'heart', 'knife wound':'murder',
                     'unspecified': 'unknown', 'unknown': 'unknown', 'emphysema':'respiratory'}

    # Categorize the causes of death
    data['Categorized_Cause'] = data['Cause_of_Death'].apply(lambda x: categorize_cause_spacy(x, key_terms, term_mappings))

    return data

In [None]:

# Usage: run the full cleaning + categorization pipeline on the input CSV.
# The CSV must contain a 'Cause_of_Death' column; the result keeps the loaded
# columns and adds 'Categorized_Cause'.
file_path = 'wiki_died_output.csv'  # Replace with your CSV file path
cleaned_data_spacy = clean_and_categorize_data_spacy(file_path)


In [None]:
# Keep rows with a known cause, then isolate those that matched no category so
# they can be reviewed by hand for missing key terms / mappings.
filtered_unknown = cleaned_data_spacy[cleaned_data_spacy['Cause_of_Death'] != 'unknown']
filtered_data = filtered_unknown[filtered_unknown['Categorized_Cause'] == 'other']
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample at the number of available rows. A fixed seed keeps the
# displayed sample identical across re-runs.
sample_size = min(25, len(filtered_data))
print(filtered_data[['Cause_of_Death', 'Categorized_Cause']].sample(sample_size, random_state=42))

In [None]:
# Number of rows per assigned category, over rows whose cause is not 'unknown'
filtered_unknown['Categorized_Cause'].value_counts()