In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import sklearn.preprocessing as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv(r"F:\DSA\Data Sets\IAFD DEAD.csv", encoding='ISO-8859-1')

In [3]:
df.drop('Photo', axis = 1, inplace = True)

In [4]:
df

Unnamed: 0,Name,Cause,Short Note,Date of Death,Long Description
0,A.J. Kahn,Unsure,,2021 December 25,More Info
1,Adam Faust,Medical,Cardiac Arrest,2021 August 2,More Info
2,Adam Wilde,Unsure,,2008 June 4,
3,Adonis Cheeks,Suicide,,2019 November 1,Exact date of death unknown at this time.\nDat...
4,Adrian Hoven,Medical,,1981 April 8,
...,...,...,...,...,...
1676,Zeff Ryan,AIDS,AIDS,1/94,AKA Jeff Ryan
1677,Zenya Lai,Unsure,,2022 December 16,
1678,Zoe Parker,Unsure,,2020 September 12,
1679,Zoraya Mora,Murder,,2019 March 28,


In [5]:
df.isnull().sum()

Name                  0
Cause                 0
Short Note          881
Date of Death         9
Long Description    945
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1681 entries, 0 to 1680
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              1681 non-null   object
 1   Cause             1681 non-null   object
 2   Short Note        800 non-null    object
 3   Date of Death     1672 non-null   object
 4   Long Description  736 non-null    object
dtypes: object(5)
memory usage: 65.8+ KB


In [43]:
df['Cause'].value_counts().sort_values(ascending = False)

Cause
Medical       693
Unsure        440
AIDS          120
Suicide       113
Murder         81
Unknown        79
Accidental     77
Overdose       75
Not Dead        2
unknown         1
Name: count, dtype: int64

In [7]:
df['Cause'].unique()

array(['Unsure', 'Medical', 'Suicide', 'Murder', 'AIDS', 'Unknown',
       'Accidental', 'Overdose', 'unknown', 'Not Dead'], dtype=object)

In [8]:
from datetime import datetime
df['Date of Death'] = pd.to_datetime(df['Date of Death'], format='%Y %B %d', errors='coerce')

In [9]:
df_1995 = df[df['Date of Death'].dt.year >= 1995]
df_1990 = df[df['Date of Death'].dt.year >= 1990]
df_1985 = df[df['Date of Death'].dt.year >= 1985]

In [10]:
df_1995.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1390 entries, 0 to 1680
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              1390 non-null   object        
 1   Cause             1390 non-null   object        
 2   Short Note        650 non-null    object        
 3   Date of Death     1390 non-null   datetime64[ns]
 4   Long Description  628 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 65.2+ KB


In [11]:
df_1990.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1503 entries, 0 to 1680
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              1503 non-null   object        
 1   Cause             1503 non-null   object        
 2   Short Note        717 non-null    object        
 3   Date of Death     1503 non-null   datetime64[ns]
 4   Long Description  670 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 70.5+ KB


In [12]:
df_1985.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1576 entries, 0 to 1680
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              1576 non-null   object        
 1   Cause             1576 non-null   object        
 2   Short Note        753 non-null    object        
 3   Date of Death     1576 non-null   datetime64[ns]
 4   Long Description  708 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 73.9+ KB


In [13]:
df_1995.shape

(1390, 5)

In [14]:
print(df_1990.shape)
print(df_1985.shape)

(1503, 5)
(1576, 5)


In [15]:
def filter_cause_of_death(df):
    causes_to_consider = ['Suicide', 'Murder', 'AIDS', 'Overdose', 'Medical', 'Unsure', 'Accidental']
    keep_rows = df['Cause'].apply(lambda x: any(cause in x for cause in causes_to_consider)) | ~(df['Short Note'].isnull() & df['Long Description'].isnull())
    return df[keep_rows]

df_1995 = filter_cause_of_death(df_1995.copy())
df_1990 = filter_cause_of_death(df_1990.copy())
df_1985 = filter_cause_of_death(df_1985.copy())

In [16]:
df_1995.shape[0]

1332

In [17]:
df_1995 = df_1995.drop_duplicates()

In [20]:
search_terms = ['Aids', 'aids', 'Suicide', 'suicide', 'meth', 'methamphetamine',
                'Methamphetamine', 'gun', 'Gun', 'Self', 'self', 'asphyxiation', 
                'hang', 'abuse', 'drugs', 'attack', 'cocaine', 'Cocaine',
                'car crash', 'partner', 'cancer', 'Cancer', 'OD', 'Overdose',
                'overdose', 'Murder', 'murder', 'murdered', 'boyfriend', 'husband', 'heart attack', 'stroke', 'Stroke']

exclude_terms = ['Suicide', 'Murder', 'AIDS', 'Overdose']

total_occurrence = 0

from collections import Counter

cause_counter = Counter()

for index, row in df_1995.iterrows():
    if any(term in row['Cause'] for term in exclude_terms):
        total_occurrence += 1
        continue

    if isinstance(row['Short Note'], str):
        short_note_matched = any(term.lower() in row['Short Note'].lower() for term in search_terms)
    else:
        short_note_matched = False
        
    if isinstance(row['Long Description'], str):
        long_desc_matched = any(term.lower() in row['Long Description'].lower() for term in search_terms)
    else:
        long_desc_matched = False
    
    if short_note_matched or long_desc_matched:
        cause_counter[row['Cause']] += 1

leading_cause = cause_counter.most_common(1)
if leading_cause:
    leading_cause_name, leading_cause_count = leading_cause[0]
    print(f"The leading cause of death is '{leading_cause_name}' with {leading_cause_count} occurrences.")
else:
    print("No leading cause of death found.")

total_occurrence += sum(cause_counter.values())
print("Total occurrences across all columns:", total_occurrence)

x_1995 = (total_occurrence / df_1995.shape[0]) * 100
print(f"\n{x_1995:.2f}% of actors are recorded to be dead related to the search variables")

The leading cause of death is 'Medical' with 175 occurrences.
Total occurrences across all columns: 493

37.98% of actors are recorded to be dead related to the search variables


In [21]:
df_1990 = df_1990.drop_duplicates()

In [22]:
df_1990.shape[0]

1406

In [23]:
search_terms = ['Aids', 'aids', 'Suicide', 'suicide', 'meth', 'methamphetamine',
                'Methamphetamine', 'gun', 'Gun', 'Self', 'self', 'asphyxiation', 
                'hang', 'abuse', 'drugs', 'attack', 'cocaine', 'Cocaine',
                'car crash', 'partner', 'cancer', 'Cancer', 'OD', 'Overdose',
                'overdose', 'Murder', 'murder', 'murdered', 'boyfriend', 'husband',
                'heart attack', 'stroke', 'Stroke']

exclude_terms = ['Suicide', 'Murder', 'AIDS', 'Overdose']

total_occurrence = 0

from collections import Counter

cause_counter = Counter()

for index, row in df_1990.iterrows():
    if any(term in row['Cause'] for term in exclude_terms):
        total_occurrence += 1
        continue

    if isinstance(row['Short Note'], str):
        short_note_matched = any(term.lower() in row['Short Note'].lower() for term in search_terms)
    else:
        short_note_matched = False
        
    if isinstance(row['Long Description'], str):
        long_desc_matched = any(term.lower() in row['Long Description'].lower() for term in search_terms)
    else:
        long_desc_matched = False
    
    if short_note_matched or long_desc_matched:
        cause_counter[row['Cause']] += 1

leading_cause = cause_counter.most_common(1)
if leading_cause:
    leading_cause_name, leading_cause_count = leading_cause[0]
    print(f"The leading cause of death is '{leading_cause_name}' with {leading_cause_count} occurrences.")
else:
    print("No leading cause of death found.")

total_occurrence += sum(cause_counter.values())
print("Total occurrences across all columns:", total_occurrence)

x_1990 = (total_occurrence / df_1990.shape[0]) * 100
print(f"\n{x_1990:.2f}% of actors are recorded to be dead related to the search variables")

The leading cause of death is 'Medical' with 185 occurrences.
Total occurrences across all columns: 552

39.26% of actors are recorded to be dead related to the search variables


In [24]:
df_1985 = df_1985.drop_duplicates()

In [25]:
df_1985.shape[0]

1475

In [26]:
search_terms = ['Aids', 'aids', 'Suicide', 'suicide', 'meth', 'methamphetamine',
                'Methamphetamine', 'gun', 'Gun', 'Self', 'self', 'asphyxiation', 
                'hang', 'abuse', 'drugs', 'attack', 'cocaine', 'Cocaine',
                'car crash', 'partner', 'cancer', 'Cancer', 'OD', 'Overdose',
                'overdose', 'Murder', 'murder', 'murdered', 'boyfriend', 'husband']

exclude_terms = ['Suicide', 'Murder', 'AIDS', 'Overdose']

total_occurrence = 0

from collections import Counter

cause_counter = Counter()

for index, row in df_1985.iterrows():
    if any(term in row['Cause'] for term in exclude_terms):
        total_occurrence += 1
        continue

    if isinstance(row['Short Note'], str):
        short_note_matched = any(term.lower() in row['Short Note'].lower() for term in search_terms)
    else:
        short_note_matched = False
        
    if isinstance(row['Long Description'], str):
        long_desc_matched = any(term.lower() in row['Long Description'].lower() for term in search_terms)
    else:
        long_desc_matched = False
    
    if short_note_matched or long_desc_matched:
        cause_counter[row['Cause']] += 1

leading_cause = cause_counter.most_common(1)
if leading_cause:
    leading_cause_name, leading_cause_count = leading_cause[0]
    print(f"The leading cause of death is '{leading_cause_name}' with {leading_cause_count} occurrences.")
else:
    print("No leading cause of death found.")

total_occurrence += sum(cause_counter.values())
print("Total occurrences across all columns:", total_occurrence)

x_1985 = (total_occurrence / df_1985.shape[0]) * 100
print(f"\n{x_1985:.2f}% of actors are recorded to be dead related to the search variables")

The leading cause of death is 'Medical' with 184 occurrences.
Total occurrences across all columns: 581

39.39% of actors are recorded to be dead related to the search variables


In [42]:
df_new = df.dropna(subset=['Short Note', 'Long Description'], how='all')
df_new

Unnamed: 0,Name,Cause,Short Note,Date of Death,Long Description
0,A.J. Kahn,Unsure,,2021-12-25,More Info
1,Adam Faust,Medical,Cardiac Arrest,2021-08-02,More Info
3,Adonis Cheeks,Suicide,,2019-11-01,Exact date of death unknown at this time.\nDat...
5,Adrianna Analese,Murder,Killed by her husband in a murder-suicide,2008-11-21,
8,Aiden Bonini,Medical,Meningitis,2007-03-26,"""Aiden was sadly taken from this world earlier..."
...,...,...,...,...,...
1670,Zac Stevens,Suicide,,2015-11-17,More Info
1671,Zachary Strong,Unsure,,2023-05-01,More Info
1672,Zachary Strong,Unsure,,2023-05-01,More Info
1676,Zeff Ryan,AIDS,AIDS,NaT,AKA Jeff Ryan


In [43]:
df_new.groupby('Cause')['Short Note'].apply(lambda x: x.isnull().sum())

Cause
AIDS           6
Accidental     1
Medical       33
Murder        10
Overdose       7
Suicide       18
Unknown        5
Unsure        90
unknown        1
Name: Short Note, dtype: int64

In [44]:
df_new.groupby('Cause')['Long Description'].apply(lambda x: x.isnull().sum())

Cause
AIDS          41
Accidental    26
Medical       96
Murder        17
Overdose      13
Suicide       18
Unknown        1
Unsure        23
unknown        0
Name: Long Description, dtype: int64

In [45]:
search_terms = ['Aids', 'aids', 'Suicide', 'suicide', 'meth', 'methamphetamine',
                'Methamphetamine', 'gun', 'Gun', 'Self', 'self', 'asphyxiation', 
                'hang', 'abuse', 'drugs', 'attack', 'cocaine', 'Cocaine',
                'car crash', 'partner', 'cancer', 'Cancer', 'OD', 'Overdose',
                'overdose', 'Murder', 'murder', 'murdered', 'boyfriend', 'husband',
                'heart attack', 'stroke', 'Stroke']

exclude_terms = ['Suicide', 'Murder', 'AIDS', 'Overdose']

total_occurrence = 0

from collections import Counter

cause_counter = Counter()

for index, row in df_new.iterrows():
    if any(term in row['Cause'] for term in exclude_terms):
        total_occurrence += 1
        continue

    if isinstance(row['Short Note'], str):
        short_note_matched = any(term.lower() in row['Short Note'].lower() for term in search_terms)
    else:
        short_note_matched = False
        
    if isinstance(row['Long Description'], str):
        long_desc_matched = any(term.lower() in row['Long Description'].lower() for term in search_terms)
    else:
        long_desc_matched = False
    
    if short_note_matched or long_desc_matched:
        cause_counter[row['Cause']] += 1

leading_cause = cause_counter.most_common(1)
if leading_cause:
    leading_cause_name, leading_cause_count = leading_cause[0]
    print(f"The leading cause of death is '{leading_cause_name}' with {leading_cause_count} occurrences.")
else:
    print("No leading cause of death found.")

total_occurrence += sum(cause_counter.values())
print("Total occurrences across all columns:", total_occurrence)

x_df_new = (total_occurrence / df_new.shape[0]) * 100
print(f"\n{x_df_new:.2f}% of actors are recorded to be dead related to the search variables")

The leading cause of death is 'Medical' with 204 occurrences.
Total occurrences across all columns: 588

60.56% of actors are recorded to be dead related to the search variables


In [62]:
df_alt = df[~((df['Short Note'].isna()) & (df['Long Description'] == 'More Info'))]

In [63]:
df_alt = df_alt.dropna(subset=['Short Note', 'Long Description'], how='all')

In [64]:
df_alt

Unnamed: 0,Name,Cause,Short Note,Date of Death,Long Description
1,Adam Faust,Medical,Cardiac Arrest,2021-08-02,More Info
3,Adonis Cheeks,Suicide,,2019-11-01,Exact date of death unknown at this time.\nDat...
5,Adrianna Analese,Murder,Killed by her husband in a murder-suicide,2008-11-21,
8,Aiden Bonini,Medical,Meningitis,2007-03-26,"""Aiden was sadly taken from this world earlier..."
9,Aileene Dacosta,Medical,She died in 2011 at age 23 of cerebral vascula...,2011-01-23,More Info
...,...,...,...,...,...
1665,Yasmin Viana,Medical,Heart Failure,2015-08-11,
1668,Yumika Hayashi,Unsure,Unknown,2005-06-29,From Jans-Web: [Japanese AV Star] Yumika Hayas...
1669,Yurizan Beltran,Overdose,Apparent Overdose,2017-12-13,More Info
1676,Zeff Ryan,AIDS,AIDS,NaT,AKA Jeff Ryan


In [65]:
search_terms = ['Aids', 'aids', 'Suicide', 'suicide', 'meth', 'methamphetamine',
                'Methamphetamine', 'gun', 'Gun', 'Self', 'self', 'asphyxiation', 
                'hang', 'abuse', 'drugs', 'attack', 'cocaine', 'Cocaine',
                'car crash', 'partner', 'cancer', 'Cancer', 'OD', 'Overdose',
                'overdose', 'Murder', 'murder', 'murdered', 'boyfriend', 'husband',
                'heart attack', 'cardiac arrest', 'stroke', 'Stroke']

exclude_terms = ['Suicide', 'Murder', 'AIDS', 'Overdose']

total_occurrence = 0

from collections import Counter

cause_counter = Counter()

for index, row in df_alt.iterrows():
    if any(term in row['Cause'] for term in exclude_terms):
        total_occurrence += 1
        continue

    if isinstance(row['Short Note'], str):
        short_note_matched = any(term.lower() in row['Short Note'].lower() for term in search_terms)
    else:
        short_note_matched = False
        
    if isinstance(row['Long Description'], str):
        long_desc_matched = any(term.lower() in row['Long Description'].lower() for term in search_terms)
    else:
        long_desc_matched = False
    
    if short_note_matched or long_desc_matched:
        cause_counter[row['Cause']] += 1

leading_cause = cause_counter.most_common(1)
if leading_cause:
    leading_cause_name, leading_cause_count = leading_cause[0]
    print(f"The leading cause of death is '{leading_cause_name}' with {leading_cause_count} occurrences.")
else:
    print("No leading cause of death found.")

total_occurrence += sum(cause_counter.values())
print("Total occurrences across all columns:", total_occurrence)

x_df_alt = (total_occurrence / df_alt.shape[0]) * 100
print(f"\n{x_df_alt:.2f}% of actors are recorded to be dead related to the search variables")

The leading cause of death is 'Medical' with 209 occurrences.
Total occurrences across all columns: 572

66.74% of actors are recorded to be dead related to the search variables


In [67]:
df_alt.shape

(857, 5)

In [68]:
df_alt

Unnamed: 0,Name,Cause,Short Note,Date of Death,Long Description
1,Adam Faust,Medical,Cardiac Arrest,2021-08-02,More Info
3,Adonis Cheeks,Suicide,,2019-11-01,Exact date of death unknown at this time.\nDat...
5,Adrianna Analese,Murder,Killed by her husband in a murder-suicide,2008-11-21,
8,Aiden Bonini,Medical,Meningitis,2007-03-26,"""Aiden was sadly taken from this world earlier..."
9,Aileene Dacosta,Medical,She died in 2011 at age 23 of cerebral vascula...,2011-01-23,More Info
...,...,...,...,...,...
1665,Yasmin Viana,Medical,Heart Failure,2015-08-11,
1668,Yumika Hayashi,Unsure,Unknown,2005-06-29,From Jans-Web: [Japanese AV Star] Yumika Hayas...
1669,Yurizan Beltran,Overdose,Apparent Overdose,2017-12-13,More Info
1676,Zeff Ryan,AIDS,AIDS,NaT,AKA Jeff Ryan
