In [1]:
import pandas as pd
from collections import Counter
import ast

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics 

import xgboost as xgb

import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the raw dataset from nndb.csv (path is relative to the working directory).
df = pd.read_csv("nndb.csv")

In [None]:
# Target label: True wherever a DIED value is present, False where it is missing.
df["IS_DECEASED"] = df["DIED"].notna()

In [None]:
# Preview the first rows, including the IS_DECEASED column added above.
df.head()

In [None]:
# Pull the raw RISK_FACTORS cells out as a plain Python list for parsing.
data = df["RISK_FACTORS"].tolist()

In [None]:
# Parse one raw RISK_FACTORS cell into a list of whitespace-trimmed strings.
def parse_and_clean(s):
    """Turn one raw cell into a list of stripped strings.

    Parameters
    ----------
    s : str, float or None
        A Python-literal string such as ``"['Smoking', ' Obesity ']"``, or a
        missing value (float NaN as produced by pandas, or ``None``).

    Returns
    -------
    list of str
        The parsed items with surrounding whitespace removed; an empty list
        for missing values.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``s`` is not a valid literal.
    """
    # Missing cells arrive as float NaN; None is treated the same way
    # instead of failing inside literal_eval.
    if s is None or isinstance(s, float):
        return []
    # literal_eval only accepts Python literals, so the text is never executed.
    parsed = ast.literal_eval(s)
    # A bare string literal would otherwise be iterated character by character.
    if isinstance(parsed, str):
        parsed = [parsed]
    return [item.strip() for item in parsed]

In [None]:
# Parse every raw cell; the result has one list of factors per input row.
cleaned_data = list(map(parse_and_clean, data))

In [None]:
# Canonicalise each factor name (trim whitespace, then lowercase) so that
# spelling variants of the same factor collapse to one value.
normalized_data = []
for factors in cleaned_data:
    normalized_data.append([name.strip().lower() for name in factors])

In [None]:
# Single-column frame whose cells are the normalised factor lists, one per input row.
df2 = pd.DataFrame({'RISK_FACTORS': normalized_data})

In [None]:
# Collect every distinct normalised risk factor that appears in any row.
unique_risk_factors = {
    factor
    for sublist in normalized_data
    for factor in sublist
}

In [None]:
# One-hot encode the risk factors: build one single-column DataFrame per
# factor, holding 1 where the row lists that factor and 0 otherwise.

# Normalise each row once, up front, rather than once per (factor, row) pair.
# Rows that are not lists contribute no factors.
row_factor_sets = [
    {item.lower().strip() for item in row} if isinstance(row, list) else set()
    for row in df2['RISK_FACTORS']
]

encoded_frames = []
# Iterate in sorted order: set iteration order depends on string hashing and
# can change between interpreter runs, which would shuffle the feature columns.
for risk_factor in sorted(unique_risk_factors):
    # 0/1 indicator aligned positionally with the rows of df2.
    risk_factor_col = pd.Series(
        [int(risk_factor in factors) for factors in row_factor_sets]
    )

    # Wrap the indicator in a one-column frame named after the factor.
    temp_df = pd.DataFrame({risk_factor: risk_factor_col})
    encoded_frames.append(temp_df)

In [None]:
# Stitch the per-factor indicator columns side by side into one frame.
encoded_df = pd.concat(encoded_frames, axis='columns')

# Attach the indicators to the main frame; join aligns on the row index.
df = df.join(encoded_df, how='left')

In [None]:
# Spot-check one encoded indicator: how many rows have the 'obesity' factor set.
df['obesity'].value_counts()

In [None]:
# Remove the raw RISK_FACTORS column now that it has been one-hot encoded.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run after the column has already been dropped.
df = df.drop(columns=['RISK_FACTORS'], errors='ignore')

In [None]:
# Cast the descriptive text columns to pandas' categorical dtype.
categorical_cols = ['GENDER', 'RACE', 'OCCUPATION', 'NATIONALITY', 'BIRTHPLACE']
for col in categorical_cols:
    df[col] = df[col].astype("category")

In [None]:
# Parse BORN and DIED as datetimes; values that cannot be parsed or are out of
# bounds become NaT instead of raising.
for date_col in ('BORN', 'DIED'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Keep only rows born on or after 1700-01-01. Rows whose BORN is NaT fail the
# comparison and are dropped as well.
cutoff = pd.Timestamp('1700-01-01')
df = df[df['BORN'] >= cutoff]

In [None]:
# Columns excluded from the feature matrix: the target itself, the
# death-related fields, and the identifier / free-text columns.
non_feature_cols = [
    'IS_DECEASED',
    'DIED',
    'LOCATION_OF_DEATH',
    'CAUSE_OF_DEATH',
    'NAME',
    'AKA',
    'LINK',
    'BORN',
    'EXECUTIVE_SUMMARY',
]

df_y = df['IS_DECEASED']
df_X = df.drop(columns=non_feature_cols)

In [None]:
# Keep the columns needed later to label the predictions.
name_column, age_column, is_dead_column = (
    df[col] for col in ('NAME', 'AGE', 'IS_DECEASED')
)

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
TEST_FRACTION = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=TEST_FRACTION,
    random_state=0,
)

In [None]:
# Tally how many rows fall into each class of the full target column.
counter = Counter(df_y)
# Ratio of negative to positive examples, a candidate scale_pos_weight value.
# False/True hash and compare equal to 0/1, so these hit the same Counter entries.
estimate = counter[False] / counter[True]
print('Estimate: %.3f' % (estimate,))

In [None]:
def create_pipe(clf):
    """Wrap an estimator in a single-step scikit-learn Pipeline.

    Parameters
    ----------
    clf : estimator
        The estimator to place in the pipeline.

    Returns
    -------
    Pipeline
        A pipeline whose only step is ``clf``, registered under the name 'clf'.
    """
    steps = [('clf', clf)]
    return Pipeline(steps)

In [None]:
# Classifier settings gathered in one place so they are easy to review.
# NOTE(review): the class-ratio estimate printed in the previous cell is not
# passed as scale_pos_weight here - confirm whether that is intentional.
xgb_params = dict(
    random_state=42,
    verbosity=0,
    tree_method="hist",
    enable_categorical=True,  # the frame carries pandas 'category' columns
)
clf = xgb.XGBClassifier(**xgb_params)

pipeline = create_pipe(clf)

In [None]:
# Fit the pipeline (its single 'clf' step) on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Tabulate the fitted model's feature importances, most important first.
# The scores are whatever XGBoost exposes through `feature_importances_`.
xgb_cols = pipeline['clf'].get_booster().feature_names
feat_imp = pipeline['clf'].feature_importances_

# (feature name, importance) pairs, in the booster's feature order.
feat_list = list(zip(xgb_cols, feat_imp))

df_imp = (
    pd.DataFrame(feat_list, columns=['FEATURE', 'IMPORTANCE'])
    .sort_values(by='IMPORTANCE', ascending=False)
)
# Running total: how much importance the top-k features account for together.
df_imp['SUMMED_TOTAL'] = df_imp['IMPORTANCE'].cumsum()
df_imp.head(30)

In [None]:
def print_confusion(pipeline, X=None, y=None, output_path='02_confusion_matrix.png'):
    """Score a fitted pipeline and report its classification results.

    Prints a classification report, draws the confusion matrix and saves the
    figure to disk.

    Parameters
    ----------
    pipeline : fitted estimator or Pipeline
        Must expose ``predict``.
    X : optional
        Features to predict on. Defaults to the notebook-level ``X_test``.
    y : optional
        True labels for ``X``. Defaults to the notebook-level ``y_test``.
    output_path : str, optional
        Where the confusion-matrix image is written.

    Returns
    -------
    None
    """
    # Fall back to the notebook's hold-out split so existing calls keep working.
    X_eval = X_test if X is None else X
    y_eval = y_test if y is None else y

    y_pred = pipeline.predict(X_eval)

    print(metrics.classification_report(y_eval, y_pred, digits=3))

    disp = ConfusionMatrixDisplay.from_predictions(y_eval,
                                                   y_pred,
                                                   cmap=plt.cm.Blues)

    # Work on the display's own axes/figure rather than pyplot's implicit
    # "current figure", and lay the figure out once after the labels are set.
    disp.ax_.set_ylabel('True label')
    disp.ax_.set_xlabel('Predicted Label')
    disp.figure_.tight_layout()
    disp.figure_.savefig(output_path, dpi=300)

In [None]:
# Report hold-out metrics and save the confusion matrix for the fitted pipeline.
print_confusion(pipeline)

In [None]:
# The model has not been refit since the importance table was built above, so
# show that table again instead of recomputing an identical copy.
df_imp.head(30)

In [None]:

# Score every row of the feature matrix (training rows included, not just the
# hold-out split). Column 1 of predict_proba is the second class in the
# classifier's ordering - presumably IS_DECEASED == True; confirm via classes_.
probs = pipeline.predict_proba(df_X)[:, 1].round(3)

# Pair each probability with the identifying columns captured earlier.
result_df = pd.DataFrame(
    dict(
        NAME=name_column,
        AGE=age_column,
        IS_DECEASED=is_dead_column,
        PREDICTION=probs,
    )
)

In [None]:
# Order the rows from highest to lowest predicted probability.
result_df = result_df.sort_values(by=['PREDICTION'], ascending=False)
result_df

In [None]:
# Keep only the rows not marked as deceased.
still_alive = ~result_df['IS_DECEASED']
filtered_df = result_df[still_alive]
filtered_df

In [None]:
# Shortlist the rows whose predicted probability is at least 0.75.
high_risk = filtered_df['PREDICTION'].ge(0.75)
df_leads = filtered_df[high_risk]
df_leads

In [None]:
# Write the shortlist to CSV, leaving out the DataFrame index.
df_leads.to_csv('dead_pool_leads.csv', index=False)
