<a href="https://colab.research.google.com/github/anelglvz/ML-AI-for-the-Working-Analyst/blob/main/Semana5_Working_Analyst_Survival_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MaxMitre/ML-AI-for-the-Working-Analyst/blob/main/Semana6/Survival_Analysis.ipynb)

In [None]:
# Puede necesitar instalar alguna de estas librerías, solo descomentenlas a necesidad

!pip install lifelines
#!pip install plotly==4.14.3

### Objetivo

Implementar el análisis de supervivencia sobre datos de préstamos; para esto veremos:
  1. Estimador Kaplan Meier
  2. Modelo de Cox
  3. Mención de Survival Random Forest

### Carga

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive') 

Los datos de los préstamos pueden ser encontrados [aquí](https://www.kaggle.com/datasets/yousuf28/prosper-loan). La parte de las etiquetas fue obtenida [aquí](https://github.com/ketra21/prosperloan/blob/master/Prosper%20Loan%20Data%20-%20Variable%20Definitions.csv).

In [None]:
# Single place to edit when the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/Curso-WorkingAnalyst/semana6'

# Loan records and the CSV with the variable definitions.
df = pd.read_csv(f'{DATA_DIR}/prosperLoanData.csv')
df_dict = pd.read_csv(f'{DATA_DIR}/Prosper Loan Data - Variable Definitions.csv')

In [None]:
df_dict

In [None]:
# Index the definitions by the 'Variable' column so rows can be looked up by name with .loc.
df_dict_reindex = df_dict.set_index('Variable')

In [None]:
df_dict_reindex

In [None]:
df.head()

In [None]:
df.info()

#### Filtrando columnas

In [None]:
# Show the dictionary entries for the columns used in the analysis.
described_columns = [
    'LoanKey', 'ProsperRating (numeric)', 'EmploymentStatusDuration',
    'LoanOriginationDate', 'LoanStatus', 'EmploymentStatus',
    'ClosedDate', 'Occupation', 'BorrowerState', 'IsBorrowerHomeowner',
    'StatedMonthlyIncome', 'IncomeRange', 'LoanOriginalAmount',
]
df_dict_reindex.loc[described_columns, :]

In [None]:
# Restrict the raw frame to the columns of interest.
selected_columns = [
    'LoanKey', 'ProsperRating (numeric)', 'EmploymentStatusDuration',
    'LoanOriginationDate', 'LoanStatus', 'EmploymentStatus',
    'ClosedDate', 'Occupation', 'BorrowerState', 'IsBorrowerHomeowner',
    'StatedMonthlyIncome', 'IncomeRange', 'LoanOriginalAmount',
]
df_sub = df.loc[:, selected_columns]
df_sub.head()

In [None]:
df_sub.info()

In [None]:
# Parse both date columns from df_sub itself and build a new frame, so no column is
# assigned into a possibly shared slice.
# errors='coerce' turns unparseable ClosedDate values into NaT; the former
# errors='ignore' is deprecated and returned the raw strings unchanged on failure,
# which would break the date arithmetic done later.
df_sub = df_sub.assign(
    LoanOriginationDate=pd.to_datetime(df_sub['LoanOriginationDate']),
    ClosedDate=pd.to_datetime(df_sub['ClosedDate'], errors='coerce'),
)

In [None]:
df_sub

In [None]:
df_sub['LoanKey'].unique().shape

In [None]:
# Keep the first row for every LoanKey and renumber the index from 0.
df_sub = df_sub.drop_duplicates(subset='LoanKey', ignore_index=True)
df_sub.shape

Revisando el rango máximo de tiempo e ingresos de usuarios

In [None]:
df_sub[['LoanOriginationDate', 'ClosedDate', 'StatedMonthlyIncome']].max()

In [None]:
df_sub[['LoanOriginationDate', 'ClosedDate', 'StatedMonthlyIncome']].min()

#### Revisando la variable del evento -> LoanStatus

In [None]:
df_sub.LoanStatus.unique()

In [None]:
# Keep only loans whose status is Current, Chargedoff or Defaulted.
statuses_of_interest = ['Current', 'Chargedoff', 'Defaulted']
status_mask = df_sub['LoanStatus'].isin(statuses_of_interest)
df_status = df_sub.loc[status_mask].copy()
df_status.head()

In [None]:
df_status.LoanStatus.unique()

In [None]:
# Event indicator: 0 for 'Current' loans (censored), 1 for every other status.
df_status['LoanStatus_Censored'] = (df_status['LoanStatus'] != 'Current').astype('int64')

In [None]:
df_status.head(10)

In [None]:
# Latest closing date present in the filtered data.
max_date = df_status['ClosedDate'].max()
max_date

Vamos a arreglar los tiempos para los datos censurados

In [None]:
# Loans without a closing date get the latest observed closing date.
# Assign the result back: an in-place fillna on a Series pulled out of the frame
# is a chained assignment, which warns in pandas 2.x and does not update the
# frame under Copy-on-Write.
df_status['ClosedDate'] = df_status['ClosedDate'].fillna(max_date)

In [None]:
df_status

In [None]:
# Duration of each loan: closing date (filled in above when missing) minus origination date.
df_status['time'] = df_status['ClosedDate'] - df_status['LoanOriginationDate']

In [None]:
df_status.head()

Veamos los elementos que tienen problemas en la columna "time" (préstamos originados después de la última fecha de cierre, por lo que su tiempo resulta negativo)

In [None]:
pd.to_timedelta(0, unit='D')

In [None]:
df_status[df_status['time'] < pd.to_timedelta(0, unit='D')]

In [None]:
# Keep only strictly positive durations (zero-length ones are dropped as well).
# .copy() makes the result an independent frame, so adding or replacing columns
# on it later does not raise SettingWithCopyWarning.
zero_duration = pd.Timedelta(0)
df_status_clean = df_status[df_status['time'] > zero_duration].copy()
df_status_clean.head()

In [None]:
# Convert the duration from a timedelta to a whole number of days.
# The dtype guard makes the cell safe to re-run: once 'time' is already an integer
# the `.dt` accessor no longer exists. `assign` builds a new frame instead of
# writing into what may be a slice of df_status.
if pd.api.types.is_timedelta64_dtype(df_status_clean['time']):
  df_status_clean = df_status_clean.assign(time=df_status_clean['time'].dt.days)

In [None]:
df_status_clean

In [None]:
df_status_clean.LoanStatus_Censored.value_counts(normalize=True)

Recordemos: 0 son datos censurados

### Kaplan-Meier

In [None]:
df_status_clean[['time', 'LoanStatus_Censored']]

In [None]:
# Durations and event indicators for the cleaned sample.
T = df_status_clean['time']
E = df_status_clean['LoanStatus_Censored']

# fit() returns the fitter itself, so construction and fitting can be chained.
model = KaplanMeierFitter().fit(T, event_observed=E)

function = model.survival_function_

In [None]:
function.plot(title='Kaplan Meier Estimator')

In [None]:
model.median_survival_time_

In [None]:
model.event_table

In [None]:
# Estimated survival probability at two durations (in days).
for t in (1000, 775):
  print(model.predict(t))

In [None]:
# Kaplan-Meier curves for homeowners versus non-homeowners.
borrower = df_status_clean['IsBorrowerHomeowner'] == True

T = df_status_clean['time']
E = df_status_clean['LoanStatus_Censored']

model = KaplanMeierFitter()

# Re-fit the same fitter once per group and keep each resulting survival function.
curves = []
for mask, label in ((borrower, 'Homeowner'), (~borrower, 'Not Homeowner')):
  model.fit(T[mask], event_observed=E[mask], label=label)
  curves.append(model.survival_function_)

fig_1, fig_2 = curves

figure = pd.concat(curves, axis=1)
figure.plot(backend='plotly')

In [None]:
# Log-rank test comparing the homeowner group against the non-homeowner group.
results = logrank_test(T[borrower], T[~borrower], event_observed_A=E[borrower],
                       event_observed_B=E[~borrower])
results.print_summary()

Entonces se puede rechazar la hipótesis nula de que las funciones de riesgo son iguales (y, por lo tanto, también la de que sus funciones de *supervivencia* lo son).

Es importante señalar que siempre se supone falla!!!

In [None]:
def survival(data, group_field, time_field, event_field):
  """
  Fit one Kaplan-Meier curve per distinct value of a grouping column.

  Parameters
    data: DataFrame holding the grouping, duration and event columns.
    group_field: name of the column whose distinct non-null values define the groups.
    time_field: name of the column with the durations.
    event_field: name of the column with the event indicator handed to the fitter.

  Returns
    DataFrame with one survival-function column per group, labelled with
    str(group value). Columns are aligned on the union of the groups'
    timelines, so a group has missing values at times where it has no
    estimate. An empty DataFrame is returned when there are no groups.
  """

  group_values = data[group_field].dropna().unique()

  # pd.concat raises on an empty list, so handle "no groups" explicitly.
  if len(group_values) == 0:
    return pd.DataFrame()

  model = KaplanMeierFitter()
  curves = []

  for value in group_values:
    group = data[data[group_field] == value]
    model.fit(group[time_field], group[event_field], label=str(value))
    # Each fit produces a fresh survival_function_ frame, so reusing the fitter is safe.
    curves.append(model.survival_function_)

  return pd.concat(curves, axis=1)


In [None]:
# One Kaplan-Meier curve per occupation, drawn through the plotly plotting backend.
rates = survival(df_status_clean, 'Occupation', 'time', 'LoanStatus_Censored')
rates.plot(kind='scatter', title='Pago completo dependiendo de su ocupación', 
           backend='plotly')

In [None]:
# Kaplan-Meier curves for two specific occupations.
ocupation1 = df_status_clean['Occupation'] == 'Computer Programmer'
ocupation2 = df_status_clean['Occupation'] == 'Professor'

T = df_status_clean['time']
E = df_status_clean['LoanStatus_Censored']

model = KaplanMeierFitter()

# Re-fit the same fitter once per occupation and keep each survival function.
curves = []
for mask, label in ((ocupation1, 'Computer Programmer'), (ocupation2, 'Professor')):
  model.fit(T[mask], event_observed=E[mask], label=label)
  curves.append(model.survival_function_)

fig_1, fig_2 = curves

figure = pd.concat(curves, axis=1)
figure.plot(backend='plotly', kind='scatter')

In [None]:
# Log-rank test comparing the 'Computer Programmer' group against the 'Professor' group.
results = logrank_test(T[ocupation1], T[ocupation2], event_observed_A=E[ocupation1],
                       event_observed_B=E[ocupation2])
results.print_summary()

### Cox Model (Segunda Sesión)

In [None]:
df_status_clean.head()

In [None]:
# Covariates plus the duration and event columns for the Cox model.
cox_columns = ['LoanOriginalAmount', 'IsBorrowerHomeowner', 'StatedMonthlyIncome',
               'time', 'LoanStatus_Censored']
to_model = df_status_clean[cox_columns]

In [None]:
# Fit a Cox model on to_model: 'time' is the duration column and
# 'LoanStatus_Censored' the event indicator.
model = CoxPHFitter()
model.fit(to_model, duration_col='time', event_col='LoanStatus_Censored')

In [None]:
model.print_summary()

In [None]:
model.plot()

In [None]:
# Compare the fitted outcome for three values of the loan amount.
loan_amounts = [1000, 10000, 20000]
model.plot_partial_effects_on_outcome(
    covariates='LoanOriginalAmount',
    values=loan_amounts,
    cmap='coolwarm',
    figsize=(15, 10),
)

baseline es $h_0(t)$

Prediciendo riesgos

In [None]:
to_model[:5]

In [None]:
model.predict_partial_hazard(to_model[:5])

### DF filter 2009


In [None]:
# Loans originated during 2009, with a fresh 0-based index.
originated_in_2009 = df_status_clean['LoanOriginationDate'].dt.year.eq(2009)
df_2009 = df_status_clean.loc[originated_in_2009].reset_index(drop=True)
df_2009.head()

In [None]:
df_2009.info()

In [None]:
# Valores distintos en la columna
df_2009.EmploymentStatus.unique()

In [None]:
df_2009[['EmploymentStatus']]

In [None]:
# One-hot encode EmploymentStatus; drop_first=True leaves out the first category.
dummies = pd.get_dummies(df_2009[['EmploymentStatus']], drop_first=True)
dummies

In [None]:
dummies.sum()

In [None]:
df_2009.IncomeRange.unique()

In [None]:
def income_range_ordinal(label: str):
  """
  Map an IncomeRange label to an ordinal code that grows with income.

  Parameters
    label: one of the IncomeRange labels listed in the table below.

  Returns
    0 for '$0' and 'Not employed', then 1..5 for the brackets from
    '$1-24,999' up to '$100,000+'. Any other value (for example
    'Not displayed' or a missing value) yields NaN instead of being
    counted as the highest bracket.
  """
  # Brackets listed in increasing income order so the code is truly ordinal.
  ordinal_by_label = {
      '$0': 0,
      'Not employed': 0,
      '$1-24,999': 1,
      '$25,000-49,999': 2,
      '$50,000-74,999': 3,
      '$75,000-99,999': 4,
      '$100,000+': 5,
  }
  return ordinal_by_label.get(label, float('nan'))

In [None]:
# Encode every IncomeRange label with income_range_ordinal.
df_2009['IncomeRange_ord'] = df_2009['IncomeRange'].map(income_range_ordinal)

In [None]:
df_2009.head()

In [None]:
# Append the EmploymentStatus dummy columns to the 2009 frame (column-wise concat).
df_2009_dumm = pd.concat([df_2009, dummies], axis=1)
df_2009_dumm.head()

In [None]:
# Keep the numeric and boolean columns through the public API;
# DataFrame._get_numeric_data() is private and may change without notice.
# 'bool' is listed explicitly so boolean columns are retained as before.
# .copy() makes the result an independent frame for the clean-up that follows.
df_2009_num = df_2009_dumm.select_dtypes(include=['number', 'bool']).copy()
df_2009_num

In [None]:
# Number of missing values per column; show only columns that have any.
null_col = df_2009_num.isna().sum()
null_col.loc[null_col.gt(0)]

In [None]:
# Discard every row that has a missing value in any column.
df_2009_num = df_2009_num.dropna()

In [None]:
df_2009_num.info()

In [None]:
df_2009_num.columns.duplicated()

In [None]:
df_2009_num

### Cox Model 2009

In [None]:
# Every column of df_2009_num feeds the model; the trailing comment shows how to restrict them.
to_model = df_2009_num #.iloc[:, 0:10]

In [None]:
to_model

In [None]:
# Fit a Cox model on the 2009 data: 'time' is the duration column and
# 'LoanStatus_Censored' the event indicator.
model = CoxPHFitter()
model.fit(to_model, duration_col='time', event_col='LoanStatus_Censored')

model.print_summary()

In [None]:
model.plot()

In [None]:
model.baseline_cumulative_hazard_.plot()

In [None]:
model.baseline_survival_.plot()

In [None]:
to_model[100:105]

In [None]:
model.predict_partial_hazard(to_model[100:105])

## Caso con estimador paramétrico

In [None]:
# Same Cox fit, but with the baseline estimated via the 'spline' method using 6 knots.
model = CoxPHFitter(baseline_estimation_method='spline', n_baseline_knots=6)
model.fit(to_model, duration_col='time', event_col='LoanStatus_Censored')

# model.print_summary()
model.baseline_cumulative_hazard_.plot()

In [None]:
model.baseline_survival_.plot()

# Random Survival Forest for 2010

In [None]:
!pip install scikit-survival

In [None]:
# Loans originated during 2010, with a fresh 0-based index.
originated_in_2010 = df_status_clean['LoanOriginationDate'].dt.year.eq(2010)
df_2010 = df_status_clean.loc[originated_in_2010].reset_index(drop=True)
df_2010.head()

In [None]:
# Two covariates plus the duration and event columns, copied into an independent frame.
rsf_columns = ['LoanOriginalAmount', 'StatedMonthlyIncome', 'time', 'LoanStatus_Censored']
to_model2 = df_2010.loc[:, rsf_columns].copy()

In [None]:
to_model2

In [None]:
# Modificación de datos para alimentar el modelo Random Survival Forest
Xt = to_model2[['LoanOriginalAmount', 'StatedMonthlyIncome']]
y = np.array(to_model2[['LoanStatus_Censored', 'time']].apply(tuple, axis=1)).astype([('cens', '?'), ('time', '<f8')])

In [None]:
to_model2[['LoanStatus_Censored', 'time']]

In [None]:
from sklearn.model_selection import train_test_split

from sksurv.ensemble import RandomSurvivalForest

In [None]:
# Fixed seed, reused below for the forest, so the split is reproducible.
random_state = 40

# Hold out 25% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=random_state)

In [None]:
y_train[:10]

In [None]:
# Random Survival Forest with 1000 estimators, seeded with the same random_state as the split.
rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           random_state=random_state)
rsf.fit(X_train, y_train)

In [None]:
# Concordance index
rsf.score(X_train, y_train)

In [None]:
rsf.score(X_test, y_test)

## Cómo podemos ver de qué servirían las predicciones

Haremos una predicción del riesgo

In [None]:
rsf.predict(X_test[:5])

In [None]:
y_test[:5]

In [None]:
X_test[:5]