In [4]:
import numpy as np
import pandas as pd
from datetime import datetime
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter


In [5]:
# Load the input table from a CSV file; the path is relative to the
# notebook's working directory.
full_df = pd.read_csv("county_full_df.csv")

# Model Preprocessing

In [6]:
# Columns kept for modelling: the duration (`date_difference`), the event
# indicator (`censor`) and the covariates passed to the survival models.
model_columns = [
    'SALE_AMOUNT',
    'DEED_CATEGORY_TYPE',
    'PROPERTY_INDICATOR_CODE',
    'censor',
    'date_difference',
    'PURCHASE_AMOUNT',
    'PURCHASE_COMBINED_LTV',
    'ESTIMATED_EQUITY',
    'TOTAL_VALUE_CALCULATED',
    'BEDROOMS',
    'TOTAL_BATHROOMS_CALCULATED',
]
X = full_df[model_columns]

In [7]:
# One-hot encode the two categorical columns, dropping the first level of
# each so the dummies are not perfectly collinear.
# NOTE(review): this cell rebinds `X`, so running it twice on the same kernel
# fails because the categorical columns are already gone.
categorical_columns = ['DEED_CATEGORY_TYPE', 'PROPERTY_INDICATOR_CODE']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
X

Unnamed: 0,SALE_AMOUNT,censor,date_difference,PURCHASE_AMOUNT,PURCHASE_COMBINED_LTV,ESTIMATED_EQUITY,TOTAL_VALUE_CALCULATED,BEDROOMS,TOTAL_BATHROOMS_CALCULATED,DEED_CATEGORY_TYPE_U,...,PROPERTY_INDICATOR_CODE_31.0,PROPERTY_INDICATOR_CODE_32.0,PROPERTY_INDICATOR_CODE_50.0,PROPERTY_INDICATOR_CODE_51.0,PROPERTY_INDICATOR_CODE_52.0,PROPERTY_INDICATOR_CODE_53.0,PROPERTY_INDICATOR_CODE_54.0,PROPERTY_INDICATOR_CODE_70.0,PROPERTY_INDICATOR_CODE_80.0,PROPERTY_INDICATOR_CODE_90.0
0,325000.0,1,3074.000000,325000.0,80.00,549900,566923,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0,6030.614293,325000.0,80.00,549900,566923,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,1,119.000000,852500.0,76.25,318300,778997,4,4.0,0,...,0,0,0,0,0,0,0,0,0,0
3,852500.0,0,5325.614293,852500.0,76.25,318300,778997,4,4.0,0,...,0,0,0,0,0,0,0,0,0,0
4,533000.0,0,6513.614293,533000.0,80.00,134100,572400,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295694,376600.0,1,1768.000000,185000.0,80.00,204600,279863,5,4.0,0,...,0,0,0,0,0,0,0,0,0,0
295695,185000.0,0,3198.614536,185000.0,80.00,204600,279863,5,4.0,0,...,0,0,0,0,0,0,0,0,0,0
295696,247000.0,0,4763.614536,247000.0,103.30,58850,199123,4,2.0,0,...,0,0,0,0,0,0,0,0,0,0
295697,205875.0,1,1850.000000,205875.0,102.15,86504,182932,4,2.0,0,...,0,0,0,0,0,0,0,0,0,0


# Modeling with Survival Regression
https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html
## Cox Proportional Hazards model

In [8]:
# Fit a Cox proportional hazards model on the encoded frame:
# `date_difference` is the duration column and `censor` the event column.
cph = CoxPHFitter().fit(X, duration_col='date_difference', event_col='censor')

cph.print_summary()




0,1
model,lifelines.CoxPHFitter
duration col,'date_difference'
event col,'censor'
baseline estimation,breslow
number of observations,295699
number of events observed,191511
partial log-likelihood,-2253414.08
time fit was run,2020-12-27 19:49:18 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
SALE_AMOUNT,0.0,1.0,0.0,0.0,0.0,1.0,1.0,30.93,<0.005,695.23
PURCHASE_AMOUNT,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,-6.25,<0.005,31.17
PURCHASE_COMBINED_LTV,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,-5.4,<0.005,23.86
ESTIMATED_EQUITY,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,-17.49,<0.005,225.0
TOTAL_VALUE_CALCULATED,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,-1.88,0.06,4.05
BEDROOMS,0.05,1.06,0.0,0.05,0.06,1.05,1.06,34.1,<0.005,844.21
TOTAL_BATHROOMS_CALCULATED,0.01,1.01,0.0,0.0,0.01,1.0,1.01,6.02,<0.005,29.12
DEED_CATEGORY_TYPE_U,1.35,3.87,0.01,1.33,1.37,3.79,3.95,126.14,<0.005,inf
PROPERTY_INDICATOR_CODE_10.0,0.16,1.18,0.04,0.08,0.25,1.08,1.28,3.84,<0.005,12.97
PROPERTY_INDICATOR_CODE_11.0,0.21,1.24,0.04,0.13,0.3,1.14,1.35,4.99,<0.005,20.66

0,1
Concordance,0.56
Partial AIC,4506886.16
log-likelihood ratio test,15789.57 on 29 df
-log2(p) of ll-ratio test,inf


In [None]:
# Draw the fitted Cox model with lifelines' built-in plot
# (presumably coefficient estimates with confidence intervals — confirm in the lifelines docs).
cph.plot()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

from matplotlib import pyplot as plt

# Apply matplotlib's 'bmh' style sheet to the figures drawn below.
plt.style.use('bmh')

In [None]:
# Martingale residuals of the fitted Cox model, computed on the training frame.
# NOTE(review): `r` is rebound by the deviance-residual cell further down, so
# the martingale plot only works while this cell was the last one to set `r`.
r = cph.compute_residuals(X, kind='martingale')
r.head()

In [None]:
# Colour each point by the `censor` flag (the event column used in the fit):
# truthy values are blue (#008fd5), falsy values are red (#fc4f30).
martingale_colors = np.where(r['censor'], '#008fd5', '#fc4f30')
r.plot.scatter(x='date_difference', y='martingale', c=martingale_colors, alpha=0.75)

In [None]:
# Deviance residuals of the fitted Cox model, computed on the training frame.
# This rebinds `r`, replacing whatever residual frame it held before.
r = cph.compute_residuals(X, kind='deviance')
r.head()

In [None]:
# Colour each point by the `censor` flag (the event column used in the fit):
# truthy values are blue (#008fd5), falsy values are red (#fc4f30).
deviance_colors = np.where(r['censor'], '#008fd5', '#fc4f30')
r.plot.scatter(x='date_difference', y='deviance', c=deviance_colors, alpha=0.75)


In [None]:
# Attach the covariates to the deviance residuals, matching rows by index.
# Only the three residual columns are taken from `r` before joining, so the
# cell can be re-run: without that selection a second run would try to join
# covariate columns that `r` already contains and pandas would raise on the
# overlapping names.
r = r[['date_difference', 'censor', 'deviance']].join(
    X.drop(columns=['date_difference', 'censor'])
)

In [None]:
# Deviance residuals against the DEED_CATEGORY_TYPE_U dummy, coloured by the
# `censor` flag: truthy values are blue (#008fd5), falsy values red (#fc4f30).
deed_colors = np.where(r['censor'], '#008fd5', '#fc4f30')
plt.scatter(r['DEED_CATEGORY_TYPE_U'], r['deviance'], color=deed_colors)


## Modeling with Weibull Accelerated Failure Time (AFT)

In [None]:
from lifelines import WeibullAFTFitter

# Fit a Weibull accelerated failure time model on the same frame as the Cox
# model: `date_difference` is the duration column, `censor` the event column.
# (The repeated `load_rossi` import was dropped: it is already imported in the
# first cell and is not used here.)
aft = WeibullAFTFitter()
aft.fit(X, duration_col='date_difference', event_col='censor')

# Summary table printed with 3 decimals.
aft.print_summary(3)

In [None]:
# Print the fitted model's median survival time, then its mean survival time.
for survival_time in (aft.median_survival_time_, aft.mean_survival_time_):
    print(survival_time)

In [None]:
# `aft` is already a WeibullAFTFitter fitted on X with the same duration and
# event columns, so reuse it instead of refitting an identical model on the
# full frame. The `wft` name is kept because later cells refer to it.
wft = aft
wft.plot()

In [None]:
from lifelines import LogLogisticAFTFitter, LogNormalAFTFitter

# Fit the two alternative AFT families on the same frame, using the same
# duration and event columns as the Weibull model.
llf = LogLogisticAFTFitter().fit(X, duration_col='date_difference', event_col='censor')
lnf = LogNormalAFTFitter().fit(X, duration_col='date_difference', event_col='censor')

In [None]:
# Log-logistic AFT summary, printed with 3 decimals.
llf.print_summary(decimals=3)

In [None]:
# Log-normal AFT summary, printed with 3 decimals.
lnf.print_summary(decimals=3)

In [None]:
# Partial AIC of the Cox model (lower is better). It comes from the partial
# likelihood, so it should only be compared with other Cox models, not with
# the parametric AIC values printed below.
print(cph.AIC_partial_) # lower is better

# AIC of each parametric AFT model fitted above (lower is better); these are
# the values that can be ranked against each other.
for aft_model in (wft, llf, lnf):
    print(type(aft_model).__name__, aft_model.AIC_)


In [None]:
from lifelines.calibration import survival_probability_calibration

# `t0` is the time at which predicted and observed survival are compared, so
# it has to sit on the same scale as `date_difference`. The previous value
# (25) is far below the durations shown in the preview of X (hundreds to
# thousands), so the curve was evaluated where almost nothing has happened.
# The median duration is used as a data-driven default.
# NOTE(review): swap in a horizon that matters for the analysis if there is one.
calibration_t0 = float(X['date_difference'].median())
survival_probability_calibration(cph, X, t0=calibration_t0)

In [None]:
# filter down to just censored subjects (censor == 0) to predict remaining survival
censored_subjects = X.loc[~X['censor'].astype(bool)]
censored_subjects_last_obs = censored_subjects['date_difference']

# Conditional survival curves, kept under a name and limited to a small
# preview. Previously the curves were computed for every censored subject and
# the result was thrown away because it was not the cell's last expression.
# With no `times` argument lifelines evaluates each subject on its default
# timeline (presumably every fitted duration — confirm in the lifelines docs),
# which makes the full matrix very large for a frame of this size.
preview_subjects = censored_subjects.head(5)
conditional_survival_preview = wft.predict_survival_function(
    preview_subjects,
    conditional_after=censored_subjects_last_obs.loc[preview_subjects.index],
)

# predict median remaining life for every censored subject (displayed as the cell output)
wft.predict_median(censored_subjects, conditional_after=censored_subjects_last_obs)