In [1]:
import pandas as pd
import sys
sys.path.append("..") # so we can import our custom utils module
import utils
from functools import reduce

In [2]:
# Processed CDP lifecycle data, located relative to this notebook's directory.
processed_xlsx_path = '../data/debt_lives_processed.xlsx'
df = pd.read_excel(processed_xlsx_path)

### Reshape dataset to a list of state sequences
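Each sequence represents the state lifecycle of an individual CDP (collateralized debt position).

Each entry in a sequence is a tuple `(state, timestamp, duration)`: the name of the state, the Unix timestamp at which the CDP entered it, and the number of seconds the CDP stayed in it.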

In [3]:
# Reshape the processed table into one state sequence per CDP.
sequences = utils.dataframe_to_sequences(df)
# Preview only the first 5 CDPs; per the displayed output, each state is a
# (name, timestamp, seconds spent in that state) tuple.
sequences[:5] # see first 5 CDPs represented as state sequences

[[('safe', 1549495867.0, 0.0)],
 [('safe', 1549495085.0, 782.0)],
 [('safe', 1549493938.0, 1929.0)],
 [('safe', 1549492995.0, 2772.0), ('wiped', 1549495767.0, 100.0)],
 [('safe', 1549492791.0, 3076.0)]]

### Transition probabilities (discrete)
Time unit = second

In [4]:
# Transition probability matrix computed from the CDP state sequences.
# No time unit is passed here; the section heading states the unit is one second
# (presumably the helper's default — confirm in utils).
Q_s = utils.transition_probabilities(sequences)
# Display the matrix (rows: current state, columns: next state).
Q_s

Unnamed: 0,safe,unsafe,wiped,bitten,shut
safe,1.0,5.351749e-08,2.391805e-07,9.889101e-09,6.011022e-09
unsafe,7.2e-05,0.9991006,7.966904e-05,0.0007473049,2.329504e-07
wiped,0.0,0.0,1.0,0.0,0.0
bitten,0.0,0.0,0.0,1.0,0.0
shut,0.0,0.0,0.0,0.0,1.0


### Transition probabilities (discrete, per minute)
Time unit = 60 seconds = 1 minute

In [5]:
# Same transition-probability computation, but with one time step spanning a minute.
SECONDS_PER_MINUTE = 60
Q_m = utils.transition_probabilities(
    sequences,
    seconds_per_time_unit=SECONDS_PER_MINUTE,
)
Q_m

Unnamed: 0,safe,unsafe,wiped,bitten,shut
safe,0.999981,3e-06,1.4e-05,5.933461e-07,3.606613e-07
unsafe,0.004333,0.946035,0.00478,0.0448383,1.397703e-05
wiped,0.0,0.0,1.0,0.0,0.0
bitten,0.0,0.0,0.0,1.0,0.0
shut,0.0,0.0,0.0,0.0,1.0


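### Transition rates (continuous case)
Also known as the infinitesimal generator matrix or the intensity matrix. Each off-diagonal entry is the rate of moving from the row state to the column state; each diagonal entry is the negative sum of the other entries in its row, so every row sums to zero.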

In [6]:
# Continuous-time transition rate (generator) matrix computed from the same sequences.
# NOTE(review): the `_s` suffix suggests rates per second, matching Q_s — confirm in utils.
Λ_s = utils.transition_rates(sequences)
# Display the matrix; in the rendered output each row sums to zero and the
# terminal states (wiped, bitten, shut) have all-zero rows.
Λ_s

Unnamed: 0,safe,unsafe,wiped,bitten,shut
safe,-3.085981e-07,5.351749e-08,2.391805e-07,9.889101e-09,6.011022e-09
unsafe,7.221463e-05,-0.0008994216,7.966904e-05,0.0007473049,2.329504e-07
wiped,0.0,0.0,0.0,0.0,0.0
bitten,0.0,0.0,0.0,0.0,0.0
shut,0.0,0.0,0.0,0.0,0.0


### Sequence distribution

In [7]:
# Distribution over distinct state paths (e.g. "safe -> wiped").
# The displayed values sum to 1, so each is presumably the share of CDPs
# that followed that path — verify against utils.
utils.sequence_distribution(sequences)

safe -> wiped                       0.657235
safe                                0.163138
safe -> unsafe -> bitten            0.122163
safe -> shut                        0.016489
safe -> bitten                      0.015994
safe -> unsafe -> wiped             0.013024
safe -> unsafe -> safe -> bitten    0.011196
safe -> unsafe -> safe -> wiped     0.000381
safe -> unsafe -> safe              0.000190
safe -> unsafe                      0.000114
safe -> unsafe -> safe -> shut      0.000038
safe -> unsafe -> shut              0.000038
dtype: float64

### Time spent distribution
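What fraction of all dai-time is spent in state *i* before moving to state *j*? In the table below, rows are the current state *i* (`from_state`) and columns are the next state *j* (`to_state`).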

In [12]:
# Rebuild the sequences from the raw table, this time with an end-state marker
# (presumably the source of the `<end>` column in the result — confirm in utils).
sequences_with_next_state = utils.dataframe_to_sequences_with_end_state(df)
# Matrix of time-spent fractions indexed by from_state (rows) and to_state (columns).
time_spent_matrix = utils.time_spent_before_state_change_distribution(sequences_with_next_state)
# Display the matrix; NaN marks from/to pairs with no entry in the result.
time_spent_matrix

to_state,<end>,bitten,safe,shut,unsafe,wiped
from_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
safe,0.248137,0.015634,,0.01048293,0.255243,0.4704431
unsafe,6e-06,4e-05,1.2e-05,5.955267e-10,,4.954644e-07


In [14]:
# Sum across the to_state columns to get the total time share per from_state.
# pandas skips NaN cells by default, so missing from/to pairs contribute nothing.
time_spent_matrix.sum(axis=1)

from_state
safe      0.999941
unsafe    0.000059
dtype: float64