In [179]:
import pandas as pd
import numpy as np
import os
import datetime
from HMM import unsupervised_HMM
from HMM import supervised_HMM
from HMM_helper import sample_sentence
import simplejson
from hmmlearn import hmm

In [180]:
import git
import sys
# Locate the git repository that contains the current directory (walking up
# through parent directories) and use its working-tree root as the base path
# for the data files read further down.
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir

In [181]:
def makeHMMUnSupData(Input, colname, fipsname):
    """Split one column of a long-format dataframe into per-FIPS sequences.

    Parameters
    ----------
    Input : pandas.DataFrame
        Frame holding at least the columns `colname` and `fipsname`.
    colname : str
        Column whose values make up the sequences.
    fipsname : str
        Column identifying which FIPS code each row belongs to.

    Returns
    -------
    list of list
        One inner list per distinct value of `fipsname`, in order of first
        appearance; each inner list holds that code's `colname` values in
        row order. An empty frame gives an empty list.

    Notes
    -----
    The frame is split in a single groupby pass instead of being filtered
    once per FIPS code. Rows whose `fipsname` is missing (NaN) do not form
    a sequence.
    """
    # sort=False keeps the groups in order of first appearance, i.e. the same
    # order Input[fipsname].unique() would produce.
    grouped = Input.groupby(fipsname, sort=False)[colname]
    return [list(values) for _, values in grouped]

In [182]:
def makeHMMmap(Output):
    """Re-encode sequences of values as dense indices 0 .. D-1.

    `Output` is a list of lists such as the one produced by
    makeHMMUnSupData. The D distinct values found across all inner lists are
    sorted ascending and numbered from 0.

    Returns a three-element list [Map, RMap, HMMOutput]:
      Map       -- dict from int(value) to its index
      RMap      -- dict from index back to int(value)
      HMMOutput -- `Output` with every value replaced by Map[value]
    """
    # Distinct values over every sequence, in ascending order.
    distinct = np.sort(np.array(list({v for seq in Output for v in seq})))

    Map = {}
    RMap = {}
    for idx, raw in enumerate(distinct):
        key = int(raw)
        Map[key] = idx
        RMap[idx] = key

    # Same nesting as the input, with each value swapped for its index.
    HMMOutput = [[Map[v] for v in seq] for seq in Output]
    return [Map, RMap, HMMOutput]

In [183]:
def makeHMMSupData(UnSupData):
    """Turn each time series into an (X, Y) pair shifted by one step.

    `UnSupData` is a list of sequences (e.g. from makeHMMUnSupData). For
    every sequence with at least two elements, X receives the sequence
    without its last element and Y receives the sequence without its first
    element, so Y[i][t] is the value that follows X[i][t]. Sequences with
    fewer than two elements are left out.

    Returns [X, Y], two lists of lists of equal length.
    """
    X, Y = [], []
    for seq in UnSupData:
        # A sequence of length 0 or 1 cannot supply both an X and a Y part.
        if len(seq) < 2:
            continue
        X.append(list(seq[:-1]))
        Y.append(list(seq[1:]))
    return [X, Y]

In [187]:
def makeX(Data, DTW, cluster_col, cluster_num, fipsname, deathsname, dtw_fipsname='FIPS'):
    """Collect the death series of one cluster in hmmlearn's input layout.

    Parameters
    ----------
    Data : pandas.DataFrame
        Long-format deaths data with columns `fipsname` and `deathsname`.
    DTW : pandas.DataFrame
        Cluster assignments with columns `cluster_col` and `dtw_fipsname`.
    cluster_col : str
        Column of `DTW` holding the cluster labels.
    cluster_num
        Label of the cluster to extract.
    fipsname : str
        FIPS column name in `Data`.
    deathsname : str
        Deaths column name in `Data`.
    dtw_fipsname : str, optional
        FIPS column name in `DTW`. Defaults to 'FIPS', the name that used to
        be hard-coded here.

    Returns
    -------
    list
        [observations, lengths]: `observations` is a numpy array of shape
        (total_rows, 1) with every selected sequence stacked end to end, and
        `lengths` lists the length of each individual sequence in the same
        order.
    """
    # FIPS codes that belong to the requested cluster.
    cluster_fips = list(DTW[DTW[cluster_col] == cluster_num][dtw_fipsname])
    cluster_rows = Data[Data[fipsname].isin(cluster_fips)]
    sequences = makeHMMUnSupData(cluster_rows, deathsname, fipsname)

    lengths = [len(seq) for seq in sequences]
    stacked = [value for seq in sequences for value in seq]
    # reshape(-1, 1) gives one observation per row with a single feature.
    return [np.array(stacked).reshape(-1, 1), lengths]

In [188]:
#Dataframes of deaths
# Each CSV is read from <homedir>/models/HMM_Work, with its first column used
# as the dataframe index.
NYT_F = pd.read_csv(f"{homedir}/models/HMM_Work/NYT_daily_Filled.csv", index_col=0)
NYT_W = pd.read_csv(f"{homedir}/models/HMM_Work/NYT_daily_Warp.csv", index_col=0)
JHU = pd.read_csv(f"{homedir}/models/HMM_Work/JHU_daily.csv", index_col=0)
#list of lists of deaths data
# NOTE(review): unlike the CSVs above, these three JSON text files are opened
# relative to the current working directory rather than under homedir, so
# this cell only works when the kernel is started in the directory that holds
# them -- confirm that is intended.
with open('NYT_daily_Warp_Death.txt') as f:
    NYT_daily_Warp_Death = simplejson.load(f)
with open('NYT_daily_Death_Filled.txt') as g:
    NYT_daily_Death_Filled = simplejson.load(g)
with open('JHU_daily_death.txt') as h:
    JHU_daily_death = simplejson.load(h)
#DTW Based Clusters
DTW_Clusters = pd.read_csv(f"{homedir}/models/HMM_Work/DTW_Clustering.csv", index_col=0)

In [195]:
# Deaths of every county assigned to cluster 3 in the 'NYT_F_Z_L' column of
# DTW_Clusters, taken from NYT_F. test[0] is the stacked observation column
# and test[1] the per-county sequence lengths.
test = makeX(NYT_F, DTW_Clusters, 'NYT_F_Z_L', 3, 'fips', "deaths")

In [196]:
# Candidate models for the cluster's death series: a 10-state Gaussian HMM
# and a 4-state HMM with 2-component Gaussian-mixture emissions, both with
# full covariance matrices.
model1 = hmm.GaussianHMM(n_components=10, covariance_type="full")
model2 = hmm.GMMHMM(n_components=4, n_mix=2, covariance_type="full")
# Fit model1 on the stacked observations (test[0]) and their sequence lengths
# (test[1]). Later cells call get_stationary_distribution() and
# score_samples() on model1, which need a fitted model; without this line
# they fail on a fresh kernel. model2 is only constructed here.
model1.fit(test[0], test[1])


In [211]:
# 4-state Gaussian HMM with full covariance, fitted on the stacked
# observations (test[0]) with per-sequence lengths (test[1]).
# NOTE(review): no random_state is passed, so the fitted parameters and the
# scores shown below may differ from run to run -- confirm whether a seed is
# wanted.
model3 = hmm.GaussianHMM(n_components=4, covariance_type="full")
model3.fit(test[0],test[1])
# Displays a (log-likelihood, posteriors) pair; the posteriors have one row
# per observation and one column per hidden state.
model3.score_samples(test[0],test[1])

(83123.19036987724, array([[5.24264714e-026, 0.00000000e+000, 1.00000000e+000,
         2.20311290e-122],
        [6.71091493e-006, 3.55639168e-041, 9.99993289e-001,
         6.30747745e-014],
        [6.71165910e-006, 2.29341890e-027, 9.99993288e-001,
         6.30984054e-014],
        ...,
        [6.72395748e-006, 2.05113388e-024, 9.99993276e-001,
         6.35857322e-014],
        [6.71295138e-006, 2.14389389e-021, 9.99993287e-001,
         8.72878937e-013],
        [1.82700768e-005, 4.87354010e-016, 9.99981703e-001,
         2.65802220e-008]]))

In [212]:
# Stationary distribution over model1's hidden states (one probability per
# state, 10 for n_components=10). model1 must already be fitted.
model1.get_stationary_distribution()

array([6.78641935e-01, 6.18624625e-04, 9.69925956e-04, 7.66309930e-02,
       1.05320001e-01, 1.37472145e-04, 9.37721758e-02, 3.78499830e-02,
       5.85895509e-03, 1.99934441e-04])

In [210]:
# Log-likelihood of the cluster data under model1 together with the per-row
# posterior state probabilities. model1 must already be fitted.
model1.score_samples(test[0],test[1])

(86155.42524077903,
 array([[1.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
         1.89160770e-075, 2.52088085e-150, 0.00000000e+000],
        [9.99998342e-001, 0.00000000e+000, 1.05986938e-096, ...,
         6.70495089e-011, 2.50462540e-013, 3.81110875e-191],
        [9.99998342e-001, 1.87126218e-207, 5.57076813e-044, ...,
         6.70503660e-011, 2.50462938e-013, 4.81402682e-090],
        ...,
        [9.99998342e-001, 1.00671306e-208, 2.99697207e-045, ...,
         6.70497865e-011, 2.50458637e-013, 2.58985899e-091],
        [9.99998342e-001, 7.69135330e-204, 1.89435887e-040, ...,
         6.70882422e-011, 2.53815694e-013, 1.63702638e-086],
        [9.99995633e-001, 1.55723701e-148, 3.61358648e-034, ...,
         4.15376246e-009, 8.51721169e-010, 1.33562472e-080]]))

In [37]:
# Toy hmmlearn example: two short sequences are stacked into one
# single-feature column, and `lengths` tells fit() where each one ends.
X1 = [0.5, 1.0, -1.0, 0.42, 0.24]
X2 = [2.4, 4.2, 0.5, -0.24]
lengths = [len(seq) for seq in (X1, X2)]
X = np.array(X1 + X2).reshape(-1, 1)
hmm.GaussianHMM(n_components=2).fit(X, lengths)

GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=2, n_iter=10, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [102]:
# Show the stacked toy observations: X1 followed by X2 as a single column.
X

array([[ 0.5 ],
       [ 1.  ],
       [-1.  ],
       [ 0.42],
       [ 0.24],
       [ 2.4 ],
       [ 4.2 ],
       [ 0.5 ],
       [-0.24]])

In [10]:
#This is just a testing file so far, because our actual HMM clusterings are not available
#Making basic list-of-lists data directly from the NYT data (no clustering; we just take the whole dataset)
# NOTE(review): NYT_daily is not defined in any cell visible in this notebook
# (the frames loaded above are NYT_F, NYT_W and JHU), so this cell raises a
# NameError on a fresh kernel unless it is defined elsewhere -- confirm which
# frame is meant.
DailyDeathUnSup = makeHMMUnSupData(NYT_daily, 'deaths', 'fips')
#Making the mapping of number of deaths to HMM states
[DailyDeathMap, DailyDeathRMap, DailyDeathUnSupHMM] = makeHMMmap(DailyDeathUnSup)
#Making supervised X and Y datasets
DailyDeathSup = makeHMMSupData(DailyDeathUnSupHMM)

In [11]:
#Using the supervised testing data to build a supervised HMM:
#DailyDeathSup[0] is the X part and DailyDeathSup[1] the Y part from makeHMMSupData
SupHMM = supervised_HMM(DailyDeathSup[0],DailyDeathSup[1])
SupHMM

<HMM.HiddenMarkovModel at 0x1a01ee52c50>

In [13]:
# Average of 1000 sampled trajectories from SupHMM.
# Each call generates a sample of length 14 with a starting state of 2
# (a state of 2 means 2 people died in the county yesterday).
n_draws = 1000
horizon = 14
test = np.zeros(horizon)
for draw_idx in range(n_draws):
    draw = sample_sentence(SupHMM, DailyDeathMap, horizon, 2)
    test += np.array(draw)
print(test/n_draws)

[1.355 1.145 1.213 1.019 0.806 0.712 0.668 0.776 0.617 0.632 0.631 0.596
 0.632 0.562]
