In [1]:
from __future__ import division
import json
import warnings
import numpy as np
import pandas as pd
from IOHMM import UnSupervisedIOHMM
from IOHMM import OLS, DiscreteMNL, CrossEntropyMNL
# Install a global "ignore" filter so that warnings raised by the cells below
# are not displayed. NOTE(review): this hides every warning category, which
# may include convergence or deprecation notices from the libraries used.
warnings.simplefilter("ignore")

In [2]:
# Load the raw sheet. skiprows=[0] discards the first physical line of the
# file, so the column header is read from the second line.
data = pd.read_csv('sample-data.csv', skiprows=[0])
print(data.shape)
# Preview the first row only.
data.head(1)

(402, 16)


Unnamed: 0.1,Unnamed: 0,dyad#,GOAL,A_Evasive,Forthcoming,Who Initiated?,Auditor Gender Attachment,Client Gendor Attachment,Client Obs opinion,Round,Auditor Inquiry,Client Response,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,,56.0,1.0,4.0,4.0,AID,ANA,CAN,NO,1.0,I5,,,,,


In [3]:
# Keep only the inquiry/response pair and drop every row in which either of
# the two values is NaN.
data_1 = data.loc[:, ['Auditor Inquiry', 'Client Response']].dropna(how='any')
print(data_1.shape)
# Positional slice: rows 10, 11 and 12 of the cleaned frame.
data_1.iloc[10:13]

(269, 2)


Unnamed: 0,Auditor Inquiry,Client Response
13,I6,R6
16,I6,"R12, R11"
17,I6,R5


In [4]:
# Inquiry codes that occur more than twice, with their frequencies.
data_1['Auditor Inquiry'].value_counts().loc[lambda counts: counts > 2]

I6    131
I1     40
I2     24
I5     23
I4     22
I3     21
I7      6
Name: Auditor Inquiry, dtype: int64

In [5]:
# Number of distinct response strings ...
print(data_1['Client Response'].nunique(dropna=False))
# ... and the responses that occur more than three times.
data_1['Client Response'].value_counts().loc[lambda counts: counts > 3]

133


R19            81
R1              9
R1P             8
NO              7
No response     6
R7              4
R11             4
Name: Client Response, dtype: int64

In [6]:
# top 7 inquiries and 6 responses
# Keep rows whose response is one of six selected codes and whose inquiry is
# not one of the two compound codes 'I6, I6' / 'I3, I6'.
# NOTE(review): 'No response' is not in the kept list although it is more
# frequent than R7 and R11 in the counts printed above -- confirm intentional.
data_2 = data_1[
    data_1['Client Response'].isin(['R19', 'R1', 'R1P', 'NO', 'R11', 'R7'])
    & ~data_1['Auditor Inquiry'].isin(['I6, I6', 'I3, I6'])
]
data_2.shape

(112, 2)

In [7]:
# Preview the first three filtered inquiry/response pairs.
data_2.head(3)

Unnamed: 0,Auditor Inquiry,Client Response
2,I2,R7
3,I3,R19
5,I6,R19


In [8]:
# One-hot encode each side of the dialogue (one indicator column per code),
# then put the two indicator blocks side by side.
Inquiry, Response = (
    pd.get_dummies(data_2[column])
    for column in ('Auditor Inquiry', 'Client Response')
)
data_3 = pd.concat([Inquiry, Response], axis=1)
data_3.head(3)

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,NO,R1,R11,R19,R1P,R7
2,0,1,0,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,1,0,0,0,0,1,0,0


In [9]:
# inquiry -> response
# Inquiry indicator columns as inputs, the raw response code of the same row
# as the output column.
data_4 = pd.concat([Inquiry, data_2['Client Response']], axis=1)
data_4.head(3)

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,Client Response
2,0,1,0,0,0,0,0,R7
3,0,0,1,0,0,0,0,R19
5,0,0,0,0,0,1,0,R19


In [10]:
# response -> next inquiry
# shift(-1) moves each inquiry up one row, so a row's response indicators sit
# next to the inquiry of the following row. The shifted series still carries
# the last index label (with NaN), so the concat re-creates that row filled
# with NaN -- which is why the indicators show up as floats -- and the final
# positional slice removes it again.
# NOTE(review): the shift runs over the whole filtered frame, so a pair may
# join rows that were not adjacent in the raw data or that belong to different
# dyads -- confirm this is acceptable.
data_5 = pd.concat(
    [Response.iloc[:-1], data_2['Auditor Inquiry'].shift(-1)],
    axis=1,
).iloc[:-1]
data_5.head(3)

Unnamed: 0,NO,R1,R11,R19,R1P,R7,Auditor Inquiry
2,0.0,0.0,0.0,0.0,0.0,1.0,I3
3,0.0,0.0,0.0,1.0,0.0,0.0,I6
5,0.0,0.0,0.0,1.0,0.0,0.0,I6


In [11]:
# inquiry -> response model, 3 hidden states
SHMM = UnSupervisedIOHMM(num_states=3, max_EM_iter=200, EM_tol=1e-6)

# One discrete multinomial-logit emission; the transition and initial-state
# models both use CrossEntropyMNL. All three are fitted with the lbfgs solver.
SHMM.set_models(
    model_emissions=[DiscreteMNL(solver='lbfgs')],
    model_transition=CrossEntropyMNL(solver='lbfgs'),
    model_initial=CrossEntropyMNL(solver='lbfgs'),
)

# The same seven inquiry indicator columns feed the initial, transition and
# emission models.
SHMM.set_inputs(
    covariates_initial=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7'],
    covariates_transition=['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7'],
    covariates_emissions=[['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7']],
)

# Single output column, single sequence (the whole of data_4).
SHMM.set_outputs([['Client Response']])
SHMM.set_data([data_4])

SHMM.train()

# Number of rows in the first emission coefficient array.
print(len(SHMM.model_emissions[0][0].coef))

# Rounded emission coefficients for indices 0..2 (matching num_states=3;
# presumably one array per hidden state -- verify against the IOHMM docs).
for state in range(3):
    print(np.round(SHMM.model_emissions[state][0].coef, 3))

6
[[-4.714 -1.5   -0.308 -1.254 -0.    -0.448 -0.732 -0.472]
 [-0.292 10.683 -1.211 -4.127 -0.    -1.641 -2.25  -1.746]
 [ 0.219 -4.048 -1.016  9.787 -0.    -1.309 -1.733 -1.462]
 [ 9.298  0.968  4.008  1.103  0.     5.27   7.189 -9.24 ]
 [-4.714 -1.5   -0.308 -1.254 -0.    -0.448 -0.732 -0.472]
 [ 0.204 -4.604 -1.164 -4.255 -0.    -1.423 -1.742 13.392]]
[[  1.232  -4.331  -0.893  -1.899  -0.     12.405  -3.31   -0.739]
 [ -1.497  24.862  -4.515  -6.532  -0.     -0.289 -10.969  -4.054]
 [ -3.563 -11.169  -3.62   -3.938  -0.     -0.431  19.393  -3.797]
 [  3.305 -23.266 -15.926  22.197   0.    -10.866  14.292  16.875]
 [ -1.501  25.276  -4.617  -6.752  -0.     -0.285 -10.993  -4.131]
 [  2.025 -11.37   29.57   -3.075  -0.     -0.533  -8.413  -4.154]]
[[ -2.113  -2.177  -3.588  -3.295  -3.46   20.419 -10.013  -0.   ]
 [  4.503  12.115  -2.996  -2.198  -3.076  -3.713   4.372  -0.   ]
 [  2.645  -1.101  -1.92   17.881  -2.018  -2.581  -7.617  -0.   ]
 [  5.317  -7.226  12.262  -9.928  12.5

In [12]:
# response -> next inquiry model, 3 hidden states
# NOTE(review): this rebinds SHMM, so the inquiry -> response model fitted in
# the previous cell is no longer reachable under that name.
SHMM = UnSupervisedIOHMM(num_states=3, max_EM_iter=200, EM_tol=1e-6)

# One discrete multinomial-logit emission; the transition and initial-state
# models both use CrossEntropyMNL. All three are fitted with the lbfgs solver.
SHMM.set_models(
    model_emissions=[DiscreteMNL(solver='lbfgs')],
    model_transition=CrossEntropyMNL(solver='lbfgs'),
    model_initial=CrossEntropyMNL(solver='lbfgs'),
)

# The same six response indicator columns feed the initial, transition and
# emission models.
SHMM.set_inputs(
    covariates_initial=['NO', 'R1', 'R11', 'R19', 'R1P', 'R7'],
    covariates_transition=['NO', 'R1', 'R11', 'R19', 'R1P', 'R7'],
    covariates_emissions=[['NO', 'R1', 'R11', 'R19', 'R1P', 'R7']],
)

# Single output column, single sequence (the whole of data_5).
SHMM.set_outputs([['Auditor Inquiry']])
SHMM.set_data([data_5])

SHMM.train()

# Number of rows in the first emission coefficient array.
print(len(SHMM.model_emissions[0][0].coef))

# Rounded emission coefficients for indices 0..2 (matching num_states=3;
# presumably one array per hidden state -- verify against the IOHMM docs).
for state in range(3):
    print(np.round(SHMM.model_emissions[state][0].coef, 3))

7
[[  3.589  15.196  -3.778  -7.833   8.455  -3.992  -4.459]
 [ -0.133  -1.594  -1.932  -2.892   9.697  -1.829  -1.583]
 [  3.69   -3.05   -3.369  10.879   5.889  -3.466  -3.193]
 [ -7.21   -0.701  -1.049  -1.146  -2.912  -0.809  -0.594]
 [  0.579  -1.695  -2.024  13.989  -6.043  -1.942  -1.707]
 [  6.695  -7.456  13.201 -11.852 -12.175  12.845  12.131]
 [ -7.21   -0.701  -1.049  -1.146  -2.912  -0.809  -0.594]]
[[ 0.299 -1.751  7.692 -0.    -2.034 -1.609 -1.999]
 [-3.059 -0.411 -1.01  -0.    -0.745 -0.377 -0.517]
 [ 0.963 -2.201 -3.987  0.    -2.384 -2.003 11.539]
 [-3.059 -0.411 -1.01  -0.    -0.745 -0.377 -0.517]
 [ 3.072 -3.408  4.919 -0.    -3.773  9.076 -3.742]
 [ 4.843  8.593 -5.594 -0.    10.426 -4.334 -4.247]
 [-3.059 -0.411 -1.01  -0.    -0.745 -0.377 -0.517]]
[[ 2.221 -3.665 -2.216 16.881 -2.534 -6.244 -0.   ]
 [ 1.083 -3.51  -2.037 -2.271 -1.499 10.399 -0.   ]
 [ 5.392 -5.32  13.705 -4.184 -4.899  6.09   0.   ]
 [-6.029 -3.55  -1.787 -1.935  6.299 -5.055 -0.   ]
 [-6.413 -3