# Poisson regression for anomalies (real label)

In [47]:
import re
import os 
import pandas as pd 
import numpy as np
import time
import math

#change to the directory for ADL data
adlpath = '/Users/xuzekun/Desktop/research/paper5/data/NCSU-ADL'

#change to the directory for anomaly data
anomalypath = '/Users/xuzekun/Desktop/research/paper5/data/ADLData-HR-Anomoly'

#change to the correpsonding adl file with cluster labels

adlsuffix = ['/Subject015/Annotations.csv','/Subject059/Annotations.csv',
           '/Subject274/Annotations.csv','/Subject292/Annotations.csv',
           '/Subject380/Annotations.csv','/Subject390/Annotations.csv',
           '/Subject454/Annotations.csv','/Subject503/Annotations.csv',
           '/Subject805/Annotations.csv','/Subject875/Annotations.csv']

#change to the corresponding anomaly filenames in the anomaly data
anomalysuffix = ['/15anomolies.csv','/59anomolies.csv','/274anomolies.csv',
           '/292anomolies.csv','/380anomolies.csv','/390anomolies.csv',
           '/454anomolies.csv','/503anomolies.csv','/805anomolies.csv',
           '/875anomolies.csv']

In [48]:
def anomaly_count(adlsuffix, anomalysuffix, adlpath, anomalypath):
    
    suffix1 = adlsuffix
    suffix2 = anomalysuffix
    path1 = adlpath
    path2 = anomalypath
    
    subjid = re.findall(r'\d+',suffix2)[0]
    filename1 = path1 + suffix1
    lookup = pd.read_csv(filename1)
    
    filename2 = path2 + suffix2
    anomaly = pd.read_csv(filename2,header=None)
    anomaly.columns = ['obsnum1','obsnum2','etime1','etime2',
                   'Start_Time_min','Start_End_min']
    useful = anomaly[['Start_Time_min','Start_End_min']]

    together = pd.concat([lookup,useful])
    together.sort_values(['Start_Time_min'], ascending=[1],inplace=True)

    minimum = np.min(lookup.Start_Time_min)
    maximum = np.max(lookup.Start_End_min)
    filtering = (together.Start_Time_min>=minimum)&(together.Start_End_min<=maximum)
    together = together[filtering]

    together.reset_index(inplace=True)
    del together['index']

    ######################################
    nrow = together.shape[0]
    newid = np.zeros(nrow)
    nalist = pd.isnull(together['Label'])
    temp = 0

    for i in range(nrow):
        if nalist[i] == False:
            tempend = together.iloc[i, 1]
            temp += 1
            newid[i] = temp 
        else:
            if together.iloc[i,2]>tempend:
                temp += 1
                newid[i] = temp
            else:
                newid[i] = temp

    together['matchid'] = newid

    summary = pd.DataFrame(together.groupby('matchid')['Start_Time_min'].count())
    summary.reset_index(inplace=True)
    summary.columns = ['matchid','freq']

    combine2 = pd.merge(together, summary, "left", 'matchid')
    combine2.dropna(inplace=True)
    combine2['anomaly_freq'] = combine2['freq'] - 1
    combine2['minutes'] = combine2['Start_End_min'] - combine2['Start_Time_min']
    combine3 = combine2[['Label','minutes','anomaly_freq']]
    combine3['subject'] = subjid
    return combine3
    


### Create the processed anomaly count data

In [49]:
i=0
for part1, part2 in zip(adlsuffix, anomalysuffix):
    
    tempdata = anomaly_count(part1, part2, adlpath, anomalypath)
    if i == 0:
        anomaly = tempdata
        i += 1
    else:
        anomaly = pd.concat([anomaly, tempdata])
        i += 1

        
anomaly.reset_index(inplace=True)
del anomaly['index']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
print(anomaly.shape)
print(np.unique(anomaly['Label']))
anomaly.head()

(340, 4)
['Cleaning up' 'Setting dinner' 'bicycle' 'carrying the box' 'drinking'
 'laying' 'rest' 'rowing' 'sync' 'typing' 'walk']


Unnamed: 0,Label,minutes,anomaly_freq,subject
0,sync,0.18125,0,15
1,rowing,3.0004,0,15
2,carrying the box,0.325333,0,15
3,carrying the box,0.177383,0,15
4,bicycle,3.000567,0,15


### Data manipulation

- remove sync
- create log(min)
- (already ordered by subject and time)

In [73]:
anomaly1 = anomaly[anomaly['Label']!='sync']
anomaly1['logmin'] = np.log(anomaly1['minutes'])
anomaly1['index'] = anomaly1.index
#anomaly1.sort_values(['subject'], ascending=[1], inplace=True)
print(anomaly1.shape)
anomaly1.head()

(320, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Label,minutes,anomaly_freq,subject,logmin,index
1,rowing,3.0004,0,15,1.098746,1
2,carrying the box,0.325333,0,15,-1.122906,2
3,carrying the box,0.177383,0,15,-1.729444,3
4,bicycle,3.000567,0,15,1.098801,4
5,drinking,0.2569,0,15,-1.359068,5


In [45]:
anomaly.groupby(['subject','Label'])['anomaly_freq','minutes'].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,anomaly_freq,minutes
subject,Label,Unnamed: 2_level_1,Unnamed: 3_level_1
15,Cleaning up,1,1.765467
15,Setting dinner,1,2.873184
15,bicycle,0,6.000967
15,carrying the box,1,1.543266
15,drinking,0,1.118866
15,laying,0,4.007166
15,rest,0,6.000933
15,rowing,0,6.000834
15,sync,3,0.392600
15,typing,0,5.538967


##### GEE  with log(second) as offset.

In [69]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

fam = sm.families.Poisson()
ind = sm.cov_struct.Exchangeable()


mod = smf.gee("anomaly_freq ~ Label", groups="subject", 
              data=anomaly1,  cov_struct=ind, family=fam,
              offset = "logmin")
res = mod.fit()

print(res.summary())

                               GEE Regression Results                              
Dep. Variable:                anomaly_freq   No. Observations:                  320
Model:                                 GEE   No. clusters:                       10
Method:                        Generalized   Min. cluster size:                  32
                      Estimating Equations   Max. cluster size:                  32
Family:                            Poisson   Mean cluster size:                32.0
Dependence structure:         Exchangeable   Num. iterations:                    15
Date:                     Wed, 05 Sep 2018   Scale:                           1.000
Covariance type:                    robust   Time:                         18:54:57
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                     1.7857      0.525      3.3

### generate the confidence interval for # of anomalies in each context

In [71]:
covmat = res.cov_robust
coef = res.params
#np.diag(covmat)

contrast = np.array([1,0,0,0,0,0,0,0,0,0,
                     1,1,0,0,0,0,0,0,0,0,
                     1,0,1,0,0,0,0,0,0,0,
                     1,0,0,1,0,0,0,0,0,0,
                     1,0,0,0,1,0,0,0,0,0,
                     1,0,0,0,0,1,0,0,0,0,
                     1,0,0,0,0,0,1,0,0,0,
                     1,0,0,0,0,0,0,1,0,0,
                     1,0,0,0,0,0,0,0,1,0,
                     1,0,0,0,0,0,0,0,0,1]).reshape(10,10)

meanvar = np.diag(np.dot(np.dot(contrast, covmat),\
                         contrast.T))

 
from scipy.stats import norm
z = norm.ppf(1 - 0.025)

pairwise = pd.DataFrame({'context':coef.index,
              'lower':contrast.dot(coef)-z*np.sqrt(meanvar),
              'upper':contrast.dot(coef)+z*np.sqrt(meanvar)})
pairwise['expmean'] = np.exp(contrast.dot(coef))
pairwise['explower'] = np.exp(pairwise['lower'])
pairwise['expupper'] = np.exp(pairwise['upper'])
pairwise.iloc[:,[0,3,4,5]]

Unnamed: 0,device,expmean,explower,expupper
0,Intercept,5.963893,2.129499,16.702524
1,Label[T.Setting dinner],7.026521,2.291389,21.546755
2,Label[T.bicycle],5.356532,1.685177,17.026362
3,Label[T.carrying the box],6.838175,2.092089,22.351168
4,Label[T.drinking],4.370165,1.079352,17.694272
5,Label[T.laying],2.285786,0.789725,6.615995
6,Label[T.rest],1.589391,0.639153,3.952361
7,Label[T.rowing],6.014072,1.932042,18.720644
8,Label[T.typing],3.686478,1.148581,11.832088
9,Label[T.walk],7.017123,3.214666,15.317305


### Subject-specific modeling

In [99]:
fam1 = sm.families.Poisson()
ind1 = sm.cov_struct.Independence()

sub1 = anomaly1[anomaly1['subject']=='454']
sub1['logsec'] = np.log(sub1['minutes'] * 60)
mod1 = smf.gee("anomaly_freq ~ Label", groups="index", 
              data=sub1,  cov_struct=ind, family=fam,
              offset = "logmin")
res1 = mod1.fit()

print(res1.summary())

                               GEE Regression Results                              
Dep. Variable:                anomaly_freq   No. Observations:                   32
Model:                                 GEE   No. clusters:                       32
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                   1
Family:                            Poisson   Mean cluster size:                 1.0
Dependence structure:         Exchangeable   Num. iterations:                    17
Date:                     Wed, 05 Sep 2018   Scale:                           1.000
Covariance type:                    robust   Time:                         19:13:44
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                     1.0301      0.606      1.7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [100]:
covmat1 = res1.cov_robust
coef1 = res1.params
#np.diag(covmat)

contrast1 = np.array([1,0,0,0,0,0,0,0,0,0,
                     1,1,0,0,0,0,0,0,0,0,
                     1,0,1,0,0,0,0,0,0,0,
                     1,0,0,1,0,0,0,0,0,0,
                     1,0,0,0,1,0,0,0,0,0,
                     1,0,0,0,0,1,0,0,0,0,
                     1,0,0,0,0,0,1,0,0,0,
                     1,0,0,0,0,0,0,1,0,0,
                     1,0,0,0,0,0,0,0,1,0,
                     1,0,0,0,0,0,0,0,0,1]).reshape(10,10)

meanvar1 = np.diag(np.dot(np.dot(contrast1, covmat1),\
                         contrast1.T))

 
from scipy.stats import norm
z = norm.ppf(1 - 0.025)

pairwise1 = pd.DataFrame({'context':coef1.index,
              'lower':contrast1.dot(coef1)-z*np.sqrt(meanvar1),
              'upper':contrast1.dot(coef1)+z*np.sqrt(meanvar1)})
pairwise1['expmean'] = np.exp(contrast1.dot(coef1))
pairwise1['explower'] = np.exp(pairwise1['lower'])
pairwise1['expupper'] = np.exp(pairwise1['upper'])
pairwise1.iloc[:,[0,3,4,5]]

Unnamed: 0,context,expmean,explower,expupper
0,Intercept,2.801338,0.8539851,9.189261
1,Label[T.Setting dinner],0.5114606,0.0875178,2.989014
2,Label[T.bicycle],0.333276,0.3332631,0.3332888
3,Label[T.carrying the box],3.506262,1.350084,9.106007
4,Label[T.drinking],1.356239,0.2227953,8.255941
5,Label[T.laying],4.139938e-08,1.035389e-08,1.655328e-07
6,Label[T.rest],0.3329496,0.3324928,0.3334069
7,Label[T.rowing],1.828929,0.5903737,5.665869
8,Label[T.typing],0.4818774,0.1130235,2.054492
9,Label[T.walk],4.99222,1.504868,16.5611
