# Cobalt Data Anomalies

## Appointment Anomalies

### Duplicate Appointments
* multiple appointments with the same start_time, account_id, and provider_id but unique appointment ids

In [None]:
# Per-account difference between consecutive appointment start times.
# Rows are ordered by (account_id, start_time) before diff(), so each value is
# the gap to that account's preceding appointment; despite the column name it
# is a difference from the previous row, and the first row per account is NaT.
appointment_start_time = (
    appointment
    .sort_values(['account_id', 'start_time'])
    .groupby(['account_id'])[['start_time']]
    .diff()
    .rename(columns={'start_time': 'next_apt_time'})
)

# Attach the gap column back onto the full appointment table by row index.
appointment_diff = appointment.merge(
    appointment_start_time,
    left_index=True,
    right_index=True,
    how='inner',
)

In [None]:
# Completed appointments that share (account_id, start_time) with at least one
# other completed appointment.
# NOTE(review): the section text describes duplicates as also sharing
# provider_id, but the grouping key here is only (account_id, start_time) --
# confirm which definition is intended.
dup_appt = (
    appointment_completed
    .groupby(['account_id', 'start_time'])
    .filter(lambda grp: len(grp) > 1)
)

# One row per distinct account that has at least one such duplicate.
dup_appt_acct_ids = pd.DataFrame({'account_id': dup_appt['account_id'].unique()})

In [None]:
# Export the duplicate appointments and the affected account ids, each file
# suffixed with today's date as YYYYMMDD.
dup_appt.to_csv(
    BACKEND_ISSUES_PATH + 'dup_appt_' + datetime.datetime.now().strftime('%Y%m%d') + '.csv'
)
dup_appt_acct_ids.to_csv(
    BACKEND_ISSUES_PATH + 'dup_appt_acct_ids_' + datetime.datetime.now().strftime('%Y%m%d') + '.csv'
)

In [None]:
# Row count, then distinct-account count, then a two-row preview.
print(dup_appt.shape[0])
print(len(dup_appt['account_id'].unique()))
dup_appt.head(2)

In [None]:
# Sanity check: the two counts should match, since the table was built from
# unique account ids.
print(dup_appt_acct_ids.shape[0])
print(len(dup_appt_acct_ids['account_id'].unique()))
dup_appt_acct_ids.head(2)

### Created Time > Start Time
* Removing negative times should resolve **all** issues, including dup appts mentioned above

In [None]:
# Appointments whose created_completed_time equals 0.
temp = appointment.loc[appointment['created_completed_time'] == 0]
print(temp.shape[0])

# How many of those have start_time earlier than created (negative lead time).
print(sum(
    1
    for lead in (temp['start_time'] - temp['created'])
    if lead < np.timedelta64(0, 'D')
))

# Display the full start_time - created series for inspection.
temp['start_time'] - temp['created']

In [None]:
# Completed appointments that were created after their own start time.
created_start = appointment_completed.loc[
    appointment_completed['start_time'] < appointment_completed['created']
]

# One row per distinct affected account.
created_start_acct_ids = pd.DataFrame({'account_id': created_start['account_id'].unique()})

In [None]:
# Export the created-after-start appointments and the affected account ids,
# each file suffixed with today's date as YYYYMMDD.
created_start.to_csv(
    BACKEND_ISSUES_PATH + 'created_start_' + datetime.datetime.now().strftime('%Y%m%d') + '.csv'
)
created_start_acct_ids.to_csv(
    BACKEND_ISSUES_PATH + 'created_start_acct_ids_' + datetime.datetime.now().strftime('%Y%m%d') + '.csv'
)

In [None]:
# Row count, then distinct-account count, then a two-row preview.
print(created_start.shape[0])
print(len(created_start['account_id'].unique()))
created_start.head(2)

In [None]:
# Sanity check: the two counts should match, since the table was built from
# unique account ids.
print(created_start_acct_ids.shape[0])
print(len(created_start_acct_ids['account_id'].unique()))
created_start_acct_ids.head(2)

## Assessment complete_flag reliability: False vs True

### Sample account with 6 sessions
* Notice the multiple GAD7 sessions

In [None]:
# Every session belonging to the sample account, oldest first
account_instance_sessions.loc[
    account_instance_sessions['account_id'].eq('REDACTED')
].sort_values(by=['created'])

### First GAD7 entry (idx 21671)
* This session occurs **first** chronologically
* This session has a **complete_flag=True**
* This session is associated with **zero** answers

In [None]:
# Session row for the first GAD7 attempt
account_instance_sessions.loc[
    account_instance_sessions['account_session_id'].eq('REDACTED')
]

In [None]:
# Answer rows recorded against the first GAD7 attempt
account_instance_session_answers.loc[
    account_instance_session_answers['account_session_id'].eq('REDACTED')
]

### Second GAD7 entry (idx 15342)
* This session occurs **second** chronologically
* This session has a **complete_flag=False**
* This session is associated with **all** answers

In [None]:
# Session row for the second GAD7 attempt
account_instance_sessions.loc[
    account_instance_sessions['account_session_id'].eq('REDACTED')
]

In [None]:
# Answer rows recorded against the second GAD7 attempt
account_instance_session_answers.loc[
    account_instance_session_answers['account_session_id'].eq('REDACTED')
]

### Conclusions/Questions:
* Question 1: Can complete_flag be trusted?
* Question 2: If both GAD7 assessments had ALL questions answered, which score to accept?
    * This occurs *6 times* throughout the data (i.e. a participant has multiple complete assessments *within the same instance*)

## Assessment complete_flag reliability: Status of "false" sessions

In [None]:
# Total number of sessions, then how they split by complete_flag value.
print(account_session.shape[0])
account_session['complete_flag'].value_counts()

### "False" Sessions: 
* complete_flag=False 
* assessments are one of PHQ4, PHQ9, GAD7, PCPTSD

In [None]:
# Sessions with complete_flag == False for the four assessments of interest.
instance_session_ids = [PHQ4_id, PHQ9_id, GAD7_id, PCPTSD_id]

# Filter first and copy the (smaller) result. The previous version copied the
# whole account_session frame and then immediately overwrote that copy with a
# slice of the original, so the copy had no effect.
account_session_false = account_session[account_session['complete_flag'] == False]
account_session_false = account_session_false[
    account_session_false['assessment_id'].isin(instance_session_ids)
].copy()

# Session ids used to select the matching answer rows.
account_session_false_ids = account_session_false.account_session_id

In [None]:
# Number of complete_flag=False sessions, then a two-row preview.
print(account_session_false.shape[0])
account_session_false.head(2)

In [None]:
# Answer rows whose session id is among the complete_flag=False session ids.
account_session_answer_false = account_session_answer.copy()
account_session_answer_false = account_session_answer_false.loc[
    account_session_answer_false['account_session_id'].isin(account_session_false_ids)
]

In [None]:
# Answer rows in "false" sessions, then the number of distinct sessions they span.
print(account_session_answer_false.shape[0])
print(len(account_session_answer_false['account_session_id'].unique()))

In [None]:
# Row count and a two-row preview of the answers in "false" sessions.
print(account_session_answer_false.shape[0])
account_session_answer_false.head(2)

### PHQ4
* Number of *session answers* with complete_flag=False
    * 180 sessions answered all 4 questions

In [None]:
# PHQ4 answers from complete_flag=False sessions.
PHQ4_answers_false = account_session_answer_false.loc[
    account_session_answer_false['assessment_name'].eq('PHQ4')
]

# Answers per session, then how many sessions have each answer count.
(
    PHQ4_answers_false
    .groupby(['account_session_id'])['account_session_answer_id']
    .count()
    .value_counts()
)

In [None]:
# Cross-check: start from the session table instead of the answer table.
# Select the PHQ4 "false" session ids, then count answers per session.
tempp_idx = account_session_false.loc[
    account_session_false['assessment_name'].eq('PHQ4')
].account_session_id
(
    account_session_answer_false
    .loc[account_session_answer_false['account_session_id'].isin(tempp_idx)]
    .groupby(['account_session_id'])['account_session_answer_id']
    .count()
    .value_counts()
)

### PHQ9
* Number of *session answers* with complete_flag=False
    * 108 sessions answered all 7 questions

In [None]:
# PHQ9 answers from complete_flag=False sessions.
PHQ9_answers_false = account_session_answer_false.loc[
    account_session_answer_false['assessment_name'].eq('PHQ9')
]

# Answers per session, then how many sessions have each answer count.
(
    PHQ9_answers_false
    .groupby(['account_session_id'])['account_session_answer_id']
    .count()
    .value_counts()
)

### GAD7
* Number of *session answers* with complete_flag=False
    * 47 sessions answered all 5 questions

In [None]:
# GAD7 answers from complete_flag=False sessions.
GAD7_answers_false = account_session_answer_false.loc[
    account_session_answer_false['assessment_name'].eq('GAD7')
]

# Answers per session, then how many sessions have each answer count.
(
    GAD7_answers_false
    .groupby(['account_session_id'])['account_session_answer_id']
    .count()
    .value_counts()
)

### PCPTSD
* Number of *session answers* with complete_flag=False
    * 5 sessions answered all 5 questions

In [None]:
# PCPTSD answers from complete_flag=False sessions.
PCPTSD_answers_false = account_session_answer_false.loc[
    account_session_answer_false['assessment_name'].eq('PCPTSD')
]

# Answers per session, then how many sessions have each answer count.
(
    PCPTSD_answers_false
    .groupby(['account_session_id'])['account_session_answer_id']
    .count()
    .value_counts()
)

### Conclusion
* 180 + 108 + 47 + 5 = **340** potentially uncounted completed assessments
* Potentially uncounted complete instances/escalations:
    * *5 or less* possible escalations for participants that *completed the entire survey chain through the PCPTSD*
    * *0 or more* possible escalations for participants that *completed the PHQ4 with a score < 3*

In [None]:
# Count PHQ4 sessions that the raw complete_flag marks as complete.
# Each comparison is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form evaluated as `(is_phq4 & complete_flag) == True`, which
# only gives the intended answer when complete_flag is a clean boolean column.
print(
    'Complete PHQ4 assessments according to complete_flag:',
    len(account_session[
        (account_session['assessment_name'] == 'PHQ4')
        & (account_session['complete_flag'] == True)
    ]),
)
print()

# Same population in the derived table: column sums of the raw flag and of the
# derived outcome_complete indicator for PHQ4 rows.
print(
    'Comparison with anomaly-aware processing:\n',
    account_instance_session_update[
        account_instance_session_update['assessment_name'] == 'PHQ4'
    ][['complete_flag', 'outcome_complete']].sum(),
)

In [None]:
# Count PHQ4 sessions that the raw complete_flag marks as incomplete.
print(
    'Incomplete PHQ4 assessments according to complete_flag:',
    len(account_session.loc[
        (account_session['assessment_name'] == 'PHQ4')
        & (account_session['complete_flag'] == False)
    ]),
)
print()

# Same question on the derived table, once by the raw flag and once by the
# derived outcome_complete indicator.
print(
    'Comparison with anomaly-aware processing:\ncomplete_flag:',
    account_instance_session_update.loc[
        (account_instance_session_update['assessment_name'] == 'PHQ4')
        & (account_instance_session_update['complete_flag'] == False)
    ].complete_flag.count(),
    '\noutcome_complete:',
    account_instance_session_update.loc[
        (account_instance_session_update['assessment_name'] == 'PHQ4')
        & (account_instance_session_update['outcome_complete'] == 0)
    ].outcome_complete.count(),
)

## Cobalt Meeting 11/12/2021: Assessment and complete_flag Anomalies

### Attempted PHQ4 Assessments: Distribution of Questions Answered
* The vast majority of incomplete PHQ4 assessments have 0 answers, so clicking "skip" is likely the cause

In [None]:
# Distribution of the number of questions answered across PHQ4 sessions.
account_instance_session_update.loc[
    account_instance_session_update['assessment_name'].eq('PHQ4'),
    'num_questions_answered',
].value_counts()

### Completed PHQ4 Assessments: Distribution of complete_flag Values
* Note: completed = all distinct questions answered, regardless of complete_flag status

#### Using native database tables

In [None]:
# PHQ4 sessions from the native session table, plus their session ids.
account_session_temp = account_session.loc[account_session['assessment_id'].eq(PHQ4_id)]
temp_session_ids = account_session_temp['account_session_id']

In [None]:
# Build one row per fully-answered PHQ4 session from the native tables.
# Step 1: keep only answer rows that belong to the PHQ4 session ids.
account_session_answer_temp = account_session_answer[account_session_answer['account_session_id'].isin(temp_session_ids)]
# Step 2: keep sessions that have exactly 4 answer rows.
# NOTE(review): this counts rows, not distinct questions -- confirm a session
# cannot hold repeated answers to the same question.
account_session_answer_temp = account_session_answer_temp.groupby(['account_session_id']).filter(lambda x: len(x)==4)
# Step 3: collapse to one row per session by averaging the columns; the result
# is indexed by account_session_id.
# NOTE(review): recent pandas versions raise on non-numeric columns here unless
# numeric_only=True is passed -- confirm against the pandas version in use, and
# that complete_flag (read in a later cell) survives this aggregation.
account_session_answer_temp = account_session_answer_temp.groupby(['account_session_id']).mean()

# Step 4: attach each session's created timestamp, matching the left index
# (account_session_id) to the right-hand account_session_id column.
account_session_merged_temp = account_session_answer_temp.merge(account_session_temp[['account_session_id','created']], 
                                                                left_index=True, 
                                                                right_on='account_session_id', 
                                                                how='inner')

In [None]:
# complete_flag distribution among the fully-answered PHQ4 sessions.
account_session_merged_temp['complete_flag'].value_counts()

In [None]:
# Creation dates (most recent first) of fully-answered PHQ4 sessions whose
# complete_flag is False.
account_session_merged_temp.loc[
    account_session_merged_temp['complete_flag'] == False,
    'created',
].dt.date.sort_values(ascending=False)

#### Using derived instance tables
* Identical result to above

In [None]:
# complete_flag distribution among PHQ4 sessions that the derived table marks
# as complete (outcome_complete == 1).
account_instance_session_update.loc[
    account_instance_session_update['assessment_name'].eq('PHQ4')
    & account_instance_session_update['outcome_complete'].eq(1),
    'complete_flag',
].value_counts()

In [None]:
# Creation dates (most recent first) of PHQ4 sessions that the derived table
# marks as complete while the raw complete_flag is False.
account_instance_session_update.loc[
    (account_instance_session_update['assessment_name'] == 'PHQ4')
    & (account_instance_session_update['outcome_complete'] == 1)
    & (account_instance_session_update['complete_flag'] == False),
    'created',
].dt.date.sort_values(ascending=False)

### Accounts with >1 completed assessment within the same instance
* I forgot to mention this in the call
* The sample account below has:
    * 21 session attempts within about 6-7 minutes
    * 2 completed PHQ9 assessments 
    * 2 completed GAD7 assessments
* This type of event occurs **only 6 times** in the entire database 
* Given the rarity and complexity of these events, my thought is to just exclude these accounts/sessions

In [None]:
# All sessions for the sample account, in chronological order of creation.
account_session.loc[
    account_session['account_id'].eq('REDACTED')
].sort_values(by='created')

# Unused / Deprecated Code