# REQUIREMENTS

Remove data for questions participants should not have received (based on improper branching logic)

Data for child questions should only exist for participants who should have received those questions under the skip and branching logic of the PPI survey. For each question, drop the rows belonging to the PIDs identified as having received it in error.

## Set up

In [None]:
library(plyr) 
library(dplyr)
library(tidyr)
library(reticulate)
library(crayon)


In [None]:
# Handle to the Python pandas module (via reticulate); its read_gbq() runs the
# BigQuery SQL in the cells below.
pd <- reticulate::import("pandas")

In [None]:
# Project identifier handed to pd$read_gbq() in every query cell.
# Blank in this copy - presumably filled in before the notebook is run.
PROJECT_ID <- ""

## Load the PIDs with data to be removed 

These files list the person IDs (PIDs), identified by the team, of participants who received child questions that they should not have received based on their answers to the parent questions.

In [None]:
# Read the flagged person_id lists, one CSV file per survey module.
# NOTE(review): later cells read these data frames column by column
# (presumably one column per child question) - confirm the file layout.
pids_rem_lifestyle <- read.csv(file = "AC67_rem_lifestyle.csv")     # lifestyle questions
pids_rem_overallh <- read.csv(file = "AC67_rem_overallh.csv")       # overall health questions
pids_rem_personalmed <- read.csv(file = "AC67_rem_personalmed.csv") # personal medical history questions
pids_rem_hcau <- read.csv(file = "AC67_rem_hcau.csv")               # HCAU insurance questions


# ```HCAU``` QUESTIONS
The SQL query below will delete from the observation table all the records for the person_ids in ```pids_remv_hcau``` related to these child HCAU questions 


In [None]:
# Pool the person_ids from the first two columns of the HCAU list, dropping
# duplicates and NA entries.
# NOTE(review): pooling means every listed pid is deleted for every concept in
# the HCAU DELETE statement; if each CSV column belongs to one specific
# question this removes more than intended - confirm.
pids_remv_hcau <- unique(unlist(pids_rem_hcau[, 1:2], use.names = FALSE))
pids_remv_hcau <- pids_remv_hcau[!is.na(pids_remv_hcau)]
# Collapse to ONE comma-separated string so sprintf() fills `IN (%s)` exactly
# once; a multi-element vector makes sprintf() return one SQL string per pid.
# format(scientific = FALSE) keeps ids as plain digits (never e.g. 1e+06).
pids_remv_hcau <- paste(
  format(pids_remv_hcau, scientific = FALSE, trim = TRUE),
  collapse = ", "
)

In [None]:
# Look up the observation_source_concept_id, name and code of the HCAU
# insurance-type concepts; used to find the ids for the DELETE statement.
# NOTE(review): the `{}` dataset placeholders are sent to BigQuery as written -
# nothing in this cell fills them in; confirm how the dataset is supplied.
hcau_concept_lookup_sql <- " 

SELECT DISTINCT observation_source_concept_id, concept_name, concept_code

FROM `{}.observation` o
INNER JOIN `{}.concept` c ON o.observation_source_concept_id = c.concept_id
WHERE concept_code LIKE '%Insurance_InsuranceType%' #hcau
OR concept_code LIKE 'HealthInsurance_InsuranceTypeUpdate'#hcau
"
pd$read_gbq(hcau_concept_lookup_sql, project_id = PROJECT_ID, dialect = "standard")


In [None]:
## Delete the observation rows of the flagged pids for the two HCAU concepts.
# sprintf() substitutes pids_remv_hcau into the `IN (%s)` clause.
# NOTE(review): the `{}` dataset placeholder is sent to BigQuery as written -
# nothing in this cell fills it in; confirm how the dataset is supplied.
hcau_delete_sql <- sprintf(" 

DELETE 
 FROM `{}.observation` 
 WHERE person_id IN (%s)
    AND (observation_source_concept_id = 43528428 #hcau
    OR observation_source_concept_id = 1384450) #hcau
#ORDER BY concept_code
", pids_remv_hcau)
pd$read_gbq(hcau_delete_sql, project_id = PROJECT_ID, dialect = "standard")


# ```PERSONAL MEDICAL HISTORY``` QUESTIONS

In [None]:
# Pool the person_ids from the first three columns of the personal medical
# history list, dropping duplicates and NA entries.
# NOTE(review): pooling means every listed pid is deleted for every concept in
# the personal-medical-history DELETE statement; if each CSV column belongs to
# one specific question this removes more than intended - confirm.
pids_remv_personalmed <- unique(
  unlist(pids_rem_personalmed[, 1:3], use.names = FALSE)
)
pids_remv_personalmed <- pids_remv_personalmed[!is.na(pids_remv_personalmed)]
# Collapse to ONE comma-separated string so sprintf() fills `IN (%s)` exactly
# once; a multi-element vector makes sprintf() return one SQL string per pid.
# format(scientific = FALSE) keeps ids as plain digits (never e.g. 1e+06).
pids_remv_personalmed <- paste(
  format(pids_remv_personalmed, scientific = FALSE, trim = TRUE),
  collapse = ", "
)

In [None]:
# Look up the observation_source_concept_id, name and code of the dementia
# child questions; used to find the ids for the DELETE statement.
# NOTE(review): the `{}` dataset placeholders are sent to BigQuery as written -
# nothing in this cell fills them in; confirm how the dataset is supplied.
personalmed_concept_lookup_sql <- " 

SELECT DISTINCT observation_source_concept_id, concept_name, concept_code

FROM `{}.observation` o
INNER JOIN `{}.concept` c ON o.observation_source_concept_id = c.concept_id
WHERE concept_code LIKE 'NervousSystem_DementiaCurrently'#personal medical hist
OR concept_code LIKE 'NervousSystem_HowOldWereYouDementia'#personal medical hist  ### COULD NOT FIND QUESTION IN CDR
OR concept_code LIKE 'NervousSystem_RxMedsforDementia'#personal medical hist
"
pd$read_gbq(personalmed_concept_lookup_sql, project_id = PROJECT_ID, dialect = "standard")


In [None]:
## Delete the observation rows of the flagged pids for the dementia concepts.
# Only two concept ids are filtered; the third question is commented out in
# the SQL because it could not be found in the CDR.
# sprintf() substitutes pids_remv_personalmed into the `IN (%s)` clause.
# NOTE(review): the `{}` dataset placeholder is sent to BigQuery as written -
# nothing in this cell fills it in; confirm how the dataset is supplied.
personalmed_delete_sql <- sprintf(" 

DELETE 
 FROM `{}.observation` 
 WHERE person_id IN (%s)
 AND (observation_source_concept_id = 43530367 #NervousSystem_DementiaCurrently
     # OR observation_source_concept_id = ??? #NervousSystem_HowOldWereYouDementia ### COULD NOT FIND THIS QUESTION IN CDR
      OR observation_source_concept_id = 43528852) #NervousSystem_RxMedsforDementia
", pids_remv_personalmed)
pd$read_gbq(personalmed_delete_sql, project_id = PROJECT_ID, dialect = "standard")


# REMOVE THESE PIDS FOR THESE ```overall health``` QUESTIONS

In [None]:
# Pool the person_ids from the first four columns of the overall health list,
# dropping duplicates and NA entries.
# NOTE(review): pooling means every listed pid is deleted for every concept in
# the overall-health DELETE statement; if each CSV column belongs to one
# specific question this removes more than intended - confirm.
pids_remv_overallh <- unique(
  unlist(pids_rem_overallh[, 1:4], use.names = FALSE)
)
pids_remv_overallh <- pids_remv_overallh[!is.na(pids_remv_overallh)]
# Collapse to ONE comma-separated string so sprintf() fills `IN (%s)` exactly
# once; a multi-element vector makes sprintf() return one SQL string per pid.
# format(scientific = FALSE) keeps ids as plain digits (never e.g. 1e+06).
pids_remv_overallh <- paste(
  format(pids_remv_overallh, scientific = FALSE, trim = TRUE),
  collapse = ", "
)

In [None]:
# Look up the observation_source_concept_id, name and code of the overall
# health child questions; used to find the ids for the DELETE statement.
# NOTE(review): the `{}` dataset placeholders are sent to BigQuery as written -
# nothing in this cell fills them in; confirm how the dataset is supplied.
overallh_concept_lookup_sql <- " 

SELECT DISTINCT observation_source_concept_id, concept_name, concept_code

FROM `{}.observation` o
INNER JOIN `{}.concept` c ON o.observation_source_concept_id = c.concept_id
WHERE ( concept_code LIKE 'Pregnancy_1PregnancyStatus'#overall health
OR c.concept_code LIKE 'YesNone_MenstrualStoppedReason'#overall health
OR c.concept_code LIKE 'OverallHealth_HysterectomyHistory'#overall health
OR c.concept_code LIKE 'OverallHealth_OvaryRemovalHistory')#overall health
"
pd$read_gbq(overallh_concept_lookup_sql, project_id = PROJECT_ID, dialect = "standard")


In [None]:
## Delete the observation rows of the flagged pids for the four overall
## health concepts.
# sprintf() substitutes pids_remv_overallh into the `IN (%s)` clause.
# NOTE(review): the `{}` dataset placeholder is sent to BigQuery as written -
# nothing in this cell fills it in; confirm how the dataset is supplied.
overallh_delete_sql <- sprintf(" 

DELETE 
 FROM `{}.observation` 
 WHERE person_id IN (%s)
 AND (observation_source_concept_id = 1585811 # Pregnancy_1PregnancyStatus
    OR observation_source_concept_id = 1585789 #YesNone_MenstrualStoppedReason
    OR observation_source_concept_id = 1585791 #OverallHealth_HysterectomyHistory
    OR observation_source_concept_id = 1585796) #OverallHealth_OvaryRemovalHistory
", pids_remv_overallh)
pd$read_gbq(overallh_delete_sql, project_id = PROJECT_ID, dialect = "standard")


# ```LIFESTYLE``` QUESTIONS

In [None]:
# Pool the person_ids from the first seven columns of the lifestyle list,
# dropping duplicates and NA entries.
# NOTE(review): pooling means every listed pid is deleted for every concept in
# the lifestyle DELETE statement; if each CSV column belongs to one specific
# question this removes more than intended - confirm.
pids_remv_lifestyle <- unique(
  unlist(pids_rem_lifestyle[, 1:7], use.names = FALSE)
)
pids_remv_lifestyle <- pids_remv_lifestyle[!is.na(pids_remv_lifestyle)]
# Collapse to ONE comma-separated string so sprintf() fills `IN (%s)` exactly
# once; a multi-element vector makes sprintf() return one SQL string per pid.
# format(scientific = FALSE) keeps ids as plain digits (never e.g. 1e+06).
pids_remv_lifestyle <- paste(
  format(pids_remv_lifestyle, scientific = FALSE, trim = TRUE),
  collapse = ", "
)

In [None]:
# Look up the observation_source_concept_id, name and code of the smoking and
# alcohol child questions; used to find the ids for the DELETE statement.
# NOTE(review): the `{}` dataset placeholders are sent to BigQuery as written -
# nothing in this cell fills them in; confirm how the dataset is supplied.
lifestyle_concept_lookup_sql <- " 

SELECT DISTINCT observation_source_concept_id, concept_name, concept_code

FROM `{}.observation` o
INNER JOIN `{}.concept` c ON o.observation_source_concept_id = c.concept_id
WHERE 
(concept_code LIKE 'Smoking_DailySmokeStartingAge'#Lifestyle
OR concept_code LIKE 'AttemptQuitSmoking_CompletelyQuitAge'#Lifestyle
OR concept_code LIKE 'Smoking_NumberOfYears'#Lifestyle
OR concept_code LIKE 'Smoking_CurrentDailyCigaretteNumber'#Lifestyle
OR concept_code LIKE 'Smoking_AverageDailyCigaretteNumber'#Lifestyle
OR concept_code LIKE 'Alcohol_AverageDailyDrinkCount'#Lifestyle
OR concept_code LIKE 'Alcohol_6orMoreDrinksOccurence')#Lifestyle
"
pd$read_gbq(lifestyle_concept_lookup_sql, project_id = PROJECT_ID, dialect = "standard")


In [None]:
## Delete the observation rows of the flagged pids for the seven lifestyle
## (smoking / alcohol) concepts.
# sprintf() substitutes pids_remv_lifestyle into the `IN (%s)` clause.
# NOTE(review): the `{}` dataset placeholder is sent to BigQuery as written -
# nothing in this cell fills it in; confirm how the dataset is supplied.
lifestyle_delete_sql <- sprintf(" 

DELETE 
 FROM `{}.observation` 
 WHERE person_id IN (%s)
 AND (observation_source_concept_id = 1585864 #Smoking_DailySmokeStartingAge
        OR observation_source_concept_id = 1585870 #AttemptQuitSmoking_CompletelyQuitAge
        OR observation_source_concept_id = 1585873 #Smoking_NumberOfYears'#Lifestyle
        OR observation_source_concept_id = 1586159 #Smoking_CurrentDailyCigaretteNumber
        OR observation_source_concept_id = 1586162 #Smoking_AverageDailyCigaretteNumber
        OR observation_source_concept_id = 1586207 #Alcohol_AverageDailyDrinkCount
        OR observation_source_concept_id = 1586213) #Alcohol_6orMoreDrinksOccurence
", pids_remv_lifestyle)
pd$read_gbq(lifestyle_delete_sql, project_id = PROJECT_ID, dialect = "standard")
 