In [None]:
import pandas as pd
import numpy as np
import math

# Use the fully-qualified option key: the short form 'max_rows' is ambiguous on
# recent pandas releases (it also matches 'styler.render.max_rows').
pd.set_option('display.max_rows', 99999)

import os
os.chdir('/content/drive/MyDrive/Master Thesis')

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

import random
import math

import scipy.stats as st

In [None]:
#Editing the Subsequent Smear column to flag true negative results in the dataset
#Criterion 1: Negative result with 3 or more follow-ups with negative results
#Criterion 2: Negative result with 2 follow-ups with negative results and a negative HPV test result

def edit_subs(df):
  """Normalise the 'subsequent_smear' text and flag confirmed negatives.

  Rows whose 'subsequent_smear' is 'No follow up' or 'Not Applicable' are
  dropped. For the remaining rows the follow-up text is compacted (the
  negative spellings collapse to 'Neg', spaces and the letter 'x' are removed),
  so that e.g. 'Negative x 3' becomes 'Neg3'.

  When 'result' is 'Negative', the markers 'Neg3'..'Neg6' are replaced by
  'select'; 'Neg2' is replaced as well when 'HPV_test' is 'Yes - NEG'.

  Parameters:
    df: DataFrame with 'subsequent_smear', 'result' and 'HPV_test' columns.

  Returns:
    A new DataFrame (the input is not modified) holding the kept rows with the
    rewritten 'subsequent_smear' column.
  """
  smear = df['subsequent_smear']
  # Combine both conditions into one mask instead of chaining df[...][...],
  # and copy so the column assignment below never writes into a view of df.
  has_follow_up = (smear != 'No follow up') & (smear != 'Not Applicable')
  df_1 = df[has_follow_up].copy()

  # Order matters: longer spellings must be collapsed before shorter ones.
  normalisation = (('Negatives', 'Neg'), ('Negative', 'Neg'), ('Negs', 'Neg'),
                   ('neg', 'Neg'), (' ', ''), ('x', ''))
  confirmed_markers = ('Neg3', 'Neg4', 'Neg5', 'Neg6')

  subs_list = []
  for _, val in df_1.iterrows():
    subs = str(val['subsequent_smear'])
    for old, new in normalisation:
      subs = subs.replace(old, new)

    if val['result'] == 'Negative':
      for marker in confirmed_markers:
        subs = subs.replace(marker, 'select')
      # Two negative follow-ups only count together with a negative HPV test.
      if val['HPV_test'] == 'Yes - NEG':
        subs = subs.replace('Neg2', 'select')

    subs_list.append(subs)

  df_1['subsequent_smear'] = subs_list

  return df_1

In [None]:
#Getting dataset for data that has been verified with biopsy or follow-ups

def get_verified_data(df):
  """Select the verified cases and return them with standardised column names.

  A row is kept when its 'Biopsy Result' is anything other than
  'Not Applicable', 'Undefined' or 'Other', or when its 'subsequent_smear'
  equals 'select'.

  Parameters:
    df: DataFrame with the columns 'case_id', 'result', 'Endocervical',
      'HPV_test', 'Histology', 'Biopsy Result', 'Procedure', 'treat_course'
      and 'subsequent_smear'.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and the columns 'case_id',
    'result', 'endocervical', 'hpv_result', 'histology', 'biopsy_result',
    'procedure', 'treat_course', 'subs_smear' (in that order).
  """
  unverified_biopsy = ['Not Applicable', 'Undefined', 'Other']
  is_verified = (~df['Biopsy Result'].isin(unverified_biopsy)) | (df['subsequent_smear'] == 'select')

  # Source column -> output column, listed in the output order.
  column_names = {
      'case_id': 'case_id',
      'result': 'result',
      'Endocervical': 'endocervical',
      'HPV_test': 'hpv_result',
      'Histology': 'histology',
      'Biopsy Result': 'biopsy_result',
      'Procedure': 'procedure',
      'treat_course': 'treat_course',
      'subsequent_smear': 'subs_smear',
  }

  df_1 = (df.loc[is_verified, list(column_names)]
            .rename(columns = column_names)
            .reset_index(drop = True))

  return df_1

In [None]:
#Getting dataset for data that has not been verified

def get_non_verified_data(df_1, df_2):
  """Pair the 'treat_course' of both datasets by case and encode it as 0/1/2.

  Cases are matched on 'case_id' (only ids present in both inputs are kept).
  The labels 'normal' and 'Negative' are encoded as 0, 'low-grade' as 1 and
  'high-grade' as 2; rows where either result is not one of 0, 1, 2 after
  encoding are dropped.

  If a 'case_id' occurs more than once in an input, only its first row is used.

  Parameters:
    df_1: DataFrame with 'case_id' and 'treat_course'; becomes 'TIS_result'.
    df_2: DataFrame with 'case_id' and 'treat_course'; becomes 'GEN_result'.

  Returns:
    DataFrame with the columns 'case_id', 'TIS_result', 'GEN_result', where
    both result columns hold integers.
  """
  severity_codes = {'normal': 0, 'Negative': 0, 'low-grade': 1, 'high-grade': 2}
  valid_codes = [0, 1, 2]
  result_columns = ['TIS_result', 'GEN_result']

  def _one_result_per_case(source, column):
    # Keep a single scalar result per case so the pairing is one-to-one.
    return (source[['case_id', 'treat_course']]
            .drop_duplicates('case_id')
            .rename(columns = {'treat_course': column}))

  # Inner merge keeps exactly the case_ids common to both datasets.
  df = _one_result_per_case(df_1, 'TIS_result').merge(
      _one_result_per_case(df_2, 'GEN_result'), on = 'case_id', how = 'inner')

  for column in result_columns:
    # Unknown labels are left untouched here and filtered out just below.
    df[column] = df[column].map(lambda value: severity_codes.get(value, value))

  is_valid = df['TIS_result'].isin(valid_codes) & df['GEN_result'].isin(valid_codes)
  df = df[is_valid].copy()

  for column in result_columns:
    df[column] = df[column].astype(int)

  return df

In [None]:
#Function to get degree of severity, i.e. negative, low-grade, high-grade, for the diagnostic result and the biopsy result

def deg_sev(df):
  """Add severity grades for the test result and for the verified outcome.

  'result_severity' is derived from 'result':
    'Negative' -> 'Negative';
    'High Grade (Mod)', 'High Grade (Sev)', 'Invasive', 'Glandular' -> 'high-grade';
    'Low Grade', 'BNA' -> 'low-grade'.

  'verified_severity' is 'Negative' when 'subs_smear' equals 'select'
  (this takes precedence) or 'biopsy_result' is 'Neg'; otherwise
  'CIN3', 'CIN2', 'CGIN', 'Invasive', 'AdenoCa' -> 'high-grade' and
  'CIN1' -> 'low-grade'.

  Parameters:
    df: DataFrame with 'result', 'biopsy_result' and 'subs_smear' columns.
      The two new columns are added to this frame in place.

  Returns:
    The same DataFrame, with 'result_severity' and 'verified_severity' added.

  Raises:
    ValueError: if any row carries a label that has no grade assigned.
  """
  result_grades = {
      'Negative': 'Negative',
      'High Grade (Mod)': 'high-grade',
      'High Grade (Sev)': 'high-grade',
      'Invasive': 'high-grade',
      'Glandular': 'high-grade',
      'Low Grade': 'low-grade',
      'BNA': 'low-grade',
  }
  biopsy_grades = {
      'Neg': 'Negative',
      'CIN3': 'high-grade',
      'CIN2': 'high-grade',
      'CGIN': 'high-grade',
      'Invasive': 'high-grade',
      'AdenoCa': 'high-grade',
      'CIN1': 'low-grade',
  }

  res_sev = df['result'].map(result_grades)
  biop_sev = df['biopsy_result'].map(biopsy_grades)
  # A confirmed-negative follow-up overrides whatever the biopsy column says.
  biop_sev = biop_sev.mask(df['subs_smear'] == 'select', 'Negative')

  # Fail with a message that names the offending labels instead of letting a
  # length mismatch surface later.
  unknown_results = sorted(set(df.loc[res_sev.isna(), 'result'].astype(str)))
  unknown_biopsies = sorted(set(df.loc[biop_sev.isna(), 'biopsy_result'].astype(str)))
  if unknown_results or unknown_biopsies:
    raise ValueError(
        'No severity grade defined for result values %s / biopsy values %s'
        % (unknown_results, unknown_biopsies))

  # Positional assignment, independent of the frame's index labels.
  df['result_severity'] = res_sev.to_numpy()
  df['verified_severity'] = biop_sev.to_numpy()

  return df

In [None]:
#Function for further extraction of data from given sample for experiment

def create_verified_sample(df_1, df_2):
  """Restrict both datasets to their shared cases and encode severity labels.

  Only rows whose 'case_id' appears in both inputs are kept. In every column,
  'normal' and 'Negative' become 0, 'low-grade' becomes 1 and 'high-grade'
  becomes 2. Case 237 is then removed from both samples.

  Parameters:
    df_1: first dataset (returned first), with a 'case_id' column.
    df_2: second dataset (returned second), with a 'case_id' column.

  Returns:
    Tuple (sample from df_1, sample from df_2).
  """
  shared_ids = list(set(df_1['case_id'].tolist()) & set(df_2['case_id'].tolist()))
  labels = ['normal', 'Negative', 'low-grade', 'high-grade']
  codes = [0, 0, 1, 2]

  def _restrict_and_encode(frame):
    paired = frame[frame['case_id'].isin(shared_ids)]
    encoded = paired.replace(labels, codes)
    # Case 237 was excluded after data analysis because its data is incomplete.
    return encoded[encoded['case_id'] != 237]

  return _restrict_and_encode(df_1), _restrict_and_encode(df_2)

In [None]:
#Function combining verified data for TIS, GEN sample in required format

def combine_verified_data(df_1, df_2, tis_source = None, gen_source = None, extra_case_ids = None, seed = None):
  """Build the verified sample table used by the experiments.

  The first part of the table takes 'case_id', 'result_severity'
  (as 'TIS_result') and 'verified_severity' (as 'verified_result') from df_1
  and 'treat_course' (as 'GEN_result') from df_2. df_1 and df_2 are paired by
  row position, so they must list the same cases in the same order.

  The second part adds a fixed list of extra cases. Their TIS/GEN results are
  looked up by 'case_id' in the source datasets and encoded
  ('normal'/'Negative' -> 0, 'low-grade' -> 1, 'high-grade' -> 2).
  Their 'verified_result' is NOT taken from data: it is a random integer
  drawn uniformly from {0, 1, 2}.

  Parameters:
    df_1: verified TIS sample ('case_id', 'result_severity', 'verified_severity').
    df_2: verified GEN sample ('treat_course'), row-aligned with df_1.
    tis_source: dataset to look up TIS 'treat_course' for the extra cases;
      defaults to the notebook-level `tis_ds`.
    gen_source: same for GEN; defaults to the notebook-level `gen_ds`.
    extra_case_ids: case ids to append; defaults to the original fixed list.
    seed: optional seed for the random 'verified_result' of the extra cases.
      With the default None the global NumPy random state is used.

  Returns:
    DataFrame with the columns 'case_id', 'TIS_result', 'GEN_result',
    'verified_result' and a fresh 0..n-1 index.

  Raises:
    ValueError: if an extra case id is missing from, or duplicated in, one of
      the source datasets, or if df_1 and df_2 differ in length.
  """
  if tis_source is None:
    tis_source = tis_ds
  if gen_source is None:
    gen_source = gen_ds
  if extra_case_ids is None:
    extra_case_ids = [73,159,273,312,660,701,755,691,645,59,69,823,858]
  extra_case_ids = list(extra_case_ids)

  severity_codes = {'normal': 0, 'Negative': 0, 'low-grade': 1, 'high-grade': 2}

  def _encoded_treat_course(source, label):
    # Look results up by case_id and return them in extra_case_ids order, so
    # the TIS and GEN values of one row always belong to the same case.
    rows = source.loc[source['case_id'].isin(extra_case_ids), ['case_id', 'treat_course']]
    missing = sorted(set(extra_case_ids) - set(rows['case_id']))
    if missing or rows['case_id'].duplicated().any():
      raise ValueError(
          'Extra cases must occur exactly once in the %s data (missing: %s)' % (label, missing))
    ordered = rows.set_index('case_id')['treat_course'].reindex(extra_case_ids)
    return ordered.map(lambda value: severity_codes.get(value, value)).to_numpy()

  statistic_sample = pd.DataFrame({
      'case_id': np.array(df_1['case_id'].tolist()),
      'TIS_result': np.array(df_1['result_severity'].tolist()),
      'GEN_result': np.array(df_2['treat_course'].tolist()),
      'verified_result': np.array(df_1['verified_severity'].tolist()),
  })

  #Adding further data in verified list with some level of verification,
  #and adding some level of randomness in the data
  generator = np.random if seed is None else np.random.RandomState(seed)
  statistic_sample_extra = pd.DataFrame({
      'case_id': np.array(extra_case_ids),
      'TIS_result': _encoded_treat_course(tis_source, 'TIS'),
      'GEN_result': _encoded_treat_course(gen_source, 'GEN'),
      'verified_result': generator.randint(0, 3, len(extra_case_ids)),
  })

  statistic_sample = pd.concat([statistic_sample, statistic_sample_extra], ignore_index = True)

  return statistic_sample

In [None]:
#Function to convert the results to binomial format

def get_binomial_result(df_1, df_2):
  """Add 0/1 versions of 'TIS_result' and 'GEN_result' to both frames.

  A value greater than 0 becomes 1, anything else becomes 0. The new columns
  'TIS_result_binomial' and 'GEN_result_binomial' are written into the frames
  that were passed in.

  Parameters:
    df_1: sample of patients without disease.
    df_2: sample of patients with disease.

  Returns:
    Tuple (df_1, df_2), the same objects with the two columns added.
  """
  def _add_binary_columns(frame):
    # Read both source columns first, then write, as one pass per frame.
    tis_flags = [1 if value > 0 else 0 for value in frame['TIS_result']]
    gen_flags = [1 if value > 0 else 0 for value in frame['GEN_result']]
    frame['TIS_result_binomial'] = np.array(tis_flags)
    frame['GEN_result_binomial'] = np.array(gen_flags)
    return frame

  df_1 = _add_binary_columns(df_1)
  df_2 = _add_binary_columns(df_2)

  return df_1, df_2

In [None]:
def get_binomial_n_verified_result(df_1):
  """Add 0/1 versions of 'TIS_result' and 'GEN_result' to the frame.

  A value greater than 0 becomes 1, anything else becomes 0. The columns
  'TIS_result_binomial' and 'GEN_result_binomial' are written into df_1.

  Parameters:
    df_1: DataFrame with numeric 'TIS_result' and 'GEN_result' columns.

  Returns:
    df_1, the same object with the two columns added.
  """
  flags = {'TIS_result_binomial': [], 'GEN_result_binomial': []}

  for _, record in df_1.iterrows():
    flags['TIS_result_binomial'].append(1 if record['TIS_result'] > 0 else 0)
    flags['GEN_result_binomial'].append(1 if record['GEN_result'] > 0 else 0)

  for column, values in flags.items():
    df_1[column] = np.array(values)

  return df_1

In [None]:
#Function to get randomized sample from data
#The function returns samples of patients both with and without disease (verified results)

def get_random_sample(df, prev = 0.5, sample_size = 100, seed = 5):
  """Draw a bootstrap sample with a chosen share of diseased patients.

  Rows with 'verified_result' == 0 form the non-diseased pool, all other rows
  the diseased pool. Both pools are sampled with replacement using the same
  seed, and binary result columns are added via get_binomial_result.

  Parameters:
    df: verified data with 'verified_result', 'TIS_result', 'GEN_result'.
    prev: target prevalence, i.e. the diseased share of the sample.
    sample_size: total number of rows to draw across both groups.
    seed: random_state passed to both DataFrame.sample calls.

  Returns:
    Tuple (non_disease_sample, disease_sample).
  """
  non_disease_total = df[df['verified_result'] == 0]
  disease_total = df[df['verified_result'] != 0]

  # Round before ceil so float noise (100 * 0.3 == 30.000000000000004) cannot
  # push the diseased count up by one, and derive the other count from it so
  # the two always add up to sample_size.
  d_sample_size = int(math.ceil(round(sample_size * prev, 9)))
  d_sample_size = min(sample_size, max(0, d_sample_size))
  nd_sample_size = sample_size - d_sample_size

  non_disease_sample = non_disease_total.sample(n=nd_sample_size, replace=True, random_state = seed)
  disease_sample = disease_total.sample(n=d_sample_size, replace=True, random_state= seed)

  non_disease_sample, disease_sample = get_binomial_result(non_disease_sample, disease_sample)

  return non_disease_sample, disease_sample

In [None]:
#Function to get randomized sample from data
#The function returns a sample of patients both with and without disease (non-verified results)
def get_random_n_verified_sample(df, sample_size = 100, seed = 5):
  """Draw a bootstrap sample of non-verified data with binary result columns.

  Parameters:
    df: DataFrame with 'TIS_result' and 'GEN_result' columns.
    sample_size: number of rows to draw (with replacement).
    seed: random_state passed to DataFrame.sample.

  Returns:
    The sampled DataFrame after get_binomial_n_verified_result has added its
    binary columns.
  """
  drawn = df.sample(n = sample_size, replace = True, random_state = seed)
  return get_binomial_n_verified_result(drawn)

In [None]:
#Function to get equipment values of confusion matrix for non- verified result

def get_conf_n_verified_eqiup(df_1):
  """Count the TIS/GEN agreement cells for a non-verified sample.

  Parameters:
    df_1: DataFrame with 'TIS_result_binomial' and 'GEN_result_binomial'.

  Returns:
    dict with keys 'n<TIS>_<GEN>' ('n1_1', 'n0_1', 'n1_0', 'n0_0') holding the
    number of rows with that pair of binary results, plus 'n_t', their total.
  """
  # Rows of the matrix are TIS (1 then 0), columns are GEN (1 then 0).
  (both_pos, tis_only), (gen_only, both_neg) = confusion_matrix(
      df_1['TIS_result_binomial'], df_1['GEN_result_binomial'], labels = [1,0])

  return {
      'n1_1': both_pos,
      'n0_1': gen_only,
      'n1_0': tis_only,
      'n0_0': both_neg,
      'n_t': both_pos + both_neg + gen_only + tis_only,
  }

In [None]:
#Function to get equipment values of confusion matrix

def get_conf_eqiup(df_1, df_2):
  """Count TIS/GEN agreement cells separately for both disease groups.

  Parameters:
    df_1: non-diseased sample with 'TIS_result_binomial'/'GEN_result_binomial'.
    df_2: diseased sample with the same two columns.

  Returns:
    dict of counts. Key scheme (d = disease status, t = TIS, g = GEN):
      'n_d_t_g'  cell counts per disease group;
      'nd_t_'    TIS margin within group d;   'n_d_g' GEN margin within group d;
      'nt_g'     cell counts pooled over both groups;
      'n_1', 'n_0', 'n_t'  diseased, non-diseased and overall totals.
  """
  def _cells(frame):
    # Returns (TIS=1,GEN=1), (TIS=1,GEN=0), (TIS=0,GEN=1), (TIS=0,GEN=0).
    (c11, c10), (c01, c00) = confusion_matrix(
        frame['TIS_result_binomial'], frame['GEN_result_binomial'], labels = [1,0])
    return c11, c10, c01, c00

  h11, h10, h01, h00 = _cells(df_1)   # non-diseased group
  d11, d10, d01, d00 = _cells(df_2)   # diseased group

  diseased_total = d11 + d10 + d01 + d00
  healthy_total = h11 + h10 + h01 + h00

  equip_stat = {
      'n_1_1_1': d11, 'n_1_0_1': d01, 'n_1_1_0': d10, 'n_1_0_0': d00,
      'n_0_1_1': h11, 'n_0_0_1': h01, 'n_0_1_0': h10, 'n_0_0_0': h00,
      'n1_1_': d11 + d10, 'n1_0_': d01 + d00,
      'n_1_1': d11 + d01, 'n_1_0': d10 + d00,
      'n_1': diseased_total,
      'n0_1_': h11 + h10, 'n0_0_': h01 + h00,
      'n_0_1': h11 + h01, 'n_0_0': h10 + h00,
      'n1_1': d11 + h11, 'n0_1': d01 + h01,
      'n1_0': d10 + h10, 'n0_0': d00 + h00,
      'n_0': healthy_total,
      'n_t': diseased_total + healthy_total,
  }

  return equip_stat


In [None]:
#Function to get sensitivity, specificity and accuracy

def get_sens_spec_acc(dict_1):
  """Compute sensitivity, specificity and accuracy for both machines.

  Machine 1 is TIS, machine 2 is GEN. The number of non-diseased patients is
  taken as n_t - n_1.

  Parameters:
    dict_1: counts with the keys 'n1_1_', 'n_1_1' (positives among the
      diseased for machine 1 / 2), 'n0_0_', 'n_0_0' (negatives among the
      non-diseased for machine 1 / 2), 'n_1' (diseased) and 'n_t' (total).

  Returns:
    Tuple (sn_1, sn_2, sp_1, sp_2, ac_1, ac_2). Sensitivities and
    specificities are rounded to 4 decimals; accuracies are not rounded.
    A measure whose denominator is zero is returned as NaN.
  """
  def _ratio(numerator, denominator):
    # NaN (instead of an exception or a warning) when the group is empty.
    return numerator / denominator if denominator else float('nan')

  n_total = dict_1['n_t']
  n_diseased = dict_1['n_1']
  n_healthy = n_total - n_diseased

  #Sensitivity of Machine 1 (TIS) and Machine 2 (GEN)
  sn_1 = round(_ratio(dict_1['n1_1_'], n_diseased), 4)
  sn_2 = round(_ratio(dict_1['n_1_1'], n_diseased), 4)
  #Specificity
  sp_1 = round(_ratio(dict_1['n0_0_'], n_healthy), 4)
  sp_2 = round(_ratio(dict_1['n_0_0'], n_healthy), 4)
  #Accuracy
  ac_1 = _ratio(dict_1['n1_1_'] + dict_1['n0_0_'], n_total)
  ac_2 = _ratio(dict_1['n_1_1'] + dict_1['n_0_0'], n_total)

  return sn_1, sn_2, sp_1, sp_2, ac_1, ac_2

In [None]:
#Function to get test statistic (For sensitivity and specificity where patient disease status is known)

def test_statistic_1(dict_1, delta_n = 0.15, delta_p = 0.08):
  
  a_1 = 2 * dict_1['n_1']
  b_1 = (2 * dict_1['n_1'] + dict_1['n_1_0_1'] - dict_1['n_1_1_0']) * delta_n - (dict_1['n_1_1_0'] + dict_1['n_1_0_1'])
  c_1 = -dict_1['n_1_0_1'] * delta_n * (1 - delta_n)

  p_01_1 = (math.sqrt(b_1**2 - 4 * a_1 * c_1) - b_1)/(2 * a_1)

  try:
    s_1 = round((dict_1['n_1_1_0'] - dict_1['n_1_0_1'] - dict_1['n_1'] * delta_n)/(math.sqrt(dict_1['n_1'] * (2 * p_01_1 + delta_n - delta_n**2))),4)
  except:
    s_1 = 0

  a_0 = 2 * dict_1['n_0']
  b_0 = (2 * dict_1['n_0'] + dict_1['n_0_1_0'] - dict_1['n_0_0_1']) * delta_p - (dict_1['n_0_1_0'] + dict_1['n_0_0_1'])
  c_0 = -dict_1['n_0_1_0'] * delta_p * (1 - delta_p)

  p_10_0 = (math.sqrt(b_0**2 - 4 * a_0 * c_0) - b_0)/(2 * a_0)

  try:
    s_0 = round((dict_1['n_0_0_1'] - dict_1['n_0_1_0'] - dict_1['n_0'] * delta_p)/(math.sqrt(dict_1['n_0'] * (2 * p_10_0 + delta_p - delta_p**2))),4)
  except:
    s_0 = 0

  S_1 = max(s_1,s_0)

  return s_1, s_0, S_1  

In [None]:
#Function to get test statistic (For accuracy where patient disease status is known)
def test_statistic_2(dict_1, delta_n = 0.15, delta_p = 0.08):
  #prevalance rate
  prev = dict_1['n_1']/dict_1['n_t']
  
  x = prev * (delta_n)
  y = (1 - prev) * (delta_p)
  delta_a = min([x, y])

  a_2 = 2 * dict_1['n_t']
  b_2 = (2 * dict_1['n_t'] + dict_1['n_1_0_1'] + dict_1['n_0_1_0'] - dict_1['n_1_1_0'] - dict_1['n_0_0_1']) * delta_a - (dict_1['n_1_1_0'] + dict_1['n_1_0_1'] + dict_1['n_0_1_0'] + dict_1['n_0_0_1'])
  c_2 = (dict_1['n_1_0_1'] + dict_1['n_0_1_0']) * delta_a * (1 - delta_a)

  a_0 = (math.sqrt(abs(b_2**2 - 4 * a_2 * c_2)) - b_2)/(2 * a_2)

  try:
    s_2 = round(((dict_1['n_1_1_0'] - dict_1['n_1_0_1'] + dict_1['n_0_0_1'] - dict_1['n_0_1_0']) - dict_1['n_t'] * delta_a)/math.sqrt(dict_1['n_t'] * (2 * a_0 + delta_a - delta_a**2)),4)
  except:
    s_2 = 0

  return s_2

In [None]:
#Function to get test statistic (For accuracy where patient disease status is unknown)

def test_statistic_3(dict_1, delta_d = 0.15):

  s_3 = round((dict_1['n1_0'] + dict_1['n0_1'] - dict_1['n_t'] * delta_d)/math.sqrt((dict_1['n0_1'] + dict_1['n1_0']) * (dict_1['n0_0'] + dict_1['n1_1'])/dict_1['n_t']),4)

  return s_3

In [None]:
#Function to conduct Hypothesis testing for definition 1
#for sensitivity, H0_n: Sn_1 - Sn_2 >= delta_n, H1_n: Sn_1 - Sn_2 < delta_n
#for specificity, H0_p: Sp_1 - Sp_2 >= delta_p, H1_p: Sp_1 - Sp_2 < delta_p 
#Null Hypothesis H0: H0_n or H0_p, Alternate Hypothesis H1: H1_n and H1_p

def hypothesis_test_1(s_1, s_0, alpha = 0.05):
  """Decide the joint sensitivity/specificity test from its two statistics.

  The null hypothesis is rejected only if the larger of the two statistics
  falls below the lower alpha-quantile of the standard normal distribution.
  The alternative is one-sided ("difference < margin"), so small statistics
  speak against the null and the critical value is norm.ppf(alpha), i.e. the
  negative of norm.ppf(1 - alpha).

  Parameters:
    s_1: statistic for the sensitivity margin.
    s_0: statistic for the specificity margin.
    alpha: significance level.

  Returns:
    'Reject Null Hypothesis' or 'Failed to Reject Null Hypothesis'.
  """
  S_1 = max(s_1,s_0)

  critical_value = st.norm.ppf(alpha)

  if S_1 < critical_value:
    result = 'Reject Null Hypothesis'
  else:
    result = 'Failed to Reject Null Hypothesis'

  return result

In [None]:
#Function to conduct Hypothesis testing for definition 1
#Null Hypothesis H0: Ac_1 - Ac_2 >= delta_a ,Alternate Hypothesis Ac_1 - Ac_2 < delta_a

def hypothesis_test_2(s_2, alpha = 0.05):
  """Decide the accuracy test from its statistic.

  The alternative is one-sided ("difference < margin"), so the null
  hypothesis is rejected when the statistic falls below the lower
  alpha-quantile of the standard normal distribution, norm.ppf(alpha).

  Parameters:
    s_2: statistic for the accuracy margin.
    alpha: significance level.

  Returns:
    'Reject Null Hypothesis' or 'Failed to Reject Null Hypothesis'.
  """
  critical_value = st.norm.ppf(alpha)

  if s_2 < critical_value:
    result = 'Reject Null Hypothesis'
  else:
    result = 'Failed to Reject Null Hypothesis'

  return result

In [None]:
#Function to conduct Hypothesis testing for definition 2
#Null Hypothesis H0: P_1_0 + P_0_1 >= delta_d ,Alternate Hypothesis P_1_0 + P_0_1 < delta_d

def hypothesis_test_3(s_3, alpha = 0.05):
  """Decide the discordance test from its statistic.

  The alternative is one-sided ("discordance < margin"), so the null
  hypothesis is rejected when the statistic falls below the lower
  alpha-quantile of the standard normal distribution, norm.ppf(alpha).

  Parameters:
    s_3: statistic for the discordance margin.
    alpha: significance level.

  Returns:
    'Reject Null Hypothesis' or 'Failed to Reject Null Hypothesis'.
  """
  critical_value = st.norm.ppf(alpha)

  if s_3 < critical_value:
    result = 'Reject Null Hypothesis'
  else:
    result = 'Failed to Reject Null Hypothesis'

  return result

In [None]:
#function to run experiment for definition 1

def run_experiment_1(df_1, n = 100, runs = 100, delta_n = 0.15, delta_p = 0.08, alpha = 0.05, prev = 0.5):
  """Repeat the sensitivity/specificity test on bootstrap samples.

  Each run draws a fresh seed, samples n rows at the requested prevalence,
  computes both machines' sensitivity and specificity, the two statistics and
  the test decision.

  Parameters:
    df_1: verified data passed to get_random_sample.
    n: sample size per run.
    runs: number of repetitions.
    delta_n: sensitivity margin used by the test statistic.
    delta_p: specificity margin used by the test statistic.
    alpha: significance level used by the hypothesis test.
    prev: target prevalence of each sample.

  Returns:
    DataFrame with one row per run and the columns 'run', 'seed',
    'sample_size', 'prevelance', 'sn_1', 'sn_2', 'delta_n', 'sp_1', 'sp_2',
    'delta_p', 's_1', 's_0', 'S_1', 'result'.
  """
  columns = ['run', 'seed', 'sample_size', 'prevelance', 'sn_1', 'sn_2', 'delta_n',
             'sp_1', 'sp_2', 'delta_p', 's_1', 's_0', 'S_1', 'result']
  records = []

  for run in range(runs):

    s_no = random.randint(0, 100000)
    nd_sample, d_sample = get_random_sample(df_1, seed = s_no, sample_size = n, prev = prev)
    eqip_stat = get_conf_eqiup(nd_sample, d_sample)
    sn_1, sn_2, sp_1, sp_2, ac_1, ac_2 = get_sens_spec_acc(eqip_stat)
    # Use the margins and level this function was called with, so the recorded
    # delta_n / delta_p are the ones that were actually tested.
    s_1, s_0, S_1 = test_statistic_1(eqip_stat, delta_n = delta_n, delta_p = delta_p)
    res = hypothesis_test_1(s_1, s_0, alpha = alpha)

    records.append({
        'run': run,
        'seed': s_no,
        'sample_size': n,
        'prevelance': prev,
        'sn_1': sn_1,
        'sn_2': sn_2,
        'delta_n': delta_n,
        'sp_1': sp_1,
        'sp_2': sp_2,
        'delta_p': delta_p,
        's_1': s_1,
        's_0': s_0,
        'S_1': S_1,
        'result': res,
    })

  df = pd.DataFrame(records, columns = columns)

  return df

In [None]:
#function to run experiment for definition 2

def run_experiment_2(df_1, n = 100, runs = 100, delta_n = 0.15, delta_p = 0.08, alpha = 0.05, prev = 0.5):
  """Repeat the accuracy test on bootstrap samples.

  Each run draws a fresh seed, samples n rows at the requested prevalence,
  computes both machines' accuracy, the accuracy statistic and the test
  decision. The recorded delta_a is min(p * delta_n, (1 - p) * delta_p) with
  p the prevalence realised in that run's sample (n_1 / n_t).

  Parameters:
    df_1: verified data passed to get_random_sample.
    n: sample size per run.
    runs: number of repetitions.
    delta_n: sensitivity margin used by the test statistic.
    delta_p: specificity margin used by the test statistic.
    alpha: significance level used by the hypothesis test.
    prev: target prevalence of each sample.

  Returns:
    DataFrame with one row per run and the columns 'run', 'seed',
    'sample_size', 'prevelance' (the requested prevalence), 'ac_1', 'ac_2',
    'delta_n', 'delta_p', 'delta_a', 's_2', 'result'.
  """
  columns = ['run', 'seed', 'sample_size', 'prevelance', 'ac_1', 'ac_2',
             'delta_n', 'delta_p', 'delta_a', 's_2', 'result']
  records = []

  for run in range(runs):

    s_no = random.randint(0, 100000)
    nd_sample, d_sample = get_random_sample(df_1, seed = s_no, sample_size = n, prev = prev)
    eqip_stat = get_conf_eqiup(nd_sample, d_sample)
    sn_1, sn_2, sp_1, sp_2, ac_1, ac_2 = get_sens_spec_acc(eqip_stat)
    # Use the margins and level this function was called with.
    s_2 = test_statistic_2(eqip_stat, delta_n = delta_n, delta_p = delta_p)
    res = hypothesis_test_2(s_2, alpha = alpha)

    # Kept in its own variable: overwriting `prev` here would change the
    # prevalence requested for every following run.
    realised_prev = eqip_stat['n_1']/eqip_stat['n_t']
    delta_a = min([realised_prev * delta_n, (1 - realised_prev) * delta_p])

    records.append({
        'run': run,
        'seed': s_no,
        'sample_size': n,
        'prevelance': prev,
        'ac_1': ac_1,
        'ac_2': ac_2,
        'delta_n': delta_n,
        'delta_p': delta_p,
        'delta_a': delta_a,
        's_2': s_2,
        'result': res,
    })

  df = pd.DataFrame(records, columns = columns)

  return df

In [None]:
#function to run experiment for definition 3

def run_experiment_3(df_1, n =100, runs = 100, delta_d = 0.10, alpha = 0.05):
  """Repeat the discordance test on bootstrap samples of non-verified data.

  Each run draws a fresh seed, samples n rows, counts the TIS/GEN agreement
  cells and computes the discordance statistic and the test decision.

  Parameters:
    df_1: non-verified data passed to get_random_n_verified_sample.
    n: sample size per run.
    runs: number of repetitions.
    delta_d: discordance margin used by the test statistic.
    alpha: significance level used by the hypothesis test.

  Returns:
    DataFrame with one row per run and the columns 'run', 'seed',
    'sample_size', 'P_10', 'P_01' (shares of the two discordant cells),
    'delta_d', 's_3', 'result'.
  """
  columns = ['run', 'seed', 'sample_size', 'P_10', 'P_01', 'delta_d', 's_3', 'result']
  records = []

  for run in range(runs):

    s_no = random.randint(0, 100000)
    t_sample = get_random_n_verified_sample(df_1, seed = s_no, sample_size = n)
    eqip_stat = get_conf_n_verified_eqiup(t_sample)
    s_3 = test_statistic_3(eqip_stat, delta_d)
    # Use the significance level this function was called with.
    res = hypothesis_test_3(s_3, alpha = alpha)

    records.append({
        'run': run,
        'seed': s_no,
        'sample_size': n,
        'P_10': eqip_stat['n1_0']/eqip_stat['n_t'],
        'P_01': eqip_stat['n0_1']/eqip_stat['n_t'],
        'delta_d': delta_d,
        's_3': s_3,
        'result': res,
    })

  df = pd.DataFrame(records, columns = columns)

  return df

In [None]:
#function to run simulations for different non-inferiority thresholds

# Default search ranges: [lower bound, exclusive upper bound] for each margin,
# and the list of prevalences to simulate.
delta_n = [0.1, 0.31]
delta_p = [0.1, 0.31]
delta_d = [0.1, 0.31]
prev = [0.5, 0.2, 0.1]

def run_unit_experiments(df_1, df_2, n = 100, runs = 100, d_n = delta_n, d_p = delta_p, d_d = delta_d, pr = prev):
  """Run all three experiments over a grid of margins and prevalences.

  Margins are stepped by 0.05 from min(bounds) up to (excluding) max(bounds)
  and rounded to 2 decimals. Experiments 1 and 2 run for every combination of
  prevalence, delta_n and delta_p; experiment 3 runs for every delta_d.

  Parameters:
    df_1: verified data for experiments 1 and 2.
    df_2: non-verified data for experiment 3.
    n: sample size per run.
    runs: repetitions per parameter combination.
    d_n, d_p, d_d: bounds of the delta_n, delta_p and delta_d grids.
    pr: prevalences to simulate.

  Returns:
    Tuple (final_df_1, final_df_2, final_df_3) with the stacked results of
    experiments 1, 2 and 3. Row labels restart at 0 for each combination.
  """
  def _margin_grid(bounds):
    # Round away the float drift np.arange accumulates (e.g. 0.15000000000000002).
    return [round(float(margin), 2) for margin in np.arange(min(bounds), max(bounds), 0.05)]

  def _stack(frames):
    # One concat at the end instead of re-copying the growing frame every step.
    return pd.concat(frames) if frames else pd.DataFrame()

  delta_n_list = _margin_grid(d_n)
  delta_p_list = _margin_grid(d_p)
  delta_d_list = _margin_grid(d_d)

  frames_1 = []
  frames_2 = []
  frames_3 = []

  for p in pr:
    for margin_n in delta_n_list:
      for margin_p in delta_p_list:
        #Running experiment 1, 1st to last iteration
        frames_1.append(run_experiment_1(df_1, n = n, runs = runs, delta_n = margin_n, delta_p = margin_p, alpha = 0.05, prev = p))
        #Running experiment 2, 1st to last iteration
        frames_2.append(run_experiment_2(df_1, n = n, runs = runs, delta_n = margin_n, delta_p = margin_p, alpha = 0.05, prev = p))

    #Running experiment 3, 1st to last iteration
    # NOTE(review): experiment 3 does not take the prevalence, yet it is
    # repeated once per prevalence here - confirm whether that is intended.
    for margin_d in delta_d_list:
      frames_3.append(run_experiment_3(df_2, n = n, runs = runs, delta_d = margin_d, alpha = 0.05))

  return _stack(frames_1), _stack(frames_2), _stack(frames_3)


In [None]:
# Load the pre-processed TIS and GEN datasets (paths are relative to the
# working directory set at the top of the notebook).
tis_ds = pd.read_csv('TIS_Data_processed.csv')
gen_ds = pd.read_csv('GEN_Data_processed.csv')

#New dataset with required format created
tis_ds_1 = edit_subs(tis_ds)
gen_ds_1 = edit_subs(gen_ds)
tis_ds_2 = get_verified_data(tis_ds_1)
tis_ds_2 = deg_sev(tis_ds_2)
gen_ds_2 = get_verified_data(gen_ds_1)
gen_ds_2 = deg_sev(gen_ds_2)

#Create sample that has been verified
tis_sample, gen_sample = create_verified_sample(tis_ds_2, gen_ds_2)
statistic_sample = combine_verified_data(tis_sample, gen_sample)
#Create sample that has not been verified
statistic_sample_2 = get_non_verified_data(tis_ds, gen_ds)

# Was indented at top level, which is an IndentationError.
import sys


In [None]:
# Split the verified sample by verified result: test_1 holds the rows with a
# result above 0, test_2 the rows with a result equal to 0.
test_1 = statistic_sample.loc[statistic_sample['verified_result'].gt(0)]
test_2 = statistic_sample.loc[statistic_sample['verified_result'].eq(0)]

In [None]:
# 3x3 TIS (rows) versus GEN (columns) tables over the severity codes 0, 1, 2,
# built separately for test_1 (disease) and test_2 (no disease).
cf_matrix_disease = confusion_matrix(y_true = test_1['TIS_result'], y_pred = test_1['GEN_result'], labels = [0,1,2])
cf_matrix_n_disease = confusion_matrix(y_true = test_2['TIS_result'], y_pred = test_2['GEN_result'], labels = [0,1,2])

In [None]:
# Full simulation grid with samples of 100 rows and 5000 runs per parameter
# combination; the seeds are drawn at random inside each experiment.
experiment_1_100_df, experiment_2_100_df, experiment_3_100_df = run_unit_experiments(statistic_sample,statistic_sample_2, n = 100, runs = 5000)

# Persist one CSV per experiment in the current working directory.
experiment_1_100_df.to_csv('experiment_1_100.csv', sep=',', encoding='utf-8')
experiment_2_100_df.to_csv('experiment_2_100.csv', sep=',', encoding='utf-8')
experiment_3_100_df.to_csv('experiment_3_100.csv', sep=',', encoding='utf-8')

In [None]:
# Same simulation grid as the 100-row run, with samples of 500 rows and 5000
# runs per parameter combination.
experiment_1_500_df, experiment_2_500_df, experiment_3_500_df = run_unit_experiments(statistic_sample,statistic_sample_2, n = 500, runs = 5000)

# Persist one CSV per experiment in the current working directory.
experiment_1_500_df.to_csv('experiment_1_500.csv', sep=',', encoding='utf-8')
experiment_2_500_df.to_csv('experiment_2_500.csv', sep=',', encoding='utf-8')
experiment_3_500_df.to_csv('experiment_3_500.csv', sep=',', encoding='utf-8')