In [1]:
import os
import fnmatch
import numpy as np
import pandas as pd
from random import sample
from sklearn.ensemble import RandomForestRegressor

### All files that will be fed to Felix should have .csv format

In [2]:
#checked
def find_qPCR_file():
    """Return the name of a qPCR results export found in the working directory.

    The current directory is scanned for entries whose name matches the
    pattern ``*Results*.xls``. ``os.listdir`` yields names in arbitrary
    order, so the candidates are sorted and the lexicographically first one
    is returned; this keeps the choice repeatable when several exports are
    present at the same time.

    Returns:
        The matching file name (without any directory part), or ``None``
        when no entry matches the pattern.
    """
    candidates = sorted(fnmatch.filter(os.listdir('.'), '*Results*.xls'))
    return candidates[0] if candidates else None

In [3]:
filename = find_qPCR_file()

In [4]:
filename

'2021-12-01 155008_Results_ViiA7_export.xls'

In [5]:
#checked
def read_qPCR_file(qPCR_filename):
    """Load a qPCR export and keep only the well / Ct columns.

    The sheet is read with ``pd.read_excel``. The row at positional index 39
    is used as the column labels and every row from position 40 onwards is
    treated as data. From those rows the columns 'Well Position', 'CT' and
    'Ct Mean' are copied and renamed to 'well', 'ct' and 'mean_ct'.

    Args:
        qPCR_filename: path passed straight to ``pd.read_excel``.

    Returns:
        A new DataFrame with columns ['well', 'ct', 'mean_ct']; its index
        keeps the original row labels of the sheet (starting at 40). Values
        are not converted, so non-numeric cells are kept as found.
    """
    sheet = pd.read_excel(qPCR_filename)
    # Row 39 of the sheet (as read by pandas) carries the real column names.
    sheet.columns = sheet.iloc[39, :].values.flatten().tolist()
    measurements = sheet.iloc[40:, :]
    wanted = ['Well Position', 'CT', 'Ct Mean']
    selected = measurements[wanted].copy()
    selected.columns = ['well', 'ct', 'mean_ct']
    return selected

In [6]:
qPCR_df = read_qPCR_file(filename)

In [7]:
qPCR_df

Unnamed: 0,well,ct,mean_ct
40,A1,16.0812,16.2987
41,A2,16.1566,16.2987
42,A3,16.6583,16.2987
43,A4,26.6946,26.3383
44,A5,Undetermined,26.3383
45,A6,25.982,26.3383
46,B1,Undetermined,
47,B2,Undetermined,
48,B3,Undetermined,
49,B4,Undetermined,


In [8]:
#checked
def write_qPCR_output(filename):
    """Locate the qPCR export, extract its Ct table and save it as CSV.

    Uses ``find_qPCR_file`` to pick the export and ``read_qPCR_file`` to
    parse it, then writes the resulting table to ``filename`` without the
    index column.

    Args:
        filename: destination path for the CSV output.
    """
    source_name = find_qPCR_file()
    read_qPCR_file(source_name).to_csv(filename, index=False)

In [9]:
write_qPCR_output("qPCR_output.txt")
#info: well position, ct, mean ct for selected wells
#format: Well Position, CT, Ct Mean

In [10]:
#need checking
def setTrueCt(query_df, qPCR_df):
    """Attach the 'mean_ct' column of ``qPCR_df`` to ``query_df``.

    The column is joined on the index labels of the two frames (a left join
    on ``query_df``), so rows of ``query_df`` with no matching label in
    ``qPCR_df`` receive NaN. ``query_df`` itself is not modified.

    NOTE(review): rows are paired purely by index label; whether row i of
    the query corresponds to row i of the qPCR table is not checked here —
    confirm against how both files are produced.

    Args:
        query_df: DataFrame of queried samples.
        qPCR_df: DataFrame that contains a 'mean_ct' column.

    Returns:
        A new DataFrame: ``query_df`` plus a 'mean_ct' column.
    """
    mean_ct_column = qPCR_df["mean_ct"]
    return query_df.join(mean_ct_column)

In [11]:
#query file: index, content
#observed: dataframe
#qPCROutput: not Known
def update_observed_file(observed_filename, query_filename, qPCR_filename, first_run):
    """Append the latest query batch, with its measured Ct, to the observed file.

    The query CSV and the qPCR CSV are loaded, ``setTrueCt`` adds the qPCR
    'mean_ct' column to the query rows, and the result is appended to the
    rows already stored in ``observed_filename``. The combined table is then
    written back to ``observed_filename`` with a fresh 0..n-1 index and
    without an index column.

    ``DataFrame.append`` no longer exists in pandas 2.x, so the rows are
    combined with ``pd.concat``.

    Args:
        observed_filename: CSV holding previously observed rows; it is
            overwritten with the combined table.
        query_filename: CSV with the rows that were just queried.
        qPCR_filename: CSV with the qPCR results (needs a 'mean_ct' column).
        first_run: when equal to ``True`` the existing observed file is not
            read and the output contains only the new rows.
    """
    # Kept as an equality test so non-bool flags behave exactly as before.
    if first_run == True:
        previous = None
    else:
        previous = pd.read_csv(observed_filename)
    query = pd.read_csv(query_filename)
    qPCR = pd.read_csv(qPCR_filename)
    query_ct = setTrueCt(query, qPCR)
    # Only concatenate frames that actually exist; no empty placeholder frame.
    frames = [query_ct] if previous is None else [previous, query_ct]
    observed = pd.concat(frames, ignore_index=True)
    observed.to_csv(observed_filename, index=False)

In [12]:
update_observed_file("observed.txt", "query.csv", "qPCR_output.txt", False)

In [13]:
def update_unobserved_file(unobserved_filename, query_filename):
    """Remove the queried rows from the unobserved file and rewrite it.

    The first column of the query CSV is read as the row labels to remove.
    Those labels are printed, dropped from the table loaded from
    ``unobserved_filename``, and the remaining rows are written back to the
    same path with the index included as the first column.

    NOTE(review): because the index is written out, re-reading the file
    later yields one extra leading column per rewrite — confirm this is the
    intended file layout.

    Args:
        unobserved_filename: CSV of not-yet-measured rows; overwritten.
        query_filename: CSV whose first column holds the row labels to drop.
    """
    remaining = pd.read_csv(unobserved_filename)
    queried = pd.read_csv(query_filename)
    query_index = queried.iloc[:, 0].to_numpy()
    print(query_index)
    remaining = remaining.drop(index=query_index)
    remaining.to_csv(unobserved_filename, index=True)


In [14]:

#read observed.csv
# return the row positions of the most uncertain unobserved samples (random positions on the first run)
def get_query_index(observed_file, unobserved_file, batch_size, first_run):
    """Choose which rows of the unobserved file to query next.

    On the first run the rows are picked at random. Otherwise a random
    forest (3 trees, ``random_state=0``) is fitted on the observed data
    using the 'fwdGC' and 'revGC' columns as features and 'Ct' as target;
    every tree then predicts each unobserved row, and the rows whose
    per-tree predictions have the largest standard deviation are selected.

    The ranking uses a stable descending sort, so ties are resolved in
    favour of the lower row position. Unlike a repeated arg-max with a
    sentinel, this never returns the same position twice: when
    ``batch_size`` is larger than the number of unobserved rows, every row
    is returned exactly once.

    NOTE(review): ``setTrueCt`` in this notebook adds the measured value
    under the name 'mean_ct', while this function reads a 'Ct' column from
    the observed file — confirm the observed file really carries 'Ct'.

    Args:
        observed_file: CSV with 'fwdGC', 'revGC' and 'Ct' columns; only read
            when ``first_run`` is not equal to ``True``.
        unobserved_file: CSV of candidate rows (needs 'fwdGC' and 'revGC'
            unless this is the first run).
        batch_size: number of row positions to return.
        first_run: when equal to ``True``, sample positions at random
            instead of ranking by uncertainty.

    Returns:
        A list of integer row positions into the unobserved table as read
        by ``pd.read_csv``.

    Raises:
        ValueError: on the first run, if ``batch_size`` is negative or
            exceeds the number of unobserved rows (raised by
            ``random.sample``).
    """
    unobserved = pd.read_csv(unobserved_file)

    # Kept as an equality test so non-bool flags behave exactly as before.
    if first_run == True:
        # Randomly selected seed rows.
        return sample(list(range(len(unobserved))), batch_size)

    # Uncertainty sampling.
    feature_columns = ["fwdGC", "revGC"]
    observed = pd.read_csv(observed_file)
    X, y = observed[feature_columns], observed["Ct"]
    reg = RandomForestRegressor(n_estimators=3, random_state=0)
    reg.fit(X, y)

    # One prediction vector per tree; their spread is the uncertainty score.
    outputs = [
        estimator.predict(unobserved[feature_columns])
        for estimator in reg.estimators_
    ]
    stds = np.std(outputs, axis=0)

    # Stable sort on the negated scores: largest spread first, and equal
    # scores keep ascending row order.
    ranked = np.argsort(-stds, kind="stable")
    take = max(batch_size, 0)  # a negative size selects nothing
    return [int(position) for position in ranked[:take]]


In [15]:
most_uncertain = get_query_index("observed.txt", "unobserved.txt", 12, True)

In [16]:
most_uncertain

[57, 220, 43, 34, 192, 70, 131, 176, 223, 80, 47, 42]

In [17]:
#first_run: boolean var -- True if write initial seeds as query, False if write selected batch by uncertainty sampling
def write_query_file(output_filename, observed_file, unobserved_file, batch_size, first_run):
    """Write the next batch of rows to query as a CSV file.

    ``get_query_index`` selects row positions in the unobserved table; those
    rows are written to ``output_filename`` with their row label as the
    first column, under the header 'idx'.

    The file is written directly by ``DataFrame.to_csv`` with
    ``index_label="idx"``. This yields the same header as prefixing "idx"
    to the CSV text by hand, without the explicit open/close, and pandas
    handles line endings itself rather than having them translated a second
    time by a text-mode file handle.

    Args:
        output_filename: destination CSV path; overwritten.
        observed_file: forwarded to ``get_query_index``.
        unobserved_file: CSV of candidate rows; also forwarded to
            ``get_query_index``.
        batch_size: number of rows to select.
        first_run: forwarded to ``get_query_index`` (True = initial seed
            batch, False = batch chosen by uncertainty sampling).
    """
    query_index = get_query_index(observed_file, unobserved_file, batch_size, first_run)
    unobserved = pd.read_csv(unobserved_file)
    batch = unobserved.iloc[query_index, :]
    batch.to_csv(output_filename, index=True, index_label="idx")

In [18]:
write_query_file("query.csv", "observed.txt", "unobserved.txt", 12, True)