In [1]:
import datetime

import numpy as np
import pandas as pd
import Levenshtein

from tqdm.notebook import tqdm

In [2]:
# Input locations, relative to the notebook's working directory.
# Tab-separated table read into `file_info` below; later cells use its
# `sample_present` and `sample_id` columns.
file_info_path = "../output/sample_info/file_existence.tsv"
# Concentration metadata table read into `concentration` below.
concentration_path = "../data/sample_metadata/nys-wws-sars2-concentration.csv"

In [3]:
# Per-sample file-existence table (tab-separated).
file_info = pd.read_table(file_info_path)
# The concentration file carries a .csv extension, so reading it with
# tab-delimited defaults would collapse a comma-separated file into a single
# column. Let pandas sniff the delimiter so either layout parses correctly.
concentration = pd.read_csv(concentration_path, sep=None, engine="python")

In [4]:
# Split the unmatched sample IDs into the two sides we want to pair up,
# based on the label stored in the `sample_present` column.
presence = file_info.sample_present
bam_only_labels = ("bam_without_concentration", "unexpected_bam_name")

is_bam_only = np.isin(presence, bam_only_labels)
is_conc_only = presence == "concentration_without_bam"

# Sample IDs flagged with one of the BAM-only labels.
bams_without_conc = file_info.loc[is_bam_only, "sample_id"]
# Sample IDs flagged as a concentration entry with no BAM.
conc_without_bams = file_info.loc[is_conc_only, "sample_id"]

In [5]:
def same_site(bam1, bam2):
    """Return True when both IDs share the same characters at positions 8..18.

    That 11-character field is presumably the site code that follows the
    leading date -- confirm against the sample naming scheme.
    """
    site_field = slice(8, 19)
    return bam1[site_field] == bam2[site_field]

def _leading_date(sample_id):
    """Parse the first eight characters of ``sample_id`` as a calendar date."""
    prefix = sample_id[:8]
    try:
        return datetime.date.fromisoformat(prefix)
    except ValueError:
        # Before Python 3.11 ``date.fromisoformat`` only accepts ``YYYY-MM-DD``,
        # so an eight-character compact ``YYYYMMDD`` prefix is rejected there.
        # Parse it with an explicit format; a malformed prefix still raises
        # ValueError, as it did originally.
        return datetime.datetime.strptime(prefix, "%Y%m%d").date()


def close_date(bam1, bam2, max_days=3):
    """Return True when the dates leading both IDs are at most ``max_days`` apart.

    Parameters
    ----------
    bam1, bam2 : str
        Sample IDs whose first eight characters encode a date (``YYYYMMDD``).
    max_days : int, optional
        Largest allowed absolute difference in days, inclusive. Defaults to 3.

    Raises
    ------
    ValueError
        If either eight-character prefix cannot be parsed as a date.
    """
    gap = abs(_leading_date(bam2) - _leading_date(bam1))
    return gap <= datetime.timedelta(days=max_days)

In [6]:
def _candidate_reason(bam_id, conc_id):
    """Return why the two IDs may refer to the same sample, or None if neither rule fires."""
    # The site/date rule takes precedence over the single-edit rule.
    if same_site(bam_id, conc_id) and close_date(bam_id, conc_id):
        return "close date"
    if Levenshtein.distance(bam_id, conc_id) == 1:
        return "possible typo"
    return None


# Compare every unmatched BAM ID against every unmatched concentration ID and
# collect (bam, concentration entry, reason) tuples for the plausible pairs.
records = []

for unmatched_bam in bams_without_conc:
    for unmatched_conc in conc_without_bams:
        reason = _candidate_reason(unmatched_bam, unmatched_conc)
        if reason is not None:
            records.append((unmatched_bam, unmatched_conc, reason))

In [7]:
# Collect the candidate pairs into a table and write it out as a
# tab-separated file in the current working directory.
candidate_columns = ["bam_file", "concentration_entry", "reason"]
candidates_path = "bams-concentration-candidates.tsv"

df = pd.DataFrame.from_records(records, columns=candidate_columns)
df.to_csv(candidates_path, sep="\t", index=False)