In [70]:
import pandas as pd
import requests
from pathlib import Path
from tempfile import TemporaryDirectory
import gzip
import struct
import shutil

In [71]:
# Load both tables purely as text: every column is read as ``str`` and empty
# cells stay "" instead of turning into NaN, so blank checks later on can be
# plain string comparisons.
resp_df = pd.read_csv(
    "lung-pet-ct-responses.csv",
    keep_default_na=False,
    dtype=str,
)
qa_res_df = pd.read_csv(
    "qa-results.csv",
    keep_default_na=False,
    dtype=str,
)

In [72]:
# qa_res_df.sort_values(
#     by=[
#         "Reviewer",
#         "Validation",
#         "Collection",
#         "PatientID",
#         "StudyDate",
#         "StudyDate_suffix",
#     ],
#     inplace=True,
# )
# qa_res_df.to_csv("qa-results.csv", index=False)


In [73]:
def is_nrrd(fp):
    """Tell whether the file at ``fp`` starts with the 4-byte ``NRRD`` signature.

    Args:
        fp: Path (anything ``open`` accepts) of the file to inspect.

    Returns:
        True when the first four bytes are ``b"NRRD"``, otherwise False
        (including when the file is shorter than four bytes).
    """
    signature = b"NRRD"
    with open(fp, "rb") as stream:
        leading_bytes = stream.read(len(signature))
    return leading_bytes == signature


def is_nii_gz(fp):
    """Tell whether the file at ``fp`` is a gzip-compressed NIfTI-1 file.

    Only the first 348 decompressed bytes are read. The file is accepted when
    the little-endian int32 at offset 0 equals 348 and bytes 344-347 hold one
    of the NIfTI-1 magic strings (``n+1\\0`` or ``ni1\\0``).

    Args:
        fp: Path of the file to inspect.

    Returns:
        True if the file looks like a gzipped NIfTI-1 file; False if it is not
        valid gzip, is truncated, or its header does not match.
    """
    header_len = 348
    try:
        # Decompress just the header instead of the whole volume, and make
        # sure the gzip handle is closed again.
        with gzip.open(fp, "rb") as gz:
            header = gz.read(header_len)
    except (gzip.BadGzipFile, EOFError):
        # Not gzip at all, or a gzip stream that ends prematurely.
        return False

    if len(header) < header_len:
        # Too short to contain a complete header (e.g. an empty archive).
        return False

    header_size = struct.unpack("<i", header[0:4])[0]
    magic = header[344:348]
    # Both magic strings are exactly 4 bytes including the trailing NUL; the
    # previous 5-byte literal b"ni1+\x00" could never match a 4-byte slice.
    return header_size == header_len and magic in (b"n+1\x00", b"ni1\x00")


def convert_gdrive_link(url):
    """Turn a Google Drive share link into a direct-download URL.

    Two link shapes are recognised:

    * ``...?id=<file_id>`` (optionally followed by further ``&key=value``
      parameters or a ``#fragment``), and
    * ``.../file/d/<file_id>/...``.

    If neither pattern is present the whole input is used as the id, which
    matches the previous behaviour for such inputs.

    Args:
        url: Share link as a string.

    Returns:
        ``https://docs.google.com/uc?export=download&id=<file_id>``.
    """
    if "id=" in url:
        # Take what follows the last "id=" and drop trailing parameters so
        # they do not end up inside the file id.
        file_id = url.split("id=")[-1].split("&", 1)[0].split("#", 1)[0]
    elif "/file/d/" in url:
        # The id is the path segment right after "/file/d/".
        file_id = url.split("/file/d/")[-1].split("/", 1)[0].split("?", 1)[0]
    else:
        file_id = url
    file_id = file_id.strip()
    return f"https://docs.google.com/uc?export=download&id={file_id}"


In [74]:
# Normalise the raw form export in one chain:
#   1. shorten the question-text column headers,
#   2. map the verbose Likert answers onto the strings "5" .. "1",
#   3. map reviewer names onto their short codes.
# ``replace`` with a dict only substitutes cells whose entire value equals a key.
resp_df = (
    resp_df.rename(
        columns={
            "Please enter your name": "Reviewer",
            "What is the patient name?": "PatientID",
            "What is the study date folder?": "StudyDateFolder",
            "Was the AI predicted ROIs accurate?": "LikertScore",
            "Do you have any comments about the AI predicted ROIs?": "CommentsAboutAISegmentation",
            "Do you have any comments about the findings from the study scans?": "CommentsAboutScan",
        }
    )
    .replace(
        {
            "Strongly Agree - Use-as-is (i.e., clinically acceptable, and could be used for treatment without change)": "5",
            "Agree - Minor edits that are not necessary. Stylistic differences, but not clinically important. The current segmentation is acceptable": "4",
            "Neither agree nor disagree - Minor edits that are necessary. Minor edits are those that the review judges can be made in less time than starting from scratch or are expected to have minimal effect on treatment outcome": "3",
            "Disagree - Major edits. This category indicates that the necessary edit is required to ensure correctness, and sufficiently significant that user would prefer to start from the scratch": "2",
            "Strongly disagree - Unusable. This category indicates that the quality of the automatic annotations is so bad that they are unusable.": "1",
        }
    )
    .replace({"Mariam Aboian": "rad1", "Tej Verma": "ne1"})
)

# Disabled draft: convert StudyDate from m-d-y to YYYYMMDD.
# NOTE(review): the line below reads from `df`, which is not defined in the
# cells visible here - fix the reference before re-enabling.
# resp_df["StudyDate"] = pd.to_datetime(df["StudyDate"]).dt.strftime("%Y%m%d")


In [75]:
# Sanity check: within one reviewer's responses, every
# (PatientID, StudyDateFolder) pair should occur at most once.
reviewers = resp_df["Reviewer"].unique()
for reviewer in reviewers:
    reviewed_scans = resp_df.loc[
        resp_df["Reviewer"] == reviewer, ["PatientID", "StudyDateFolder"]
    ]
    # True for every repeat of a pair already seen earlier in the frame.
    repeat_mask = reviewed_scans.duplicated()
    if repeat_mask.any():
        print("Reviewer {} has duplicate scans reviewed".format(reviewer))
        # Show the offending repeats.
        print(reviewed_scans[repeat_mask])
    else:
        print("Reviewer {} has no duplicate scans reviewed".format(reviewer))


Reviewer ne1 has no duplicate scans reviewed
Reviewer rad1 has no duplicate scans reviewed


In [76]:
# Fill in qa-results: merge every form response into qa_res_df and fetch any
# corrected segmentation the reviewer uploaded.
#
# For each response the target row is chosen in this order:
#   1. the row for this scan already assigned to the reviewer,
#   2. the row for this scan with a blank Reviewer,
#   3. otherwise a new row, seeded with the scan-level columns of an existing
#      row for the same scan.
# A scan is identified by (PatientID, StudyDate, StudyDate_suffix).
for resp_i, resp_row in resp_df.iterrows():
    # "StudyDateFolder" is "<date>" or "<date>_<suffix>"; no suffix means "0".
    if "_" in resp_row["StudyDateFolder"]:
        study_date, suffix = resp_row["StudyDateFolder"].split("_")
    else:
        study_date = resp_row["StudyDateFolder"]
        suffix = "0"
    reader = resp_row["Reviewer"]
    pid = resp_row["PatientID"]

    # Rows of qa_res_df that describe the scan this response is about.
    same_scan = (
        (qa_res_df["PatientID"] == pid)
        & (qa_res_df["StudyDate"] == study_date)
        & (qa_res_df["StudyDate_suffix"] == suffix)
    )

    # find the row to edit in qa_res_df: reviewer's own row first, then a blank one
    index_to_edit = None
    for owner in (reader, ""):
        qa_rows = qa_res_df[same_scan & (qa_res_df["Reviewer"] == owner)]
        if len(qa_rows) > 1:
            raise RuntimeError("More than one row found for {}".format(resp_row))
        if len(qa_rows) == 1:
            index_to_edit = qa_rows.index[0]
            break

    if index_to_edit is None:
        # No reusable row: copy the scan-level columns from any row of this scan.
        qa_rows = qa_res_df[same_scan]
        if len(qa_rows) == 0:
            raise RuntimeError("No rows found for {}".format(resp_row))
        row_to_copy = qa_rows.iloc[0][
            [
                "Collection",
                "PatientID",
                "StudyInstanceUID",
                "PTSeriesInstanceUID",
                "CTSeriesInstanceUID",
                "StudyDate",
                "StudyDate_suffix",
                "Validation",
                "AISegmentation",
            ]
        ]
        # Pad the columns that are deliberately not copied with "" (the blank
        # marker used throughout this table, which is loaded with
        # keep_default_na=False). Without this, pd.concat fills them with NaN;
        # NaN is truthy, so the "CorrectedSegmentation already set?" check
        # below would wrongly skip the download for every newly added row.
        row_to_copy = row_to_copy.reindex(qa_res_df.columns, fill_value="")
        qa_res_df = pd.concat(
            [qa_res_df, row_to_copy.to_frame().T], ignore_index=True
        )
        # get index of new row
        index_to_edit = qa_res_df.index[-1]

    # edit row
    qa_res_df.loc[index_to_edit, "Reviewer"] = reader
    qa_res_df.loc[index_to_edit, "LikertScore"] = resp_row["LikertScore"]
    qa_res_df.loc[index_to_edit, "CommentsAboutAISegmentation"] = resp_row[
        "CommentsAboutAISegmentation"
    ]
    qa_res_df.loc[index_to_edit, "CommentsAboutScan"] = resp_row["CommentsAboutScan"]

    # download and save corrected segmentation
    url = resp_row["Please upload your corrected segmentation file"]
    if url:
        if qa_res_df.loc[index_to_edit, "CorrectedSegmentation"]:
            # A corrected segmentation is already recorded; do not overwrite it.
            continue

        resp_filename = f"{reader}_{pid}_{study_date}_{suffix}"

        # download; fail loudly on HTTP errors instead of saving an error page,
        # and do not hang forever on a stalled connection
        r = requests.get(convert_gdrive_link(url), allow_redirects=True, timeout=60)
        r.raise_for_status()
        with TemporaryDirectory() as tmpdir:
            tmp_file = Path(tmpdir) / "tmp"
            # write_bytes closes the handle before the file is sniffed below
            tmp_file.write_bytes(r.content)
            # The extension is chosen from the file contents, not from the link.
            if is_nrrd(tmp_file):
                resp_filename += ".nrrd"
            elif is_nii_gz(tmp_file):
                resp_filename += ".nii.gz"
            else:
                raise RuntimeError(f"Unknown file type for {resp_row}")

            resp_filepath = Path("qa-segmentations") / resp_filename
            resp_filepath.parent.mkdir(exist_ok=True, parents=True)
            shutil.copy(tmp_file, resp_filepath)
            qa_res_df.loc[index_to_edit, "CorrectedSegmentation"] = resp_filename
            qa_res_df.to_csv("qa-results.csv", index=False)  # checkpoint

In [77]:
# Write the merged QA table back to disk, ordered by reviewer first and then
# by validation flag, collection, patient and study date/suffix.
qa_res_df = qa_res_df.sort_values(
    by=[
        "Reviewer",
        "Validation",
        "Collection",
        "PatientID",
        "StudyDate",
        "StudyDate_suffix",
    ]
)
qa_res_df.to_csv("qa-results.csv", index=False)
