Processing X-ray images from the covid-chestxray-dataset
========================================================

In [1]:
import os
import pandas as pd
import shutil
# Raise the column-width limit so long values are shown in full instead of being truncated.
pd.set_option("display.max_colwidth", 10000)

In [2]:
def not_found(path: str) -> bool:
    """Return True when nothing exists at ``path``, False otherwise."""
    if os.path.exists(path):
        return False
    return True

In [3]:
# Upstream GitHub repository that the dataset is cloned / pulled from.
git_url: str = "https://github.com/ieee8023/covid-chestxray-dataset"

In [4]:
# Local clone of the dataset repository and the images folder inside it.
repo_dir = "/data/sources/covid-chestxray-dataset"
assert os.path.exists(repo_dir), "repo_dir should exist!"
image_dir = os.path.join(repo_dir, "images")
assert os.path.exists(image_dir), "images should exist!"

# Output folder that receives the copied images; created on first run.
data_dir = "/data/sources/covid-19-cv/data"
if not_found(data_dir):
    # The message previously referenced the undefined name `dat_dir`, which raised
    # NameError before the folder could be created.
    print("the folder "+data_dir+" does not exist! Making it now!")
    # exist_ok guards against the folder appearing between the check and the call.
    os.makedirs(data_dir, exist_ok=True)

In [5]:
import git
from git import Repo
# Bring the local clone up to date, or create it when it is missing.
# `git_url` is defined in an earlier cell and is not redefined here.
# NOTE(review): an earlier cell asserts that repo_dir exists, so on a normal top-to-bottom
# run only the "pull" branch is reachable — confirm whether the clone branch is still wanted.
if os.path.exists(repo_dir):
    print(repo_dir + " exists!")
    # A space was missing before "to", gluing the URL to the next word in the output.
    print("git pulling from "+git_url+" to update it")
    g = git.cmd.Git(repo_dir)
    g.pull()
    print("pulling finished")
else:
    print("git cloning repository with data to "+repo_dir)
    Repo.clone_from(git_url, repo_dir)
    print("cloning finished")

/data/sources/covid-chestxray-dataset exists!
git pulling from https://github.com/ieee8023/covid-chestxray-datasetto update it
pulling finished


In [6]:
# Location of the dataset's metadata table inside the cloned repository.
# os.path.join (as used for image_dir) avoids a doubled separator when repo_dir
# ends with one, which os.path.sep.join would produce.
metadata_path = os.path.join(repo_dir, "metadata.csv")

In [7]:
# Read the metadata table and preview its first five rows (head() defaults to n=5).
metadata = pd.read_csv(metadata_path)
metadata.head()

Unnamed: 0,Patientid,offset,sex,age,finding,survival,view,modality,date,location,filename,doi,url,license,clinical notes,other notes,Unnamed: 16
0,2,0.0,M,65.0,COVID-19,Y,PA,X-ray,2020,,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,,,
1,2,3.0,M,65.0,COVID-19,Y,PA,X-ray,2020,,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,,,
2,2,5.0,M,65.0,COVID-19,Y,PA,X-ray,2020,,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,,,
3,2,6.0,M,65.0,COVID-19,Y,PA,X-ray,2020,,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc2001272,,,,
4,4,0.0,F,52.0,COVID-19,,PA,X-ray,2020,"Changhua Christian Hospital, Changhua City, Taiwan",nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc2001573,,diffuse infiltrates in the bilateral lower lungs,,


In [11]:
def copy_images(df: pd.DataFrame, image_dir: str, data_dir: str) -> pd.DataFrame:
    """Copy every image listed in ``df`` into ``data_dir/<view>/<finding>/``.

    Spaces in the ``view`` and ``finding`` values are replaced by underscores when
    building the folder names. Target folders are created on demand and images
    that already exist at the destination are left untouched.

    Args:
        df: Frame with string columns ``view``, ``finding`` and ``filename``.
            Missing (NaN) values in these columns are not handled and will raise.
        image_dir: Folder that contains the source images named by ``filename``.
        data_dir: Root folder under which the images are sorted.

    Returns:
        A copy of ``df`` with an extra ``path`` column holding the destination
        path of each image. The frame passed in is not modified.

    Raises:
        FileNotFoundError: If a source image is missing (raised by ``shutil.copy2``).
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    result = df.copy()

    # Build the paths with a comprehension instead of DataFrame.apply(axis=1):
    # for an empty frame, apply may hand back a DataFrame rather than a Series,
    # which cannot be assigned to a single column.
    result["path"] = [
        os.path.join(data_dir, view.replace(" ", "_"), finding.replace(" ", "_"), filename)
        for view, finding, filename in zip(result["view"], result["finding"], result["filename"])
    ]

    for filename, to_image in zip(result["filename"], result["path"]):
        target_dir = os.path.dirname(to_image)
        if not os.path.exists(target_dir):
            print(target_dir+" not found, creating it!")
            os.makedirs(target_dir, exist_ok=True)
        # Skip images copied by a previous run.
        if not os.path.exists(to_image):
            shutil.copy2(os.path.join(image_dir, filename), to_image)
    return result

In [14]:
# Sort the images into view/finding folders and preview the resulting destination paths.
df = copy_images(df=metadata, image_dir=image_dir, data_dir=data_dir)
df["path"].head(10)

0    /data/sources/covid-19-cv/data/PA/COVID-19/auntminnie-a-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg
1    /data/sources/covid-19-cv/data/PA/COVID-19/auntminnie-b-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg
2    /data/sources/covid-19-cv/data/PA/COVID-19/auntminnie-c-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg
3    /data/sources/covid-19-cv/data/PA/COVID-19/auntminnie-d-2020_01_28_23_51_6665_2020_01_28_Vietnam_coronavirus.jpeg
4                                                     /data/sources/covid-19-cv/data/PA/COVID-19/nejmc2001573_f1a.jpeg
5                                                     /data/sources/covid-19-cv/data/PA/COVID-19/nejmc2001573_f1b.jpeg
6                                                                /data/sources/covid-19-cv/data/PA/ARDS/ARDSSevere.png
7                                                         /data/sources/covid-19-cv/data/PA/COVID-19/lancet-case2a.jpg
8                                               

In [18]:
# Save the frame (with its "path" column) as metadata.csv inside data_dir.
df.to_csv(
    os.path.join(data_dir, "metadata.csv"),
    index=False,
    header=True,
)