In [1]:
import os 

import pandas as pd
import numpy as np
import webdataset as wds
import random

In [2]:
def seed_everything(seed):
    """Seed the global RNGs used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` into this process's environment.

    Note: CPython reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    exported value affects child processes started afterwards, not the hash
    randomization of the kernel that is already running.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

In [3]:
# Root folder of the CheXpert-v1.0-small download, relative to this notebook.
chexpert_path = "../../input/CheXpert-v1.0-small/"

# Load the training table; each row carries an image "Path" plus metadata and labels.
train_csv = os.path.join(chexpert_path, "train.csv")
train_df = pd.read_csv(train_csv)

In [4]:
# Shuffle every row with a fixed random_state, then renumber the index 0..n-1.
# NOTE: this rebinds train_df, so re-running the cell shuffles the already
# shuffled frame and produces a different order than a fresh Run All.
train_df = (
    train_df
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Preview the first five rows of the shuffled frame.
train_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00082/study1/...,Male,18,Frontal,PA,1.0,0.0,,,,,,,,0.0,0.0,,,
1,CheXpert-v1.0-small/train/patient35759/study9/...,Male,90,Frontal,AP,,,1.0,,,1.0,-1.0,,1.0,,,,,1.0
2,CheXpert-v1.0-small/train/patient04789/study1/...,Male,39,Lateral,,,,,,1.0,,,,,,1.0,,,
3,CheXpert-v1.0-small/train/patient38491/study5/...,Male,60,Frontal,AP,,,,,,1.0,,,,,,,,1.0
4,CheXpert-v1.0-small/train/patient06537/study3/...,Female,66,Frontal,AP,,,0.0,1.0,,,0.0,-1.0,-1.0,0.0,0.0,,,


In [6]:
# List the column names: the image path, four metadata fields, and 14 finding labels.
train_df.columns

Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices'],
      dtype='object')

In [7]:
# Build an identifier from path components 2 and 3, e.g.
# "CheXpert-v1.0-small/train/patient00082/study1/..." -> "patient00082_study1".
# NOTE: despite the column name, the value identifies a patient *and* a study.
train_df["patient_id"] = train_df["Path"].apply(
    lambda path: "_".join(path.split("/")[2:4])
)

# The 14 finding-label columns of the CheXpert table.
LABEL_COLUMNS = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

# Replace missing (NaN) labels with 0; existing values, including the -1.0
# entries visible in the preview, are left unchanged.
# The filled block is assigned back instead of calling
# `train_df[col].fillna(0, inplace=True)`: that form is chained assignment,
# which warns on pandas 2.x and leaves train_df untouched once Copy-on-Write
# is active (the default from pandas 3.0).
train_df[LABEL_COLUMNS] = train_df[LABEL_COLUMNS].fillna(0)


In [9]:
def readfile(file_path):
    """Return the entire contents of ``file_path`` as raw bytes.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.

    Returns:
        A ``bytes`` object holding the file's full contents.
    """
    with open(file_path, mode="rb") as handle:
        payload = handle.read()
    return payload

In [14]:
# Pack every image together with its metadata row into WebDataset tar shards.
SHARD_PATTERN = "../../input/chexpert-shards/chexpert-train-%06d.tar"
IMAGE_ROOT = "../../input"  # values in the "Path" column are joined onto this directory
SAMPLES_PER_SHARD = 10000

# Create the output folder up front rather than relying on the writer to do so;
# exist_ok keeps the cell re-runnable.
os.makedirs(os.path.dirname(SHARD_PATTERN), exist_ok=True)

# Column bookkeeping does not change per row, so compute it once outside the loop.
all_cols = list(train_df.columns)
path_pos = all_cols.index("Path")

with wds.ShardWriter(SHARD_PATTERN, maxcount=SAMPLES_PER_SHARD) as sink:
    # Stream rows as plain tuples (one pass over the frame) instead of doing a
    # separate .loc lookup for every cell; enumerate() supplies the running
    # sample number, so the keys no longer depend on the index labels being 0..n-1.
    rows = train_df.itertuples(index=False, name=None)
    for idx, row in enumerate(rows):
        image = readfile(os.path.join(IMAGE_ROOT, row[path_pos]))

        # Every column except "Path" is stored as its str() form.
        meta = {
            col: str(value)
            for col, value in zip(all_cols, row)
            if col != "Path"
        }

        sink.write(
            {
                "__key__": "%06d" % idx,
                "jpg": image,
                "json": meta,
            }
        )

# writing ../../input/chexpert-shards/chexpert-train-000000.tar 0 0.0 GB 0
# writing ../../input/chexpert-shards/chexpert-train-000001.tar 10000 0.5 GB 10000
# writing ../../input/chexpert-shards/chexpert-train-000002.tar 10000 0.5 GB 20000
# writing ../../input/chexpert-shards/chexpert-train-000003.tar 10000 0.5 GB 30000
# writing ../../input/chexpert-shards/chexpert-train-000004.tar 10000 0.5 GB 40000
# writing ../../input/chexpert-shards/chexpert-train-000005.tar 10000 0.5 GB 50000
# writing ../../input/chexpert-shards/chexpert-train-000006.tar 10000 0.5 GB 60000
# writing ../../input/chexpert-shards/chexpert-train-000007.tar 10000 0.5 GB 70000
# writing ../../input/chexpert-shards/chexpert-train-000008.tar 10000 0.5 GB 80000
# writing ../../input/chexpert-shards/chexpert-train-000009.tar 10000 0.5 GB 90000
# writing ../../input/chexpert-shards/chexpert-train-000010.tar 10000 0.5 GB 100000
# writing ../../input/chexpert-shards/chexpert-train-000011.tar 10000 0.5 GB 110000
# writing 