In [21]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import wandb

In [7]:
# Root folder of the dataset. This is a relative path, so it is resolved
# against the kernel's current working directory.
DATA_PATH = "../../corn"
# Sanity check: the cell displays True when DATA_PATH is an existing directory.
os.path.isdir(DATA_PATH)

True

In [8]:
# Read the training CSV that lives under DATA_PATH and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer=f"{DATA_PATH}/train.csv")
train_df.head(n=5)

Unnamed: 0,seed_id,view,image,label
0,0,top,train/00000.png,broken
1,1,bottom,train/00001.png,pure
2,3,top,train/00003.png,broken
3,4,top,train/00004.png,pure
4,5,top,train/00005.png,discolored


In [9]:
# Number of rows in the training frame.
train_df.shape[0]

14322

In [10]:
# Rows per class in the `label` column, most frequent first.
train_df["label"].value_counts()

pure          5837
broken        4554
discolored    2504
silkcut       1427
Name: label, dtype: int64

In [11]:
# Read the test CSV that lives under DATA_PATH and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer=f"{DATA_PATH}/test.csv")
test_df.head(n=5)

Unnamed: 0,seed_id,view,image
0,2,top,test/00002.png
1,11,bottom,test/00011.png
2,13,top,test/00013.png
3,19,bottom,test/00019.png
4,27,bottom,test/00027.png


## Utility

In [31]:
def log_data_as_table(df, data_type="train"):
    """Upload one split of the dataset to W&B as a Table stored in an Artifact.

    Every row of ``df`` becomes one table row holding the seed id, the view,
    the image loaded from ``DATA_PATH`` and, for the training split, the label.
    The artifact is logged with ``wandb.log_artifact``, so call this after
    ``wandb.init`` while the run is still open.

    Args:
        df: DataFrame with ``seed_id``, ``view`` and ``image`` columns, where
            ``image`` is a file path relative to ``DATA_PATH``. A ``label``
            column is also required when ``data_type == "train"``. Any other
            columns are ignored.
        data_type: Name of the split. It is used as the artifact name and as
            the prefix of the table name (``"<data_type>-table"``). The value
            ``"train"`` additionally adds the ``label`` column to the table.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    # Declare the table columns explicitly, in the same order the row values
    # are built below. Deriving them from `df.columns` breaks as soon as the
    # frame carries an extra or reordered column, because `Table.add_data`
    # requires one value per declared column.
    columns = ["seed_id", "view", "image"]
    with_label = data_type == "train"
    if with_label:
        columns.append("label")

    # Fail fast, before any image is loaded, when the frame cannot be logged.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    data_artifact = wandb.Artifact(name=data_type, type="dataset")
    data_at = wandb.Table(columns=columns)

    # `total` lets tqdm draw a real progress bar with an ETA; itertuples avoids
    # building a pandas Series for every row.
    for record in tqdm(df.itertuples(index=False), total=len(df)):
        row_values = [
            record.seed_id,
            record.view,
            wandb.Image(f"{DATA_PATH}/{record.image}", mode="RGB"),
        ]
        if with_label:
            row_values.append(record.label)

        data_at.add_data(*row_values)

    # Store the table inside the artifact.
    data_artifact.add(data_at, f"{data_type}-table")
    # Log the artifact to W&B through the active run.
    wandb.log_artifact(data_artifact)

## Log the data as W&B Tables

In [None]:
# Train data: upload the training split (including labels) as a W&B Table.

# Using the run as a context manager finishes it even when the upload raises
# part-way through, instead of leaving the run open.
with wandb.init(
    project="pogchamp",
    job_type='upload',
) as run:
    log_data_as_table(train_df)

14322it [00:15, 953.27it/s]


In [None]:
# Test data: upload the test split (no labels) as a W&B Table.

# Using the run as a context manager finishes it even when the upload raises
# part-way through, instead of leaving the run open.
with wandb.init(
    project="pogchamp",
    job_type='upload',
) as run:
    # The test frame is loaded as `test_df`; `test_ds` was never defined and
    # raised a NameError.
    log_data_as_table(test_df, data_type="test")