# Data preparation
1. Move the data into the folder layout that Ultralytics expects for classification datasets (one sub-folder per class under `train/` and `test/`); see https://docs.ultralytics.com/datasets/classify/
```
cifar-10-/
|
|-- train/
|   |-- airplane/
|   |   |-- 10008_airplane.png
|   |   |-- 10009_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 1000_automobile.png
|   |   |-- 1001_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 10014_bird.png
|   |   |-- 10015_bird.png
|   |   |-- ...
|   |
|   |-- ...
|
|-- test/
|   |-- airplane/
|   |   |-- 10_airplane.png
|   |   |-- 11_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 100_automobile.png
|   |   |-- 101_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 1000_bird.png
|   |   |-- 1001_bird.png
|   |   |-- ...
|   |
|   |-- ...
```

In [1]:
# Reference tutorial: https://sidharkal.medium.com/image-classification-with-yolov8-40a14fe8e4bc

In [79]:
import glob
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm

# Input CSVs (one per file matched below) and the output location for the curated parquet files.
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
os.makedirs(CURATED_FOLDER, exist_ok=True)

YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")

train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"

# Minimum number of 'train' rows wanted per CSV; smaller train splits are
# topped up with rows sampled from the same file's 'val' rows.
MIN_TRAIN_ROWS = 20000
# NOTE(review): the loop starts at file 24 and skips the earlier ones; this
# looks like a leftover from resuming an interrupted run -- set to 0 to
# process every CSV. TODO confirm.
FIRST_FILE_INDEX = 24

files = glob.glob(TRANSFORMED_FOLDER+"/*.csv")
train_df = []
test_df = []
for f in tqdm(files[FIRST_FILE_INDEX:]):
    temp = pd.read_csv(f)
    # Rows tagged 'sel' are treated as the test split.
    temp['data_group'] = temp['data_group'].apply(lambda x: x.replace('sel','test'))
    # De-duplicate on the image file name (last path component).
    temp['name'] = temp['path'].apply(lambda x: x.split("/")[-1])
    temp = temp.drop_duplicates("name").reset_index(drop=True)

    train_count = int((temp['data_group'] == 'train').sum())
    if train_count < MIN_TRAIN_ROWS:
        delta = MIN_TRAIN_ROWS - train_count
        # Sample additional training rows from the validation set. The subset
        # must keep temp's own index labels: re-indexing it would make
        # `val_to_train.index` point at unrelated rows of `temp`.
        val_temp = temp[temp['data_group'] == 'val']
        if len(val_temp) >= delta:
            val_to_train = val_temp.sample(n=delta, random_state=1)
            temp.loc[val_to_train.index, 'data_group'] = 'train'
        # When there are not enough validation rows the labels are left
        # untouched; either way the cleaned table is written back to the CSV.
        temp.drop("name", axis=1).to_csv(f, index=False)

    train_df.append(temp[temp['data_group']=='train'])
    test_df.append(temp[temp['data_group']=='test'])

# train_df / test_df are still lists of per-file frames here; they are
# concatenated and written to parquet in a separate step.

100%|██████████| 103/103 [06:10<00:00,  3.59s/it]


In [80]:
# Merge the per-file frames into one table per split, then persist only the
# columns needed downstream into the curated folder.
EXPORT_COLUMNS = ['path', 'data_group', 'city', 'label']
train_df = pd.concat(train_df).reset_index(drop=True)
test_df = pd.concat(test_df).reset_index(drop=True)

train_parquet_path = os.path.join(CURATED_FOLDER, train_file_name)
train_df[EXPORT_COLUMNS].to_parquet(train_parquet_path, index=False)

test_parquet_path = os.path.join(CURATED_FOLDER, test_file_name)
test_df[EXPORT_COLUMNS].to_parquet(test_parquet_path, index=False)

In [74]:
# Debug view of the validation rows most recently sampled for promotion to the
# training split; the name is only bound inside the data-preparation loop.
val_to_train

Unnamed: 0,path,label,data_group,city,name
31152,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,Jy1X_KpLTQczNrPxuAEqHQ_90.jpg
21634,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,gPRbg0OCDxBx3mhigpMQ9w_90.jpg
1118,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,ofDBlE6FBphaTKo4cKIrKA_0.jpg
16399,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,tsMKz5HNZBbcmBlpyqZXqw_270.jpg
31913,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,TeI3F27BP5l2tIcpea_UuA_270.jpg
...,...,...,...,...,...
74522,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,mw4zNPhm8ErNcvDv9DnKfQ_90.jpg
61058,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,UpzWma3wGCkmcv5Jq9hhhw_180.jpg
59650,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,xlOzOI3O3lJxRnDfJxWc3g_0.jpg
17237,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,2BSRQs62pr4Wf1xtNschxQ_0.jpg


In [71]:
# Debug view of the last per-file table left in `temp` by the preparation loop.
temp

Unnamed: 0,path,label,data_group,city,name
0,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,uDpHAgs4caw7o1KwLgEIKw_0.jpg
1,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,uDpHAgs4caw7o1KwLgEIKw_90.jpg
2,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,uDpHAgs4caw7o1KwLgEIKw_180.jpg
3,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,uDpHAgs4caw7o1KwLgEIKw_270.jpg
4,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,pA4Pwb_N2JlJ4662roQlGQ_0.jpg
...,...,...,...,...,...
90639,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,M3jt60iAlCGHoyR7mTNtBQ_270.jpg
90640,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,FOcZLOoVd2b1_mW4a7FqlQ_0.jpg
90641,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,FOcZLOoVd2b1_mW4a7FqlQ_90.jpg
90642,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,76,val,Modesto,FOcZLOoVd2b1_mW4a7FqlQ_180.jpg


In [11]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
os.makedirs(CURATED_FOLDER, exist_ok=True)

# Destination tree: <YOLOFOLDER>/<train|test>/<city>/<image file>
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")
for fl in [YOLOFOLDER, TRAIN_FOLDER, TEST_FOLDER]:
    os.makedirs(fl, exist_ok=True)

train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"
train_df = pd.read_parquet(os.path.join(CURATED_FOLDER, train_file_name))
test_df = pd.read_parquet(os.path.join(CURATED_FOLDER, test_file_name))

dataset = {
    "train": train_df,
    "test": test_df}

for folder in ['train', 'test']:
    print(f"working on the {folder} data")
    data = dataset[folder]
    # The city list always comes from the training table, so both splits get
    # the same set of class folders.
    for city in tqdm(train_df['city'].unique()):
        city_folder = os.path.join(YOLOFOLDER, folder, city)
        # An existing city folder is treated as already copied and skipped.
        # NOTE(review): a copy that was interrupted half-way leaves a partial
        # folder that is never completed on re-run.
        if os.path.exists(city_folder):
            continue
        os.makedirs(city_folder)
        print(f"start to copy data for city {city}")
        temp = data[data['city']==city].reset_index(drop=True)
        for path in tqdm(temp['path'].values):
            # Copy into the city folder, keeping the original file name.
            shutil.copy(path, city_folder)
        print(f"{city} is done")
        print("*"*100)

# # load the large train and test datasets and reorganize them


# Test training

In [62]:
import os
import glob
import pandas as pd
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
files = glob.glob(YOLOFOLDER+'/*/*/*')

# Describe every file found three levels below YOLOFOLDER by splitting its
# path once: the last three components are <group>/<folder>/<name>, and the
# file type is whatever follows the final "." in the name.
_segments = [p.split("/") for p in files]
df = pd.DataFrame({
    'path': files,
    'file_type': [seg[-1].split(".")[-1] for seg in _segments],
    'folder': [seg[-2] for seg in _segments],
    'group': [seg[-3] for seg in _segments],
    'name': [seg[-1] for seg in _segments],
})

In [59]:
# Number of files per (group, folder) pair, stored in a 'count' column.
summary = df.groupby(['group', 'folder']).size().reset_index(name='count')
# Folders of the 'test' group that hold fewer than 1000 files.
is_small_test = summary['count'].lt(1000) & summary['group'].eq('test')
summary.loc[is_small_test, 'folder'].unique()

array([], dtype=object)

In [63]:
# How many 'train' folders hold fewer than 10000 files.
is_small_train = summary['count'].lt(10000) & summary['group'].eq('train')
summary.loc[is_small_train, 'folder'].nunique()

53

In [65]:
# 'train' folders with fewer than 10000 files, smallest first.
is_small_train = summary['count'].lt(10000) & summary['group'].eq('train')
summary.loc[is_small_train].sort_values(by='count')

Unnamed: 0,group,folder,count
167,train,Hindupur,3941
129,train,Antwerp,4336
143,train,Budapest,4708
249,train,Vijayawada,5376
228,train,Saidpur,5667
132,train,Auckland,5874
150,train,Cochabamba,6587
205,train,Mumbai,7076
175,train,Jalna,7258
247,train,Victoria,7790


In [46]:
# Names of the 'train' folders with fewer than 10000 files.
is_small_train = summary['count'].lt(10000) & summary['group'].eq('train')
summary.loc[is_small_train, 'folder'].unique()

array(['Accra', 'Antwerp', 'Auckland', 'Bangkok', 'Budapest', 'Chicago', 'Cirebon', 'Cochabamba', 'Culiacan', 'Curitiba', 'Denver', 'Detroit', 'Dzerzhinsk', 'Florianopolis', 'Fukuoka', 'Gombe', 'Guadalajara', 'Hindupur', 'Ilheus', 'Jaipur', 'Jalna', 'Jequie', 'Kanpur', 'Kaunas', 'Kozhikode', 'Malegaon', 'Medan',
       'Miami', 'Milan', 'Montreal', 'Mumbai', 'Okayama', 'Palembang', 'Palermo', 'Palmas', 'Parbhani', 'Parepare', 'Pune', 'Quito', 'Rajshahi', 'Reynosa', 'Saidpur', 'San Francisco', 'Santiago', 'Sitapur', 'Stockholm', 'Thessaloniki', 'Valledupar', 'Victoria', 'Vienna', 'Vijayawada', 'Yamaguchi', 'Zwolle'],
      dtype=object)

In [42]:
# Sanity check for one city: count how many of Modesto's test image names
# listed in test_df are also present in the test/Modesto folder index.
in_modesto_test_folder = (df['group'] == 'test') & (df['folder'] == 'Modesto')
moved = df.loc[in_modesto_test_folder, 'name'].unique()
test_df['name'] = test_df['path'].map(lambda p: p.split("/")[-1])
to_move = test_df.loc[test_df['city'] == 'Modesto', 'name'].unique()
len(set(to_move) & set(moved))

2000

In [43]:
from ultralytics import YOLO
from tqdm import tqdm

# Start from the pretrained nano classification checkpoint (recommended for training).
PRETRAINED_WEIGHTS = 'yolov8n-cls.pt'
model = YOLO(PRETRAINED_WEIGHTS)

# Train on the folder-per-class dataset rooted at YOLOFOLDER.
# NOTE(review): the recorded run of this cell failed inside a DataLoader worker
# with "'NoneType' object has no attribute 'shape'", which suggests at least
# one file under YOLOFOLDER could not be loaded as an image -- verify the
# dataset before re-running.
train_options = dict(data=YOLOFOLDER, epochs=20, imgsz=416)
results = model.train(**train_options)

New https://pypi.org/project/ultralytics/8.2.2 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.107 🚀 Python-3.9.16 torch-2.0.1+cu117 CUDA:0 (Quadro RTX 6000, 24212MiB)
[34m[1myolo/engine/trainer: [0mtask=classify, mode=train, model=yolov8n-cls.pt, data=/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8, epochs=20, patience=50, batch=16, imgsz=416, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_width=None, visualize=False, augment

AttributeError: Caught AttributeError in DataLoader worker process 4.
Original Traceback (most recent call last):
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/ultralytics/yolo/data/dataset.py", line 260, in __getitem__
    sample = self.torch_transforms(im)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torchvision/transforms/transforms.py", line 95, in __call__
    img = t(img)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/ultralytics/yolo/data/augment.py", line 877, in __call__
    imh, imw = im.shape[:2]
AttributeError: 'NoneType' object has no attribute 'shape'


In [None]:
# Validate the model
metrics = model.val() # no arguments needed, dataset and settings remembered
# Only the final expression of a cell is displayed, so show both accuracies
# together as (top1, top5) instead of silently discarding top1.
metrics.top1, metrics.top5

In [None]:
# Run the classifier on one image. predict() returns one entry per input
# image, so take the first (and only) entry.
# NOTE(review): this path looks like a placeholder copied from a tutorial --
# point it at an image from this project's test folder before running.
results = model.predict("/kaggle/working/data/test/Cricket/704bb73ae1.jpg")
result = results[0]
probs = result.probs # Probs object for classification outputs
print(probs.data)