# Data preparation
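1. Copy the images into the folder layout that Ultralytics expects for classification datasets, with one sub-folder per class inside `train/` and `test/`, as described at https://docs.ultralytics.com/datasets/classify/. For example: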
```
cifar-10-/
|
|-- train/
|   |-- airplane/
|   |   |-- 10008_airplane.png
|   |   |-- 10009_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 1000_automobile.png
|   |   |-- 1001_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 10014_bird.png
|   |   |-- 10015_bird.png
|   |   |-- ...
|   |
|   |-- ...
|
|-- test/
|   |-- airplane/
|   |   |-- 10_airplane.png
|   |   |-- 11_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 100_automobile.png
|   |   |-- 101_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 1000_bird.png
|   |   |-- 1001_bird.png
|   |   |-- ...
|   |
|   |-- ...
```

In [12]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
# --- Configuration -----------------------------------------------------------
# The train/test metadata tables (parquet) are read from CURATED_FOLDER; the
# images they reference are copied into a classification layout under
# YOLOFOLDER:  <YOLOFOLDER>/<split>/<city>/<image file>
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
os.makedirs(CURATED_FOLDER, exist_ok=True)

YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")
for fl in [YOLOFOLDER, TRAIN_FOLDER, TEST_FOLDER]:
    os.makedirs(fl, exist_ok=True)

# --- Load the metadata -------------------------------------------------------
train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"
train_df = pd.read_parquet(os.path.join(CURATED_FOLDER, train_file_name))
test_df = pd.read_parquet(os.path.join(CURATED_FOLDER, test_file_name))

dataset = {
    "train": train_df,
    "test": test_df}

# --- Copy the images, one class folder per city ------------------------------
for folder in ['train', 'test']:
    print(f"working on the {folder} data")
    data = dataset[folder]
    # Take the cities from the split being processed (not always from
    # train_df): a city with no rows in this split would otherwise get an
    # empty class folder, which the YOLO classification loader rejects with
    # "Found no valid file for the classes <city>".
    for city in tqdm(data['city'].unique()):
        temp = data[data['city'] == city].reset_index(drop=True)
        city_folder = os.path.join(YOLOFOLDER, folder, city)
        os.makedirs(city_folder, exist_ok=True)
        print(f"start to copy data for city {city}")
        for path in tqdm(temp['path'].values):
            destination = os.path.join(city_folder, os.path.basename(path))
            # Skip per file rather than per folder, so that an interrupted
            # run can be resumed without leaving a city half-copied or empty.
            if not os.path.exists(destination):
                shutil.copy(path, destination)
        print(f"{city} is done")
        print("*"*100)

# # load the large train and test datasets and reorganize them
# files = glob.glob(TRANSFORMED_FOLDER+"/*.csv")
# train_df= []
# test_df = []
# for f in tqdm(files):
#     temp = pd.read_csv(f)
#     temp['data_group'] = temp['data_group'].apply(lambda x: x.replace('sel','test'))
#     train_df.append(temp[temp['data_group']=='train'])
#     test_df.append(temp[temp['data_group']=='test'])

# train_df = pd.concat(train_df).reset_index(drop = True)
# test_df = pd.concat(test_df).reset_index(drop = True)
# train_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, train_file_name), index = False)
# test_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, test_file_name), index = False)

# Test training

In [5]:
import os
import glob
# Root of the image tree laid out for YOLOv8 classification.
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
# Three wildcard levels below the root, i.e. <split>/<city>/<image file>.
files = glob.glob("/".join((YOLOFOLDER, "*", "*", "*")))
files

['/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/3q4COLq5yNnvhTPRtOI7Fg_90.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/yZ5fkWgPhxsnenNSXed5Zg_270.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/1VBBmsCLILiYU3t5bSqysw_180.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/vYSIhmpYzMQAAAQ7LtadNQ_180.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/aKq7MUZZOxDAJnuUTKfbnQ_180.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/snZymQUzJANEniyfO0P_MA_90.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/mOgN0Qz4E2awOHu2i8AKyA_180.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8/train/Dubai/4pUfFf5cT9kjYZ6Nxs3MIg_90.jpg',
 '/lustre1/g/geog_pyloo/05_timemachine/_tra

In [None]:
import pandas as pd
# Empty placeholder frame; nothing in this cell fills it.
df = pd.DataFrame(data={})
# Number of paths matched by the glob over YOLOFOLDER.
len(files)

In [1]:
from ultralytics import YOLO
from tqdm import tqdm

# Load a model
model = YOLO('yolov8n-cls.pt')  # load a pretrained model (recommended for training)

# Dataset root containing the train/ and test/ folders, each holding one
# sub-folder per class (city). Every class folder must contain at least one
# image, otherwise training stops with
# "FileNotFoundError: Found no valid file for the classes <name>".
YOLOTRAIN = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
# Train the model
# Pass the path defined in this cell instead of YOLOFOLDER, which only exists
# if another cell has already been run in the same kernel.
results = model.train(data=YOLOTRAIN, epochs=50, imgsz=400)

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n-cls.pt to yolov8n-cls.pt...
100%|██████████| 5.28M/5.28M [00:00<00:00, 17.6MB/s]
New https://pypi.org/project/ultralytics/8.1.47 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.107 🚀 Python-3.9.16 torch-2.0.1+cu117 CUDA:0 (Quadro RTX 6000, 24212MiB)
[34m[1myolo/engine/trainer: [0mtask=classify, mode=train, model=yolov8n-cls.pt, data=/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8, epochs=50, patience=50, batch=16, imgsz=400, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True,

FileNotFoundError: Found no valid file for the classes Modesto. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp