# Data preparation
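1. Copy the images into one folder per class (in this notebook, one folder per city), following the classification dataset structure documented at https://docs.ultralytics.com/datasets/classify/. The tree below is the CIFAR-10 example from that page: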
```
cifar-10-/
|
|-- train/
|   |-- airplane/
|   |   |-- 10008_airplane.png
|   |   |-- 10009_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 1000_automobile.png
|   |   |-- 1001_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 10014_bird.png
|   |   |-- 10015_bird.png
|   |   |-- ...
|   |
|   |-- ...
|
|-- test/
|   |-- airplane/
|   |   |-- 10_airplane.png
|   |   |-- 11_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 100_automobile.png
|   |   |-- 101_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 1000_bird.png
|   |   |-- 1001_bird.png
|   |   |-- ...
|   |
|   |-- ...
```

In [12]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
# --- Locations on the shared filesystem --------------------------------------
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")

# Create every output directory that does not exist yet. The dataset root is
# listed before its train/test children so parents are made first.
for required_dir in (CURATED_FOLDER, YOLOFOLDER, TRAIN_FOLDER, TEST_FOLDER):
    if os.path.exists(required_dir):
        continue
    os.makedirs(required_dir)

# --- Curated train/test tables ------------------------------------------------
train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"
train_df, test_df = (
    pd.read_parquet(os.path.join(CURATED_FOLDER, parquet_name))
    for parquet_name in (train_file_name, test_file_name)
)


        
# Map each split name to the DataFrame that lists its images. The columns read
# below are 'city' (used as the class sub-folder name) and 'path' (source file).
dataset = {
    "train": train_df,
    "test": test_df,
}

# Report when the two splits do not cover the same cities.
# NOTE(review): class indices are presumably derived from the sub-folder names
# of each split, so mismatched class sets may skew validation -- confirm.
split_cities = {
    name: set(frame['city'].dropna().unique()) for name, frame in dataset.items()
}
if split_cities["train"] != split_cities["test"]:
    mismatch = sorted(split_cities["train"] ^ split_cities["test"])
    print(f"WARNING: cities present in only one split: {mismatch}")

for folder, data in dataset.items():
    print(f"working on the {folder} data")
    # Iterate over the cities that occur in THIS split. Using the train cities
    # for both splits would create empty class folders under test/ (and drop
    # test-only cities); an image-folder loader rejects empty class folders.
    for city, temp in tqdm(data.groupby('city', sort=False)):
        city_folder = os.path.join(YOLOFOLDER, folder, city)
        # Do not skip a city just because its folder already exists: a folder
        # left behind by an interrupted run would otherwise stay empty forever.
        os.makedirs(city_folder, exist_ok=True)
        print(f"start to copy data for city {city}")
        for path in tqdm(temp['path'].values):
            target = os.path.join(city_folder, os.path.basename(path))
            # Per-file check keeps re-runs cheap while still resuming a
            # partially copied city.
            if not os.path.exists(target):
                shutil.copy(path, target)
        print(f"{city} is done")
        print("*"*100)

# # load the large train and test datasets and reorganize them
# files = glob.glob(TRANSFORMED_FOLDER+"/*.csv")
# train_df= []
# test_df = []
# for f in tqdm(files):
#     temp = pd.read_csv(f)
#     temp['data_group'] = temp['data_group'].apply(lambda x: x.replace('sel','test'))
#     train_df.append(temp[temp['data_group']=='train'])
#     test_df.append(temp[temp['data_group']=='test'])

# train_df = pd.concat(train_df).reset_index(drop = True)
# test_df = pd.concat(test_df).reset_index(drop = True)
# train_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, train_file_name), index = False)
# test_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, test_file_name), index = False)

# Test training

In [1]:
import os
import glob
# Root of the image-folder dataset; the layout is <root>/<split>/<class>/<image>.
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
# Three wildcard levels: split folder, class (city) folder, image file.
files = glob.glob(YOLOFOLDER+'/*/*/*')
# Show only the number of files; echoing the list itself would dump every path
# into the notebook output.
len(files)

44334

In [3]:
import pandas as pd
# File extension = text after the last "." of the final path component
# (the whole file name when it contains no ".").
extensions = [p.rpartition("/")[2].rpartition(".")[2] for p in files]
df = pd.DataFrame({'path': files, 'file_type': extensions})
# Number of files per extension.
df.groupby('file_type').size()

file_type
jpg    44334
dtype: int64

In [4]:
from ultralytics import YOLO
from tqdm import tqdm

# Settings for this trial training run.
PRETRAINED_WEIGHTS = 'yolov8n-cls.pt'
N_EPOCHS = 10
IMAGE_SIZE = 416

# Start from the pretrained classification checkpoint (recommended for training).
model = YOLO(PRETRAINED_WEIGHTS)

# Train on the image-folder dataset rooted at YOLOFOLDER.
results = model.train(
    data=YOLOFOLDER,
    epochs=N_EPOCHS,
    imgsz=IMAGE_SIZE,
)

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n-cls.pt to 'yolov8n-cls.pt'...
100%|██████████| 5.28M/5.28M [00:00<00:00, 14.1MB/s]
New https://pypi.org/project/ultralytics/8.2.2 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.145 🚀 Python-3.8.17 torch-2.0.1+cu117 CUDA:0 (Tesla T4, 15110MiB)
[34m[1mengine/trainer: [0mtask=classify, mode=train, model=yolov8n-cls.pt, data=/mnt/03_gsv/t_classifier_img_yolo8, epochs=10, patience=50, batch=16, imgsz=400, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plot

FileNotFoundError: Found no valid file for the classes Bacolod, Bangalore, Bangkok, Belgrade, Belo Horizonte, Berezniki, Berlin, Bogotá, Boston, Brussels, Budapest, Buenos Aires, Capetown, Cebu City, Chicago, Cleveland, Cochabamba, Copenhagen, Culiacan, Curitiba, Delhi, Denver, Dhaka, Dubai, Fukuoka, Gaborone, Gainesville, FL, Gombe, Guadalajara, Guatemala City, Hindupur, Hong Kong, Houston, Hyderabad, Ilheus, Istanbul, Jaipur, Jakarta, Jalna, Jerusalem, Johannesburg, Kampala, Kaunas, Kigali, Killeen, Kozhikode, Kuala Lumpur, Kyiv, Lagos, Le Mans, Lima, London, Los Angeles, Madrid, Manchester, Manila, Medan, Metro Manila, Mexico City, Miami, Milan, Minneapolis, Modesto, Montreal, Mumbai, Munich, Nagoya, Nairobi, Okayama, Palembang, Palermo, Parbhani, Paris, Portland, OR, Pune, Quito, Rajshahi, Raleigh, Reynosa, Ribeirao Preto, Rio de Janeiro, Rome, Rovno, Saidpur, Saint Petersburg, San Francisco, Santiago, Sao Paulo, Seoul, Sitapur, Sydney, Taipei, Tel Aviv, Thessaloniki, Tokyo, Toledo, Toronto, Tyumen, Victoria, Vienna, Vijayawada, Warsaw, Wellington, Zwolle. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp