# Data preparation
1. Move the data to corresponding folder per dataset structure here: https://docs.ultralytics.com/datasets/classify/
```
cifar-10-/
|
|-- train/
|   |-- airplane/
|   |   |-- 10008_airplane.png
|   |   |-- 10009_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 1000_automobile.png
|   |   |-- 1001_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 10014_bird.png
|   |   |-- 10015_bird.png
|   |   |-- ...
|   |
|   |-- ...
|
|-- test/
|   |-- airplane/
|   |   |-- 10_airplane.png
|   |   |-- 11_airplane.png
|   |   |-- ...
|   |
|   |-- automobile/
|   |   |-- 100_automobile.png
|   |   |-- 101_automobile.png
|   |   |-- ...
|   |
|   |-- bird/
|   |   |-- 1000_bird.png
|   |   |-- 1001_bird.png
|   |   |-- ...
|   |
|   |-- ...
```

In [34]:
#https://sidharkal.medium.com/image-classification-with-yolov8-40a14fe8e4bc
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
if not os.path.exists(CURATED_FOLDER):
    os.makedirs(CURATED_FOLDER)

In [79]:

    
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")

train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"

files = glob.glob(TRANSFORMED_FOLDER+"/*.csv")
train_df= []
test_df = []
for f in tqdm(files[24:]):
    temp = pd.read_csv(f)
    temp['data_group'] = temp['data_group'].apply(lambda x: x.replace('sel','test'))
    temp['name'] = temp['path'].apply(lambda x: x.split("/")[-1])
    temp = temp.drop_duplicates("name").reset_index(drop = True)
    train_temp = temp[temp['data_group']=='train'].reset_index(drop = True)
    train_count = train_temp.shape[0]
    if train_count<20000:
        delta = 20000-train_count
        # sample additional training from the validation set
        val_temp = temp[temp['data_group']=='val'].reset_index(drop = True)
        try:
            val_to_train = val_temp.sample(n = delta, random_state = 1)
            temp.loc[val_to_train.index, 'data_group'] = 'train'
            temp.drop("name", axis =1).to_csv(f, index = False)
        except:
            temp.drop("name", axis =1).to_csv(f, index = False)
        
    train_df.append(temp[temp['data_group']=='train'])
    test_df.append(temp[temp['data_group']=='test'])

# train_df = pd.concat(train_df).reset_index(drop = True)
# test_df = pd.concat(test_df).reset_index(drop = True)
# train_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, train_file_name), index = False)
# test_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, test_file_name), index = False)

100%|██████████| 103/103 [06:10<00:00,  3.59s/it]


In [80]:
train_df = pd.concat(train_df).reset_index(drop = True)
test_df = pd.concat(test_df).reset_index(drop = True)
train_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, train_file_name), index = False)
test_df[['path', 'data_group','city','label']].to_parquet(os.path.join(CURATED_FOLDER, test_file_name), index = False)

In [11]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"
CURATED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier"
if not os.path.exists(CURATED_FOLDER):
    os.makedirs(CURATED_FOLDER)
    
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
TRAIN_FOLDER = os.path.join(YOLOFOLDER, "train")
TEST_FOLDER = os.path.join(YOLOFOLDER, "test")
for fl in [YOLOFOLDER, TRAIN_FOLDER, TEST_FOLDER]:
    if not os.path.exists(fl):
        os.makedirs(fl)
        
        
train_file_name = "c_train.parquet"
test_file_name = "c_test.parquet"
train_df = pd.read_parquet(os.path.join(CURATED_FOLDER, train_file_name))
test_df = pd.read_parquet(os.path.join(CURATED_FOLDER, test_file_name))


        
dataset = {
    "train":train_df,
    "test":test_df}

for folder in ['train', 'test']:
    print(f"working on the {folder} data")
    for city in tqdm(train_df['city'].unique()):
        city_folder = os.path.join(YOLOFOLDER, folder, city)
        if not os.path.exists(city_folder):
            os.makedirs(city_folder)
            print(f"start to copy data for city {city}")
            data = dataset[folder]
            temp = data[data['city']==city].reset_index(drop = True)
            for path in tqdm(temp['path'].values):
                img_name = path.split("/")[-1]
                shutil.copy(path, cityfolder)
            print(f"{city} is done")
            print("*"*100)

# # load the large train and test dataset and reoganize them


# Test training

In [62]:
import os
import glob
import pandas as pd
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
files = glob.glob(YOLOFOLDER+'/*/*/*')

df = pd.DataFrame({'path':files})
df['file_type'] = df['path'].apply(lambda x: x.split("/")[-1].split(".")[-1])
df['folder'] = df['path'].apply(lambda x: x.split("/")[-2])
df['group'] = df['path'].apply(lambda x: x.split("/")[-3])
df['name'] = df['path'].apply(lambda x: x.split("/")[-1])

In [59]:
summary = df.groupby(['group','folder']).size().reset_index().rename(columns = {0:'count'})
summary[(summary['count']<1000)&(summary['group']=='test')]['folder'].unique()

array([], dtype=object)

In [46]:
summary[(summary['count']<10000)&(summary['group']=='train')]['folder'].unique()

array(['Accra', 'Antwerp', 'Auckland', 'Bangkok', 'Budapest', 'Chicago', 'Cirebon', 'Cochabamba', 'Culiacan', 'Curitiba', 'Denver', 'Detroit', 'Dzerzhinsk', 'Florianopolis', 'Fukuoka', 'Gombe', 'Guadalajara', 'Hindupur', 'Ilheus', 'Jaipur', 'Jalna', 'Jequie', 'Kanpur', 'Kaunas', 'Kozhikode', 'Malegaon', 'Medan',
       'Miami', 'Milan', 'Montreal', 'Mumbai', 'Okayama', 'Palembang', 'Palermo', 'Palmas', 'Parbhani', 'Parepare', 'Pune', 'Quito', 'Rajshahi', 'Reynosa', 'Saidpur', 'San Francisco', 'Santiago', 'Sitapur', 'Stockholm', 'Thessaloniki', 'Valledupar', 'Victoria', 'Vienna', 'Vijayawada', 'Yamaguchi', 'Zwolle'],
      dtype=object)

In [42]:
# check if all Modesto's image is in the testdata
moved = df[(df['group']=='test')&(df['folder']=='Modesto')]['name'].unique()
test_df['name'] = test_df['path'].apply(lambda x: x.split("/")[-1])
to_move = test_df[test_df['city']=='Modesto']['name'].unique()
len(set(to_move).intersection(set(moved)))

2000

In [43]:
from ultralytics import YOLO
from tqdm import tqdm

# Load a model
model = YOLO('yolov8n-cls.pt')  # load a pretrained model (recommended for training)

# Train the model
results = model.train(data=YOLOFOLDER, epochs=20, imgsz=416)

New https://pypi.org/project/ultralytics/8.2.2 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.107 🚀 Python-3.9.16 torch-2.0.1+cu117 CUDA:0 (Quadro RTX 6000, 24212MiB)
[34m[1myolo/engine/trainer: [0mtask=classify, mode=train, model=yolov8n-cls.pt, data=/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8, epochs=20, patience=50, batch=16, imgsz=416, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_width=None, visualize=False, augment

AttributeError: Caught AttributeError in DataLoader worker process 4.
Original Traceback (most recent call last):
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/ultralytics/yolo/data/dataset.py", line 260, in __getitem__
    sample = self.torch_transforms(im)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/torchvision/transforms/transforms.py", line 95, in __call__
    img = t(img)
  File "/lustre1/u/yuanzf/anaconda3/envs/yolo5/lib/python3.9/site-packages/ultralytics/yolo/data/augment.py", line 877, in __call__
    imh, imw = im.shape[:2]
AttributeError: 'NoneType' object has no attribute 'shape'


In [63]:
# Validate the model
metrics = model.val() # no arguments needed, dataset and settings remembered
metrics.top1 # top1 accuracy
metrics.top5 # top5 accuracy

Ultralytics YOLOv8.0.107 🚀 Python-3.9.16 torch-2.0.1+cu117 CUDA:0 (Quadro RTX 6000, 24212MiB)
               classes   top1_acc   top5_acc: 100%|██████████| 7931/7931 [09:50<00:00, 13.44it/s]
                   all      0.827      0.972
Speed: 0.2ms preprocess, 0.4ms inference, 0.0ms loss, 0.0ms postprocess per image
Results saved to [1mruns/classify/val2[0m


0.9715744256973267

In [None]:
results = model.predict("/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier/Capture.PNG")
probs = result.probs # Probs object for classification outputs
print(probs.data)

# Test Inference
1. [TO DO]: add further

In [169]:
from ultralytics import YOLO
from tqdm import tqdm
from glob import glob
import os
import pandas as pd
from multiprocessing import Pool
import numpy as np
TRANSFORMED_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier"

In [9]:
# load all data already in train and test
YOLOFOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
files = glob(YOLOFOLDER+'/*/*/*')

df = pd.DataFrame({'path':files})
df['file_type'] = df['path'].apply(lambda x: x.split("/")[-1].split(".")[-1])
df['folder'] = df['path'].apply(lambda x: x.split("/")[-2])
df['group'] = df['path'].apply(lambda x: x.split("/")[-3])
df['name'] = df['path'].apply(lambda x: x.split("/")[-1])
# check for duplicates
df["unique_count"] = df.groupby("name")["name"].transform("count")
print(df[df['unique_count']>1].shape[0])
df[df['unique_count']>1].groupby('folder').size() # this is not too bad. OK, keep moving

(118, 6)

In [16]:
files = glob(TRANSFORMED_FOLDER+"/*.csv") # these are all individual data
testdf = pd.read_csv(files[3])
testdf['name'] = testdf['path'].apply(lambda x: x.split("/")[-1])
valdf = testdf[~testdf["name"].isin(df["name"])].reset_index(drop = True)
n_all = testdf.shape[0]
n_val = valdf.shape[0]
print("total images: ", n_all, "total images can be used for validation: ", n_val)
valdf.head()

total images:  219197 total images can be used for validation:  208197


Unnamed: 0,path,label,data_group,city,name
0,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,80,val,Munich,sVt0eN0ZxmXRBOnC1WD0ug_0.jpg
1,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,80,val,Munich,sVt0eN0ZxmXRBOnC1WD0ug_90.jpg
2,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,80,val,Munich,sVt0eN0ZxmXRBOnC1WD0ug_180.jpg
3,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,80,val,Munich,sVt0eN0ZxmXRBOnC1WD0ug_270.jpg
4,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,80,val,Munich,suC1fdWHQPBj03oHh-goZA_0.jpg


In [202]:
def inference(paths, model = model, k = 1):
    results = model.predict(list(paths))
    pred_ls = []
    for i, r in enumerate(results):
        top_txt = {}
        top = r.cpu().probs.topk(k)
        top_txt["name"] = os.path.basename(paths[i])
        top_txt[f"top_{k}"] = np.array(top.indices)[0]
        top_txt[f"top_{k}_prob"] = np.array(top.values)[0]
        pred_ls.append(top_txt)
    pred_df = pd.DataFrame(pred_ls)
    return pred_df

In [200]:
VALFOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8_inf_dir"
)
CURATED_FOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_infer"
)
if not os.path.exists(CURATED_FOLDER):
    os.makedirs(CURATED_FOLDER)

# load list of files to infer
ALL_TO_INFER = os.listdir(VALFOLDER)
# batch processing in case out of memory
C_SIZE = 1000

# START LOAD INFERENCE PARAMETER
model = YOLO(
    "/home/yuanzf/uvi-time-machine/_script/A-city-never-was/runs/classify/train4/weights/best.pt"
)



In [203]:
# use chunk size to do batch processing
task_ls = []
df = valdf.copy()
for i in range(0, df.shape[0], C_SIZE):
    task_ls.append(list(df.iloc[i : i + C_SIZE]["path"].values))

for i, task in enumerate(tqdm(task_ls)):
    print("Now processing: ", i)
    pred_df = inference(task)
    pred_df.to_parquet(
            os.path.join(CURATED_FOLDER, f), index=False, mode="append"
        )
print("Done", f)
print("*" * 100)

  0%|          | 0/209 [00:00<?, ?it/s]


Now processing:  0


0: 416x416 Munich 0.92, Copenhagen 0.05, Antwerp 0.01, Brussels 0.01, Thessaloniki 0.00, 1: 416x416 Munich 0.92, Copenhagen 0.04, Antwerp 0.02, Cleveland 0.00, Vienna 0.00, 2: 416x416 Munich 0.94, Copenhagen 0.05, Berlin 0.01, Warsaw 0.00, Antwerp 0.00, 3: 416x416 Munich 0.89, Copenhagen 0.06, Stockholm 0.02, Antwerp 0.01, Berlin 0.01, 4: 416x416 Berlin 0.85, Munich 0.14, Antwerp 0.00, Vienna 0.00, Buenos Aires 0.00, 5: 416x416 Antwerp 0.50, Munich 0.38, Berlin 0.04, Amsterdam 0.03, Buenos Aires 0.02, 6: 416x416 Munich 0.81, Berlin 0.19, Zwolle 0.00, Warsaw 0.00, Amsterdam 0.00, 7: 416x416 Berlin 0.92, Munich 0.06, Vienna 0.01, Buenos Aires 0.00, Antwerp 0.00, 8: 416x416 Le Mans 0.20, Warsaw 0.19, Berlin 0.11, Budapest 0.08, Vienna 0.06, 9: 416x416 Milan 0.33, Warsaw 0.24, Madrid 0.13, Amsterdam 0.09, Zwolle 0.03, 10: 416x416 Zwolle 0.71, Berlin 0.15, Amsterdam 0.10, Warsaw 0.02, Munich 0.01, 11: 416x416 Warsaw 0.27, Zwolle 0.14, Berlin 0.13, Amsterdam 0.11, Copenhagen 0.08, 12: 416x41

AttributeError: 'list' object has no attribute 'to_parquet'

In [211]:
# construct bash file to run multiple cities at the same time
files = glob(TRANSFORMED_FOLDER+"/*.csv")
cities = [os.path.basename(f)[:-4] for f in files]

['modesto',
 'chicago',
 'kaunas',
 'munich',
 'jerusalem',
 'kozhikode',
 'cleveland',
 'saintpetersburg',
 'killeen',
 'okayama',
 'bangkok',
 'budapest',
 'palembang',
 'raleigh',
 'antwerp',
 'kyiv',
 'copenhagen',
 'boston',
 'lemans',
 'lima',
 'accra',
 'thessaloniki',
 'ilheus',
 'palermo',
 'wellington',
 'hindupur',
 'telaviv',
 'belgrade',
 'zwolle',
 'madrid',
 'sitapur',
 'capetown',
 'toledo',
 'rovno',
 'warsaw',
 'taipei',
 'reynosa',
 'milan',
 'miami',
 'bacolod',
 'bogotá',
 'gombe',
 'astrakhan',
 'gainesville,fl',
 'pune',
 'buenosaires',
 'sydney',
 'nagoya',
 'culiacan',
 'nairobi',
 'fukuoka',
 'brussels',
 'jalna',
 'bangalore',
 'portland,or',
 'riodejaneiro',
 'guatemalacity',
 'vijayawada',
 'medan',
 'jaipur',
 'dhaka',
 'lagos',
 'belohorizonte',
 'montreal',
 'toronto',
 'kualalumpur',
 'sanfrancisco',
 'denver',
 'manila',
 'tokyo',
 'losangeles',
 'minneapolis',
 'mexicocity',
 'seoul',
 'cebucity',
 'houston',
 'victoria',
 'saidpur',
 'johannesburg',


In [216]:
to_infer = cities[:5]
for i in range(0, len(to_infer), 10):
    citybatch = to_infer[i:i+10]
    
    with open(f"inference_batch_pre{i}.sh", "w") as thefile:
        for c in citybatch:
            line = f"""python B3_inference_city.py --city={c}\n"""
            thefile.write(line)
    

# Save Confusion Matrix

In [33]:
import pandas as pd
mx = metrics.confusion_matrix.matrix
mx_df = pd.DataFrame(mx, columns = label_values, index = label_values)
mx_df

Unnamed: 0,Accra,Amsterdam,Antwerp,Astrakhan,Athens,Auckland,Bacolod,Bangalore,Bangkok,Belgrade,...,Toronto,Tyumen,Valledupar,Victoria,Vienna,Vijayawada,Warsaw,Wellington,Yamaguchi,Zwolle
Accra,983.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Amsterdam,0.0,594.0,24.0,0.0,1.0,1.0,1.0,0.0,0.0,5.0,...,0.0,2.0,0.0,0.0,5.0,0.0,8.0,1.0,0.0,237.0
Antwerp,0.0,17.0,739.0,2.0,0.0,0.0,0.0,0.0,1.0,5.0,...,0.0,0.0,0.0,2.0,10.0,0.0,21.0,0.0,0.0,18.0
Astrakhan,0.0,0.0,1.0,959.0,0.0,0.0,0.0,0.0,1.0,5.0,...,0.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Athens,0.0,0.0,0.0,4.0,778.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vijayawada,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,914.0,0.0,0.0,0.0,0.0
Warsaw,0.0,6.0,14.0,3.0,2.0,0.0,0.0,0.0,0.0,7.0,...,3.0,7.0,1.0,1.0,26.0,0.0,668.0,0.0,0.0,6.0
Wellington,1.0,0.0,0.0,0.0,1.0,63.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,849.0,0.0,1.0
Yamaguchi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,608.0,0.0


In [49]:
mx_df.to_csv(os.path.join(CURATED_FOLDER, "c_classifier_matrix.csv"))

In [36]:
CURATED_FOLDER

'/lustre1/g/geog_pyloo/05_timemachine/_curated/c_classifier'

In [35]:
os.listdir(CURATED_FOLDER)

['c_train_median.parquet',
 'c_test_median.parquet',
 'c_test_small.parquet',
 'c_train_small.parquet',
 'c_train.parquet',
 'c_test.parquet']

In [52]:
mx_df[['Rio de Janeiro']].values

array([[          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          4],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          1],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          1],
       [          1],
       [          2],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [          1],
       [          0],
       [          0],
       [          0],
       [          0],
       [          0],
       [  