In [20]:
import os
import logging
import yaml
import s3fs
import boto3
import torch
import torch.nn as nn
from ultralytics import YOLO
import mlflow
from mlflow.exceptions import MlflowException
from mlflow.models import infer_signature
from ultralytics import settings

In [21]:
# Raise the verbosity of the "mlflow" logger to DEBUG for this session.
mlflow_logger = logging.getLogger("mlflow")
mlflow_logger.setLevel(logging.DEBUG)

In [2]:
# --- Tracking / storage configuration -------------------------------------
# MLflow tracking server. Defaults to the local server on port 8080; set the
# MLFLOW_TRACKING_URI environment variable to point the notebook elsewhere.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:8080")
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# S3 filesystem handle.
fs = s3fs.S3FileSystem()

# S3 endpoint and credentials are intentionally NOT set here. If they are
# needed, export MLFLOW_S3_ENDPOINT_URL, AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY in the environment before starting the kernel
# rather than writing them into the notebook.

# Location under which the MLflow experiment stores its artifacts.
artifact_uri = "s3://stream-n-detect/models"
# Dataset description file handed to YOLO for training and validation.
DATA_YAML_PATH = 'data/data.yaml'

# Switch off the "dvc" and "mlflow" entries in the Ultralytics settings
# (presumably so that MLflow logging is done explicitly by this notebook --
# confirm), then show the resulting settings.
settings.update({"dvc": False, "mlflow": False})
print(settings)

{'settings_version': '0.0.4', 'datasets_dir': '/Users/asukh/Work/VSCode/Python/stream-and-detect/trainer', 'weights_dir': 'weights', 'runs_dir': 'runs', 'uuid': '8c0982ee71335cf4485524a18ce8e44caf5e59c587e48ff1e991579ce82db241', 'sync': True, 'api_key': '', 'openai_api_key': '', 'clearml': True, 'comet': True, 'dvc': False, 'hub': True, 'mlflow': False, 'neptune': True, 'raytune': True, 'tensorboard': True, 'wandb': True}


In [3]:
# Load the run parameters from params.yaml. The encoding is given explicitly
# so the file is read the same way regardless of the platform's locale.
with open("params.yaml", encoding="utf-8") as f:
    params = yaml.safe_load(f)

# An empty YAML document parses to None and a malformed one may not be a
# mapping at all; fail here with a clear message instead of an opaque
# TypeError/KeyError on the lookup below.
if not isinstance(params, dict) or 'model_type' not in params:
    raise ValueError("params.yaml must be a mapping that defines 'model_type'")

# Load a pre-trained model identified by the 'model_type' parameter.
model = YOLO(params['model_type'])

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8n.pt to 'yolov8n.pt'...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6.23M/6.23M [00:00<00:00, 22.6MB/s]


In [5]:
# Train the model: use CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Every training argument except the dataset path, the device and the worker
# count comes from params.yaml. No run `name` is passed.
train_kwargs = dict(
    data=DATA_YAML_PATH,
    imgsz=params['imgsz'],
    batch=params['batch'],
    epochs=params['epochs'],
    optimizer=params['optimizer'],
    lr0=params['lr0'],
    seed=params['seed'],
    pretrained=params['pretrained'],
    device=device,
    project=params['project_dir'],
    workers=0,
)
train_result = model.train(**train_kwargs)

New https://pypi.org/project/ultralytics/8.2.31 available ðŸ˜ƒ Update with 'pip install -U ultralytics'
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=data/data.yaml, epochs=1, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=0, project=models, name=train5, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show

[34m[1mtrain: [0mScanning /Users/asukh/Work/VSCode/Python/stream-and-detect/trainer/data/train/labels.cache... 5805 images, 93 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5805/5805 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/asukh/Work/VSCode/Python/stream-and-detect/trainer/data/valid/labels.cache... 549 images, 8 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 549/549 [00:00<?, ?it/s]


Plotting labels to models/train5/labels.jpg... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mmodels/train5[0m
Starting training for 1 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/1         0G      1.378      1.492      1.115         65        640: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 726/726 [1:31:07<00:00,  7.53s/it]  
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 35/35 [01:57<00:00,  3.36s/it]


                   all        549       6270      0.687       0.58      0.623      0.397

1 epochs completed in 1.552 hours.
Optimizer stripped from models/train5/weights/last.pt, 6.2MB
Optimizer stripped from models/train5/weights/best.pt, 6.2MB

Validating models/train5/weights/best.pt...
Ultralytics YOLOv8.2.0 ðŸš€ Python-3.10.14 torch-2.1.2.post3 CPU (Apple M1 Pro)
Model summary (fused): 168 layers, 3006623 parameters, 0 gradients


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 35/35 [01:50<00:00,  3.15s/it]


                   all        549       6270      0.687      0.581      0.623      0.397
               bicycle        549        250      0.826      0.468       0.57      0.411
                   bus        549        108      0.524      0.593      0.562      0.425
                   car        549       3842      0.781      0.874      0.899      0.605
             motorbike        549       1238      0.768       0.52      0.616      0.342
                person        549        832      0.539      0.449      0.469      0.204
Speed: 0.6ms preprocess, 195.0ms inference, 0.0ms loss, 2.1ms postprocess per image
Results saved to [1mmodels/train5[0m


In [24]:
# Validate the model on the dataset described by DATA_YAML_PATH, writing the
# results under "<project_dir>/val/".
project_dir = params['project_dir']
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
val_project = f'{project_dir}/val/'
val_result = model.val(data=DATA_YAML_PATH, device=device, project=val_project)

[34m[1mval: [0mScanning /Users/asukh/Work/VSCode/Python/stream-and-detect/trainer/data/valid/labels.cache... 549 images, 8 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 549/549 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 69/69 [00:40<00:00,  1.72it/s]


                   all        549       6270      0.637        0.6      0.618      0.387
               bicycle        549        250      0.762      0.449      0.566        0.4
                   bus        549        108      0.497      0.602      0.548      0.391
                   car        549       3842      0.743      0.882      0.892      0.601
             motorbike        549       1238      0.736      0.562      0.625      0.344
                person        549        832       0.45      0.505      0.457      0.199
Speed: 0.7ms preprocess, 65.6ms inference, 0.0ms loss, 3.1ms postprocess per image
Results saved to [1mmodels/val/train[0m


In [29]:
experiment_name = 'yolov8n detect'

# Look the experiment up first and create it only when it does not exist.
# Catching MlflowException around create_experiment would also swallow
# unrelated failures (e.g. an unreachable tracking server) and then fail
# later with an AttributeError on a None experiment.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name, artifact_location=artifact_uri)
else:
    experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_name)

# Write the pip requirements recorded with the logged model. torch is pinned
# to the version actually installed in this kernel (minus any local build tag
# such as "+cu118") so the logged environment matches the one used to train.
requirements_path = "requirements.txt"
torch_version = torch.__version__.split("+")[0]
with open(requirements_path, "w", encoding="utf-8") as f:
    f.write(f"torch=={torch_version}\n")
    f.write("cloudpickle==3.0.0\n")
    # Add other dependencies if needed

with mlflow.start_run(experiment_id=experiment_id):
    # Parameters loaded from params.yaml.
    mlflow.log_params(params)

    # Detection quality metrics reported by the training run.
    mlflow.log_metric("precision", train_result.results_dict['metrics/precision(B)'])
    mlflow.log_metric("recall", train_result.results_dict['metrics/recall(B)'])
    mlflow.log_metric("fitness", train_result.fitness)

    # NOTE(review): the next two values are read from `train_result.speed`,
    # i.e. they are timings, not model quality; in particular "loss" is NOT
    # the training loss. Consider renaming the metric keys -- confirm that
    # nothing downstream depends on the current names first.
    mlflow.log_metric("inference", train_result.speed['inference'])
    mlflow.log_metric("loss", train_result.speed['loss'])

    mlflow.set_tag("training info", "yolo8n traffic data")

    # NOTE(review): `params` holds the training configuration; a model
    # signature's params normally describe inference-time options -- confirm
    # this is the intended signature.
    signature = infer_signature(params=params)

    mlflow.pytorch.log_model(
        pytorch_model=model.model,
        artifact_path="models",
        signature=signature,
        pip_requirements=requirements_path,
    )
    

2024/06/14 01:37:14 DEBUG mlflow.models.model: 
urllib3.exceptions.ResponseError: too many 500 error responses

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/miniconda3/envs/pytorch3.10/lib/python3.10/site-packages/requests/adapters.py", line 486, in send
    resp = conn.urlopen(
  File "/opt/miniconda3/envs/pytorch3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 948, in urlopen
    return self.urlopen(
  File "/opt/miniconda3/envs/pytorch3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 948, in urlopen
    return self.urlopen(
  File "/opt/miniconda3/envs/pytorch3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 948, in urlopen
    return self.urlopen(
  [Previous line repeated 2 more times]
  File "/opt/miniconda3/envs/pytorch3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 938, in urlopen
    retries = retries.increment(method, url, response=re