In [1]:
# Mount Google drive to upload datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive

/content/drive/My Drive


In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 47.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=a941aa2cc51f1caebaf8de70701397e0b9f8455836fe03d8d24b96f93d2129dc
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [4]:
!pip install -q findspark

In [5]:
import findspark
findspark.init()

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
                    .master('local')\
                    .appName('Colab')\
                    .config('spark.ui.port', '4050')\
                    .getOrCreate()

In [7]:
%cd /content/drive/My Drive/yolov5

/content/drive/My Drive/yolov5


In [8]:
!pip install -qr 'requirements.txt'

[?25l[K     |▌                               | 10 kB 20.6 MB/s eta 0:00:01[K     |█                               | 20 kB 12.0 MB/s eta 0:00:01[K     |█▋                              | 30 kB 6.4 MB/s eta 0:00:01[K     |██▏                             | 40 kB 8.0 MB/s eta 0:00:01[K     |██▊                             | 51 kB 6.0 MB/s eta 0:00:01[K     |███▎                            | 61 kB 7.0 MB/s eta 0:00:01[K     |███▉                            | 71 kB 6.6 MB/s eta 0:00:01[K     |████▍                           | 81 kB 6.6 MB/s eta 0:00:01[K     |█████                           | 92 kB 7.2 MB/s eta 0:00:01[K     |█████▌                          | 102 kB 6.9 MB/s eta 0:00:01[K     |██████                          | 112 kB 6.9 MB/s eta 0:00:01[K     |██████▋                         | 122 kB 6.9 MB/s eta 0:00:01[K     |███████▏                        | 133 kB 6.9 MB/s eta 0:00:01[K     |███████▊                        | 143 kB 6.9 MB/s eta 0:00:01[K   

In [9]:
%cd ../

/content/drive/My Drive


In [10]:
# Databricks notebook source
import torch
from PIL import Image
import glob
import pandas as pd
from typing import Iterator, Tuple
import uuid
from torch.utils.data import Dataset
from torchvision import datasets, models, transforms
from torchvision.datasets.folder import default_loader
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.sql.functions import when, col, udf

In [11]:
model = torch.hub.load('ultralytics/yolov5', 'custom', 
                       '/content/drive/MyDrive/yolov5/yolov_runs/frozen_w_coco_unlabeled2/weights/frozen_backbone_coco_unlabeled_best.onnx',
                       trust_repo=True)

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/

[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-7-29 Python-3.7.13 torch-1.12.0+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)

Loading /content/drive/MyDrive/yolov5/yolov_runs/frozen_w_coco_unlabeled2/weights/frozen_backbone_coco_unlabeled_best.onnx for ONNX Runtime inference...
[31m[1mrequirements:[0m onnx not found and is required by YOLOv5, attempting auto-update...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnx
  Downloading onnx-1.

In [12]:
def yolov_preds(img_paths, model):
    """Accepts a string of image paths and returns the Yolov predicted fields"""
    
    # Load the trained model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Create lists to hold the path and the Yolov fields
    failed = []
    processed = []
    yolo_xmin = []
    yolo_ymin = []
    yolo_xmax = []
    yolo_ymax = []
    yolo_confidence = []
    yolo_bb_classes = []
    yolo_names = []
    
    with torch.no_grad():
        for f in img_paths:
            img = Image.open(f)
            results = model(img, size=640)
            df = results.pandas().xyxy[0]
            bbs_det = len(df)
            
            if bbs_det == 0:
                failed.append(f)
            
            if bbs_det > 0:
                # Add the path to the img paths by a multiplier so that it's added multiple times if multiple bounding boxes are produced
                processed.extend([f for x in range(bbs_det)])
            
                # Make dataframes of the Yolov fields from the detection
                xmin = df.xmin.tolist()
                ymin = df.ymin.tolist()
                xmax = df.xmax.tolist()
                ymax = df.ymax.tolist()
                confidence = df.confidence.tolist()
                bb_classes = df['class'].tolist()
                names = df.name.tolist()
            
                # Add the values to their respective lists
                yolo_xmin.extend(xmin)
                yolo_ymin.extend(ymin)
                yolo_xmax.extend(xmax)
                yolo_ymax.extend(ymax)
                yolo_confidence.extend(confidence)
                yolo_bb_classes.extend(bb_classes)
                yolo_names.extend(names)
            
    return failed, pd.DataFrame({
        'img_path': processed,
        'xmin': yolo_xmin,
        'ymin':yolo_ymin,
        'xmax': yolo_xmax,
        'ymax': yolo_ymax,
        'confidence': yolo_confidence,
        'bb_class': yolo_bb_classes,
        'names': yolo_names,
    })

In [13]:
D = '/content/drive/MyDrive'

In [14]:
# Pass the list of images to the Yolov model and generate the Yolov predictions
imgs = glob.glob(D+'/yolov5/customdata/images/test/*.jpg')

In [16]:
# Create a dataframe from the predictions
failed_preds, yolo_test_preds = yolov_preds(imgs, model)

In [17]:
# Find the number of images where a prediction failed to be made
print(f'{len(failed_preds)} images failed to produce a prediction, out of {len(imgs)} total images')

25 images failed to produce a prediction, out of 1033 total images


In [19]:
# Check the first few rows of the yolo val preds dataframe
yolo_test_preds.head()

Unnamed: 0,img_path,xmin,ymin,xmax,ymax,confidence,bb_class,names
0,/content/drive/MyDrive/yolov5/customdata/image...,105.752777,60.255127,378.000244,610.764404,0.768734,0,Giraffa_tippelskirchi
1,/content/drive/MyDrive/yolov5/customdata/image...,299.829956,220.644257,431.216553,362.138153,0.876029,0,Giraffa_tippelskirchi
2,/content/drive/MyDrive/yolov5/customdata/image...,187.540527,236.484894,248.498108,362.121307,0.527206,0,Giraffa_tippelskirchi
3,/content/drive/MyDrive/yolov5/customdata/image...,23.572754,0.0,634.253723,570.580444,0.331881,0,Giraffa_tippelskirchi
4,/content/drive/MyDrive/yolov5/customdata/image...,238.281403,206.198959,630.858032,638.992126,0.724897,0,Giraffa_tippelskirchi


In [22]:
# Convert the Pandas dataframe to PySpark for assessment
spark_yolo_test = spark.createDataFrame(yolo_test_preds)

In [23]:
# Add a column for what the true label of the animal in the photo is
spark_yolo_test = spark_yolo_test.withColumn('true_img_label', when(col('img_path').contains('giraffe'), 'Giraffa_tippelskirchi')
                                                             .when(col('img_path').contains('hyena'), 'Crocuta_crocuta')
                                                             .otherwise('Panthera_pardus'))

In [24]:
# Find the count of bounding boxes predicted where the predicted label didn't match what was in the image
incorrect_bbs = spark_yolo_test.filter(col('names') != col('true_img_label'))

In [25]:
print(f'There were {incorrect_bbs.count()} incorrect bounding boxes predicted')

There were 9 incorrect bounding boxes predicted


In [26]:
# Find the number of unique images where an incorrect bounding box was predicted
incorrect_imgs = incorrect_bbs.select('img_path').distinct()

In [27]:
print(f'There were {incorrect_imgs.count()} images where an incorrect bounding box was predicted')

There were 9 images where an incorrect bounding box was predicted


In [28]:
# Find the number of correct bounding boxes
correct_bbs = spark_yolo_test.filter(col('names') == col('true_img_label'))
print(f'There were {correct_bbs.count()} correct bounding boxes predicted')
correct_imgs = correct_bbs.select('img_path').distinct()
print(f'There were {correct_imgs.count()} images with at least 1 correct bounding box')

There were 1073 correct bounding boxes predicted
There were 1007 images with at least 1 correct bounding box


In [29]:
# Now, let's look at the images where all of the bounding boxes were correct
all_correct = correct_bbs.join(incorrect_bbs, correct_bbs.img_path == incorrect_bbs.img_path, how='left_anti')
print(f'There were {all_correct.count()} bounding boxes predicted for images where all of the predictions were correct')
all_correct_imgs = all_correct.select('img_path').distinct()
print(f'There were {all_correct_imgs.count()} images where all of the predicted bounding boxes were correct')

There were 1065 bounding boxes predicted for images where all of the predictions were correct
There were 999 images where all of the predicted bounding boxes were correct


In [31]:
# Filter the dataframe to hyena images and repeat the assessment of images where a prediction failed to be made and correct vs incorrect predictions
failed_df = spark.createDataFrame(failed_preds, StringType())
failed_df = failed_df.withColumnRenamed('value', 'img_path')
failed_df = failed_df.withColumn('true_img_label', when(col('img_path').contains('giraffe'), 'Giraffa_tippelskirchi')
                                                             .when(col('img_path').contains('hyena'), 'Crocuta_crocuta')
                                                             .otherwise('Panthera_pardus'))
hyena_failed = failed_df.filter(col('true_img_label') == 'Crocuta_crocuta')
hyena_predicted = spark_yolo_test.filter(col('true_img_label') == 'Crocuta_crocuta')
print(f'{hyena_failed.count()} images failed to produce a prediction, out of {hyena_predicted.count() + hyena_failed.count()} total hyena images, or {failed_df.count()} total failed images')

4 images failed to produce a prediction, out of 338 total hyena images, or 25 total failed images


In [32]:
# Find the count of bounding boxes predicted where the predicted label didn't match what was in the image
hyena_incorrect_bbs = hyena_predicted.filter(col('names') != col('true_img_label'))

In [33]:
print(f'There were {hyena_incorrect_bbs.count()} incorrect bounding boxes predicted')

There were 4 incorrect bounding boxes predicted


In [34]:
# Find the number of unique images where an incorrect bounding box was predicted
hyena_incorrect_imgs = hyena_incorrect_bbs.select('img_path').distinct()

In [35]:
print(f'There were {hyena_incorrect_imgs.count()} images where an incorrect bounding box was predicted')

There were 4 images where an incorrect bounding box was predicted


In [36]:
# Find the number of correct bounding boxes
hyena_correct_bbs = hyena_predicted.filter(col('names') == col('true_img_label'))
print(f'There were {hyena_correct_bbs.count()} correct bounding boxes predicted')
hyena_correct_imgs = hyena_correct_bbs.select('img_path').distinct()
print(f'There were {hyena_correct_imgs.count()} images with at least 1 correct bounding box')

There were 330 correct bounding boxes predicted
There were 305 images with at least 1 correct bounding box


In [37]:
# Now, let's look at the images where all of the bounding boxes were correct
hyena_all_correct = hyena_correct_bbs.join(hyena_incorrect_bbs, hyena_correct_bbs.img_path == hyena_incorrect_bbs.img_path, how='left_anti')
print(f'There were {hyena_all_correct.count()} bounding boxes predicted for images where all of the predictions were correct')
hyena_all_correct_imgs = hyena_all_correct.select('img_path').distinct()
print(f'There were {hyena_all_correct_imgs.count()} images where all of the predicted bounding boxes were correct')

There were 326 bounding boxes predicted for images where all of the predictions were correct
There were 301 images where all of the predicted bounding boxes were correct


In [38]:
# Filter the dataframe to leopard images and repeat the assessment of images where a prediction failed to be made and correct vs incorrect predictions
leopard_failed = failed_df.filter(col('true_img_label') == 'Panthera_pardus')
leopard_predicted = spark_yolo_test.filter(col('true_img_label') == 'Panthera_pardus')
print(f'{leopard_failed.count()} images failed to produce a prediction, out of {leopard_predicted.count() + leopard_failed.count()} total leopard images, or {failed_df.count()} total failed images')

17 images failed to produce a prediction, out of 698 total leopard images, or 25 total failed images


In [39]:
# Find the count of bounding boxes predicted where the predicted label didn't match what was in the image
leopard_incorrect_bbs = leopard_predicted.filter(col('names') != col('true_img_label'))

In [40]:
print(f'There were {leopard_incorrect_bbs.count()} incorrect bounding boxes predicted')

There were 5 incorrect bounding boxes predicted


In [41]:
# Find the number of unique images where an incorrect bounding box was predicted
leopard_incorrect_imgs = leopard_incorrect_bbs.select('img_path').distinct()

In [42]:
print(f'There were {leopard_incorrect_imgs.count()} images where an incorrect bounding box was predicted')

There were 5 images where an incorrect bounding box was predicted


In [43]:
# Find the number of correct bounding boxes
leopard_correct_bbs = leopard_predicted.filter(col('names') == col('true_img_label'))
print(f'There were {leopard_correct_bbs.count()} correct bounding boxes predicted')
leopard_correct_imgs = leopard_correct_bbs.select('img_path').distinct()
print(f'There were {leopard_correct_imgs.count()} images with at least 1 correct bounding box')

There were 676 correct bounding boxes predicted
There were 655 images with at least 1 correct bounding box


In [44]:
# Now, let's look at the images where all of the bounding boxes were correct
leopard_all_correct = leopard_correct_bbs.join(leopard_incorrect_bbs, leopard_correct_bbs.img_path == leopard_incorrect_bbs.img_path, how='left_anti')
print(f'There were {leopard_all_correct.count()} bounding boxes predicted for images where all of the predictions were correct')
leopard_all_correct_imgs = leopard_all_correct.select('img_path').distinct()
print(f'There were {leopard_all_correct_imgs.count()} images where all of the predicted bounding boxes were correct')

There were 672 bounding boxes predicted for images where all of the predictions were correct
There were 651 images where all of the predicted bounding boxes were correct


In [45]:
# Filter the dataframe to leopard images and repeat the assessment of images where a prediction failed to be made and correct vs incorrect predictions
giraffe_failed = failed_df.filter(col('true_img_label') == 'Giraffa_tippelskirchi')
giraffe_predicted = spark_yolo_test.filter(col('true_img_label') == 'Giraffa_tippelskirchi')
print(f'{giraffe_failed.count()} images failed to produce a prediction, out of {giraffe_predicted.count() + giraffe_failed.count()} total giraffe images, or {failed_df.count()} total failed images')

4 images failed to produce a prediction, out of 71 total giraffe images, or 25 total failed images


In [46]:
# Find the count of bounding boxes predicted where the predicted label didn't match what was in the image
giraffe_incorrect_bbs = giraffe_predicted.filter(col('names') != col('true_img_label'))

In [47]:
print(f'There were {giraffe_incorrect_bbs.count()} incorrect bounding boxes predicted')

There were 0 incorrect bounding boxes predicted


In [48]:
# Find the number of unique images where an incorrect bounding box was predicted
giraffe_incorrect_imgs = giraffe_incorrect_bbs.select('img_path').distinct()

In [49]:
print(f'There were {giraffe_incorrect_imgs.count()} images where an incorrect bounding box was predicted')

There were 0 images where an incorrect bounding box was predicted


In [50]:
# Find the number of correct bounding boxes
giraffe_correct_bbs = giraffe_predicted.filter(col('names') == col('true_img_label'))
print(f'There were {giraffe_correct_bbs.count()} correct bounding boxes predicted')
giraffe_correct_imgs = giraffe_correct_bbs.select('img_path').distinct()
print(f'There were {giraffe_correct_imgs.count()} images with at least 1 correct bounding box')

There were 67 correct bounding boxes predicted
There were 47 images with at least 1 correct bounding box


In [51]:
# Now, let's look at the images where all of the bounding boxes were correct
giraffe_all_correct = giraffe_correct_bbs.join(giraffe_incorrect_bbs, giraffe_correct_bbs.img_path == giraffe_incorrect_bbs.img_path, how='left_anti')
print(f'There were {giraffe_all_correct.count()} bounding boxes predicted for images where all of the predictions were correct')
giraffe_all_correct_imgs = giraffe_all_correct.select('img_path').distinct()
print(f'There were {giraffe_all_correct_imgs.count()} images where all of the predicted bounding boxes were correct')

There were 67 bounding boxes predicted for images where all of the predictions were correct
There were 47 images where all of the predicted bounding boxes were correct


In [52]:
# Show the counts of 
combos = spark_yolo_test.groupby('true_img_label', 'names').count()
combos.show(20, False)

+---------------------+---------------------+-----+
|true_img_label       |names                |count|
+---------------------+---------------------+-----+
|Crocuta_crocuta      |Panthera_pardus      |4    |
|Panthera_pardus      |Panthera_pardus      |676  |
|Giraffa_tippelskirchi|Giraffa_tippelskirchi|67   |
|Crocuta_crocuta      |Crocuta_crocuta      |330  |
|Panthera_pardus      |Crocuta_crocuta      |5    |
+---------------------+---------------------+-----+



In [53]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lit
from pyspark.sql.types import DoubleType

In [55]:
spark_yolo_test = spark_yolo_test.withColumn('prediction', when(col('names') == 'Crocuta_crocuta', lit(0))
                                                             .when(col('names') == 'Panthera_pardus', lit(1))
                                                             .otherwise(lit(2)))
spark_yolo_test = spark_yolo_test.withColumn('label', when(col('true_img_label') == 'Crocuta_crocuta', lit(0))
                                                             .when(col('true_img_label') == 'Panthera_pardus', lit(1))
                                                             .otherwise(lit(2)))
spark_yolo_test = spark_yolo_test.withColumn('prediction', col('prediction').cast(DoubleType())).withColumn('label', col('label').cast(DoubleType()))
check = spark_yolo_test.select('prediction', 'label')

In [56]:
evaluator = MulticlassClassificationEvaluator()
evaluator.setPredictionCol('prediction')
print(f'Accuracy: {evaluator.evaluate(check)}')
print(evaluator.evaluate(check, {evaluator.metricName: 'weightedPrecision'}))
print(evaluator.evaluate(check, {evaluator.metricName: 'weightedRecall'}))

Accuracy: 0.9916852311137159
0.9916904279299382
0.9916820702402958


In [57]:
# Get the accuracy, recall, and precision by species
hyena_check = check.filter(col('label') == 0)
leopard_check = check.filter(col('label') == 1)
giraffe_check = check.filter(col('label') == 2)

print(f'Hyena accuracy: {evaluator.evaluate(hyena_check)}')
print(f'Leopard accuracy: {evaluator.evaluate(leopard_check)}')
print(f'Giraffe accuracy: {evaluator.evaluate(giraffe_check)}')

Hyena accuracy: 0.9939759036144579
Leopard accuracy: 0.9963154016212233
Giraffe accuracy: 1.0


In [58]:
from sklearn.metrics import precision_score, recall_score, f1_score

pd_check = check.toPandas()

In [59]:
print(f'Class precision scores: {precision_score(pd_check.label, pd_check.prediction, average=None)}')
print(f'Class recall scores: {recall_score(pd_check.label, pd_check.prediction, average=None)}')
print(f'Class f1 scores: {f1_score(pd_check.label, pd_check.prediction, average=None)}')

Class precision scores: [    0.98507     0.99412           1]
Class recall scores: [    0.98802     0.99266           1]
Class f1 scores: [    0.98655     0.99339           1]


In [60]:
print(evaluator.evaluate(check, {evaluator.metricName: 'precisionByLabel'}))
print(evaluator.evaluate(check, {evaluator.metricName: 'recallByLabel'}))

0.9850746268656716
0.9880239520958084


In [61]:
def scale_yolo(xmin, ymin, xmax, ymax):
    """Create a scaled version of the predicted bounding boxes"""
    x = xmin/640
    y = ymin/640
    w = (xmax - xmin)/640
    h = (ymax - ymin)/640
    return [x, y, w, h]

In [62]:
def convert_nonnormalized(xmin, ymin, xmax, ymax):
    """Converts predicted bounding box values to Coco format"""
    return [xmin, ymin, xmax-xmin, ymax-ymin]
scale_yolo_udf = udf(scale_yolo)
coco_nonnormalized_udf = udf(convert_nonnormalized)
spark_yolo_test = spark_yolo_test.withColumn('yolo_scaled_bxs', scale_yolo_udf(spark_yolo_test.xmin, spark_yolo_test.ymin, spark_yolo_test.xmax, spark_yolo_test.ymax))
spark_yolo_test = spark_yolo_test.withColumn('nonscaled_coco_bxs', coco_nonnormalized_udf(spark_yolo_test.xmin, spark_yolo_test.ymin, spark_yolo_test.xmax, spark_yolo_test.ymax))

In [63]:
# Create a custom UDF to cast lists as arrays
udf_array = udf(lambda row: list(row), ArrayType(FloatType()))
spark_yolo_val = spark_yolo_test.withColumn('nonscaled_coco_bxs', udf_array(spark_yolo_test.nonscaled_coco_bxs)).withColumn('yolo_scaled_bxs', udf_array(spark_yolo_test.yolo_scaled_bxs))

In [64]:
# Verify that the conversion worked
spark_yolo_val.show(20, False)

+---------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------------+--------+---------------------+---------------------+----------+-----+-------------------------------------------------+--------------------------------------------+
|img_path                                                             |xmin              |ymin              |xmax              |ymax              |confidence        |bb_class|names                |true_img_label       |prediction|label|yolo_scaled_bxs                                  |nonscaled_coco_bxs                          |
+---------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------------+--------+---------------------+---------------------+----------+-----+-------------------------------------------------+--------------------------------------------+
|/co

In [65]:
import matplotlib.pyplot as plt
bins, counts = spark_yolo_val.select('confidence').rdd.flatMap(lambda x: x).histogram(10)
plt.hist(bins[:-1], bins=bins, weights=counts)

(array([         30,          48,          45,          67,          92,          92,         132,         153,         228,         195]),
 array([    0.25264,      0.3224,     0.39216,     0.46192,     0.53169,     0.60145,     0.67121,     0.74097,     0.81073,     0.88049,     0.95026]),
 <a list of 10 Patch objects>)

In [66]:
spark_yolo_val.filter(col('confidence') > .5).count()

926

In [67]:
# Get the distribution of confidence scores for incorrect predictions
bins, counts = incorrect_bbs.select('confidence').rdd.flatMap(lambda x: x).histogram(10)
plt.hist(bins[:-1], bins=bins, weights=counts)

(array([          3,           1,           1,           2,           1,           0,           0,           0,           0,           1]),
 array([    0.32244,     0.36281,     0.40317,     0.44354,     0.48391,     0.52427,     0.56464,       0.605,     0.64537,     0.68574,      0.7261]),
 <a list of 10 Patch objects>)

In [68]:
# Get the min and max values for correct versus incorrect predictions
print(all_correct.groupBy().max('confidence').collect()[0].asDict()['max(confidence)'])
print(all_correct.groupBy().min('confidence').collect()[0].asDict()['min(confidence)'])

0.9502553939819336
0.2526381313800812


In [69]:
# Get the min and max values for correct versus incorrect predictions
print(incorrect_bbs.groupBy().max('confidence').collect()[0].asDict()['max(confidence)'])
print(incorrect_bbs.groupBy().min('confidence').collect()[0].asDict()['min(confidence)'])

0.7261031866073608
0.3224412500858307


In [70]:
# Get the min and max values for correct versus incorrect predictions
print(hyena_all_correct.groupBy().max('confidence').collect()[0].asDict()['max(confidence)'])
print(hyena_all_correct.groupBy().min('confidence').collect()[0].asDict()['min(confidence)'])

0.9459032416343689
0.2526381313800812
