### Google Colab Link:
- If you are viewing this in an IDE and would rather run it on Google Colab, use this link:
- https://colab.research.google.com/drive/1OHagyo6YzcB0_K5hX7kqSqNKWhLvGGfv?usp=sharing

### Download Data


In [None]:
!pip install gdown

import gdown
import os

# Zipped file of the Cars Collection Dataset: 'https://www.kaggle.com/datasets/ashfaqsyed/cars-collection-dataset/data'
file_id = '1HCCwIMa1NrFT1Ogv8AuMyZe7Hy9t2nQS'
output_filename = 'cars-dataset.zip'

# Check if the file already exists
if os.path.exists(output_filename):
    print(f"Found existing file: {output_filename}. Removing it now...")
    os.remove(output_filename)
    print("Existing file removed.")
else:
    print(f"No existing file named {output_filename} found. Proceeding with download.")

gdown.download(id=file_id, output=output_filename, quiet=False)

print("Download complete.")

if os.path.exists('files/'):
    print(f"Found existing folder: files/. Removing it now...")
    !rm -r files/
    print("Existing folder removed.")
else:
    print(f"No existing folder named files/ found. Proceeding with unzip.")

print("Unzipping data")
!unzip -q cars-dataset.zip -d files/

!ls -lh

### Create a DataFrame for dataset to evaluate model


In [None]:
import pandas as pd
import os

data_dir = './files/'

# Column names in the order their values appear in an image filename
# (underscore-separated); features documented on dataset's kaggle page.
feature_names = [
    'Make',
    'Model',
    'Year',
    'MSRP',
    'Front Wheel Size (in)',
    'SAE Net Horsepower @ RPM',
    'Displacement',
    'Engine Type',
    'Width, Max w/o mirrors (in)',
    'Height, Overall (in)',
    'Length, Overall (in)',
    'Gas Mileage',
    'Drivetrain',
    'Passenger Capacity',
    'Passenger Doors',
    'Body Style',
]
image_extensions = ('.jpg', '.jpeg', '.png')

records = []

for image_name in os.listdir(data_dir):
    # Only image files carry encoded metadata in their names.
    if not image_name.endswith(image_extensions):
        continue

    tokens = os.path.splitext(image_name)[0].split('_')

    # Names with fewer tokens than features cannot be mapped; leave them out.
    if len(tokens) < len(feature_names):
        continue

    # zip() stops at the last feature name, so any extra trailing tokens
    # are ignored.
    records.append({'filename': image_name, **dict(zip(feature_names, tokens))})

car_df = pd.DataFrame(records)

car_df.head(10)

In [None]:
# Create a filtered dataframe for model: keep only the ground-truth labels and
# the filename, ordered by filename with a fresh 0..n-1 index.
filtered_df = (
    car_df[['Make', 'Model', 'filename']]
    .sort_values(by='filename', ascending=True)
    .reset_index(drop=True)
)
# Double quotes outside / single quotes inside: reusing the same quote type
# inside an f-string is a SyntaxError on Python versions before 3.12.
print(f"filename example: {filtered_df['filename'].iloc[0]}\n")
filtered_df

In [None]:
# Create a sampled dataset
import numpy as np

# Seed is for the ability to reproduce results
SEED = 42
np.random.seed(SEED)

SAMPLE_SIZE = 5000

# Never ask for more rows than exist: DataFrame.sample raises ValueError when
# n exceeds the number of rows (sampling without replacement).
n_samples = min(SAMPLE_SIZE, len(filtered_df))

sampled_df = filtered_df.sample(n=n_samples, random_state=SEED)
print(f'Number of rows in sampled df: {len(sampled_df)}\n')
print(f'First 10 rows:\n{sampled_df.head(10)}')

### Import the same model, using the same method as the main code

In [None]:
# From 'https://huggingface.co/dima806/car_models_image_detection'
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_car_classifier():
    """Build the image-classification pipeline once and reuse it afterwards.

    Creating the pipeline is by far the most expensive step, so it is cached
    rather than rebuilt for every image.
    """
    # Imported lazily so merely defining the function does not require
    # transformers to be importable yet.
    from transformers import pipeline
    return pipeline("image-classification", model="dima806/car_models_image_detection", device=0, use_fast=True)


def classify_car(image_path):
    """Classify a car image and return its predicted make and model.

    Args:
        image_path: Path of the image, passed straight to the pipeline.

    Returns:
        A ``(make, model)`` tuple of upper-cased strings taken from the
        top-ranked label. The make is the text before the first space and the
        model is everything after it (an empty string when the label has no
        space).
    """
    result = _load_car_classifier()(image_path)

    # Split on the first space only; the rest of the label is the model name.
    make, _, model = result[0]["label"].partition(' ')

    return make.upper(), model.upper()


In [None]:
# YOU MAY NEED TO RESTART SESSION AFTER RUNNING THIS TO REMOVE ERRORS
# Upgrades the `accelerate` and `transformers` packages via pip.
!pip install --upgrade accelerate transformers


In [None]:
# Test if model works


make, model = classify_car("./files/Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD_5_4_4dr_aWg.jpg")


# Should print 'Acura_ILX_2013_28_16_110_15_4_70_55_179_39_FWD_5_4_4dr_aWg.jpg'
print("\nFirst file:")
!ls -1 files/ | head -n 1

print(f'\nMake: {filtered_df['Make'][0].upper()}')
print(f'Model: {filtered_df['Model'][0].upper()}\n')

print(f'\nFirst prediction:')
print(f'Make: {make.upper()}')
print(f'Model: {model.upper()}')

### Evaluate Model
- WILL TAKE A VERY LONG TIME TO RUN DEPENDING ON SAMPLE SIZE

In [None]:
# Predicted make / model for every sampled image, in sample order
pred_make = []
pred_model = []

# Classify each sampled file, reporting progress after every image
for count, row in enumerate(sampled_df['filename'], start=1):
    path = f'./files/{row}'
    make, model = classify_car(path)

    pred_make.append(make)
    pred_model.append(model)
    print(f'Files completed: {count}')

print(pred_make)
print(pred_model)

In [None]:
# Combine actual and predicted classes for make and model.
# .assign() returns a new frame, so sampled_df itself is left untouched
# (a plain `result_df = sampled_df` would only alias it and add the
# prediction columns to sampled_df as well).
result_df = (
    sampled_df
    .assign(Pred_Make=pred_make, Pred_Model=pred_model)
    .drop('filename', axis=1)
    .reset_index()  # keeps the pre-sampling row label in an 'index' column
)

# Single "MAKE MODEL" label per row so make and model are scored together.
result_df['True_val'] = result_df['Make'].str.upper() + ' ' + result_df['Model'].str.upper()
result_df['Pred_val'] = result_df['Pred_Make'].str.upper() + ' ' + result_df['Pred_Model'].str.upper()
result_df

### Create Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Label set: every distinct true class, in order of first appearance
classes = result_df['True_val'].unique()

true_labels = np.array(result_df['True_val'])
predicted_labels = np.array(result_df['Pred_val'])

cm = confusion_matrix(y_true=true_labels, y_pred=predicted_labels, labels=classes)

# Labelled copy of the matrix: rows are true classes, columns are predictions
cm_df = pd.DataFrame(cm, index=classes, columns=classes)

# To show that you can plot, but for this many classes is not ideal
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=True, ax=ax)

ax.set_title('Confusion Matrix for Car Classification')
ax.set_ylabel('True Class (Actual)')
ax.set_xlabel('Predicted Class')
plt.show()

In [None]:
# Per-class report from scikit-learn, restricted to the true classes
# Calculates precision, recall and f1, and from those the averaged summary rows
# zero_division=0 scores undefined metrics as 0
report = classification_report(
    result_df['True_val'],
    result_df['Pred_val'],
    labels=classes,
    output_dict=True,
    zero_division=0,
)
print(report)

In [None]:
# One row per report entry, metrics as columns, rounded to 3 decimals
report_df = pd.DataFrame.from_dict(report).transpose().round(3)
report_df

### Download Files
- Downloads the final metrics report as a .csv file for other use
- Downloads the sampled dataset in case reuse is desired.

In [None]:
# Downloads report as .csv file
try:
    from google.colab import files
except ImportError:
    # Not running on Colab (e.g. a local IDE): the CSV is still written to
    # disk, only the browser download is skipped.
    files = None

OUTPUT_FILE = 'car_classify_metrics.csv'

print("Exporting Metrics to .csv file:")
report_df.to_csv(OUTPUT_FILE, index=True)

if files is not None:
    print(f'Downloading: {OUTPUT_FILE}')
    files.download(OUTPUT_FILE)
else:
    print(f'Not running on Colab; file saved locally as: {OUTPUT_FILE}')

In [None]:
# Downloads sampled data set
# Imported here as well so this cell does not rely on an earlier cell having
# been run; outside Colab the file is only written to disk.
try:
    from google.colab import files
except ImportError:
    files = None

SAMPLED_FILE = 'sampled_data.csv'

sampled_df.to_csv(SAMPLED_FILE, index=True)

if files is not None:
    print('Downloading sampled_data.csv')
    files.download(SAMPLED_FILE)
else:
    print(f'Not running on Colab; file saved locally as: {SAMPLED_FILE}')