# upsampling data 

In [1]:
train_directory='datasets/train'
test_directory='datasets/test'
import os
import pandas as pd
import numpy as np


In [2]:
os.listdir(train_directory)

['Dodge Dakota Crew Cab 2010',
 'GMC Canyon Extended Cab 2012',
 'Hyundai Santa Fe SUV 2012',
 'Dodge Durango SUV 2012',
 'Chevrolet Silverado 1500 Extended Cab 2012',
 'Ford Mustang Convertible 2007',
 'Buick Rainier SUV 2007',
 'Jeep Compass SUV 2012',
 'Chevrolet Silverado 2500HD Regular Cab 2012',
 'Ford Edge SUV 2012',
 'Dodge Ram Pickup 3500 Quad Cab 2009',
 'Bentley Continental Supersports Conv. Convertible 2012',
 'Land Rover Range Rover SUV 2012',
 'Hyundai Tucson SUV 2012',
 'Honda Odyssey Minivan 2007',
 'Audi 100 Wagon 1994',
 'Dodge Challenger SRT8 2011',
 'Mercedes-Benz Sprinter Van 2012',
 'Bentley Continental GT Coupe 2012',
 'Rolls-Royce Ghost Sedan 2012',
 'Lamborghini Gallardo LP 570-4 Superleggera 2012',
 '.DS_Store',
 'Dodge Charger SRT-8 2009',
 'Volkswagen Beetle Hatchback 2012',
 'Audi TT RS Coupe 2012',
 'Acura TSX Sedan 2012',
 'Jaguar XK XKR 2012',
 'Aston Martin V8 Vantage Convertible 2012',
 'Honda Accord Sedan 2012',
 'BMW X5 SUV 2007',
 'Ford F-150 Regula

In [9]:
def count_images_in_folder(directory_path):
    '''
    Args: path to the directory we want to count images 

    Returns: list of tuple containing car model and the corresponding count of it 
    '''
    img_extensions_set=set(('jpg','png'))
    lst=[]
    for i in os.listdir(directory_path):
        if i=='.DS_Store':
            continue
        count=0
        for j in os.listdir(os.path.join(directory_path,i)):
            if j.endswith(('.jpg','.png')):
                count+=1
        lst.append((i,count))
    return lst

count_lst=count_images_in_folder(train_directory)
df= pd.DataFrame(count_lst,columns=['car_name','count'])
        

df

Unnamed: 0,car_name,count
0,Dodge Dakota Crew Cab 2010,41
1,GMC Canyon Extended Cab 2012,40
2,Hyundai Santa Fe SUV 2012,42
3,Dodge Durango SUV 2012,44
4,Chevrolet Silverado 1500 Extended Cab 2012,44
...,...,...
191,Tesla Model S Sedan 2012,39
192,GMC Savana Van 2012,68
193,Chevrolet Express Van 2007,35
194,Hyundai Veracruz SUV 2012,42


In [10]:
output_csv_path = 'datasets/image_counts_by_folder.csv'
df.to_csv(output_csv_path, index=False)

In [14]:
df

Unnamed: 0,car_name,count
0,Dodge Dakota Crew Cab 2010,41
1,GMC Canyon Extended Cab 2012,40
2,Hyundai Santa Fe SUV 2012,42
3,Dodge Durango SUV 2012,44
4,Chevrolet Silverado 1500 Extended Cab 2012,44
...,...,...
191,Tesla Model S Sedan 2012,39
192,GMC Savana Van 2012,68
193,Chevrolet Express Van 2007,35
194,Hyundai Veracruz SUV 2012,42


In [16]:
print(max(df['count'].values))

68


as we can see the maximum number of car images of a certain car model of a particular year is 68 , keeping this as the threshold , we are going to upsample every image with a count less that this , to balance out the dataset

In [31]:
df

Unnamed: 0,car_name,count
0,Dodge Dakota Crew Cab 2010,41
1,GMC Canyon Extended Cab 2012,40
2,Hyundai Santa Fe SUV 2012,42
3,Dodge Durango SUV 2012,44
4,Chevrolet Silverado 1500 Extended Cab 2012,44
...,...,...
191,Tesla Model S Sedan 2012,39
192,GMC Savana Van 2012,68
193,Chevrolet Express Van 2007,35
194,Hyundai Veracruz SUV 2012,42


In [69]:
import cv2
import os
import numpy as np

main_folder = 'datasets/train'

def augment_images(model_name, threshold, datagen):
    class_folder = os.path.join(main_folder, model_name)
    images = [cv2.imread(os.path.join(class_folder, i)) for i in os.listdir(class_folder) if i.endswith(('.jpg', '.png'))]

    # Filter out None images that failed to load
    images = [img for img in images if img is not None]

    count = len(images)
    img_index = 0
    
    while count < threshold:
        print('model name is  ', model_name)

        if not images:
            print('No images found in the folder or all images failed to load.')
            break  # Exit if there are no valid images to augment

        print('count of images is ', count)
        
        # Ensure the image is loaded and has correct dimensions
        if images[img_index] is not None and images[img_index].ndim == 3:
            x = np.expand_dims(images[img_index], axis=0)
            
            # Check if the expanded image has 4 dimensions as expected
            if x.ndim == 4:
                for _ in datagen.flow(x, batch_size=1, save_prefix='aug', save_to_dir=class_folder):
                    count += 1
                    break  # Generate one image at a time and then break out of the loop
            else:
                print(f"Image at index {img_index} does not have the correct shape after expand_dims.")
        else:
            print(f"Image at index {img_index} is None or does not have the correct dimensions.")

        # Move to the next image, wrapping around if needed
        img_index = (img_index + 1) % len(images)
    
    final_len = len(os.listdir(class_folder))
    print('after augmenting ', final_len)


In [70]:
from keras.src.legacy.preprocessing.image import ImageDataGenerator

# Create an ImageDataGenerator instance
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'

)

In [71]:
import pandas as pd
threshold=68
for index, row in df.iterrows():
    # Access row data can access by row['car_name'] and row['count']
    car_name=row['car_name']
    car_images_count=row['count']
    if car_images_count<threshold:
        augment_images(car_name,threshold,datagen)
        
        

after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augmenting  68
after augment

libpng error: Read Error


model name is   Chevrolet Cobalt SS 2010
count of images is  57
model name is   Chevrolet Cobalt SS 2010
count of images is  58
model name is   Chevrolet Cobalt SS 2010
count of images is  59
model name is   Chevrolet Cobalt SS 2010
count of images is  60
model name is   Chevrolet Cobalt SS 2010
count of images is  61
model name is   Chevrolet Cobalt SS 2010
count of images is  62
model name is   Chevrolet Cobalt SS 2010
count of images is  63
model name is   Chevrolet Cobalt SS 2010
count of images is  64
model name is   Chevrolet Cobalt SS 2010
count of images is  65
model name is   Chevrolet Cobalt SS 2010
count of images is  66
model name is   Chevrolet Cobalt SS 2010
count of images is  67
after augmenting  69
model name is   Dodge Caliber Wagon 2012
count of images is  41
model name is   Dodge Caliber Wagon 2012
count of images is  42
model name is   Dodge Caliber Wagon 2012
count of images is  43
model name is   Dodge Caliber Wagon 2012
count of images is  44
model name is   Dod

In [79]:
final_look=count_images_in_folder(train_directory)
check_df= pd.DataFrame(final_look,columns=['car_name','count'])

In [81]:
check_df['count']==68

0      True
1      True
2      True
3      True
4      True
       ... 
191    True
192    True
193    True
194    True
195    True
Name: count, Length: 196, dtype: bool

## As we can see above all of the images folder have been adjusted to have the threshold amount of images