## Refer to [Dataset_preparation.md](https://github.com/YatharthDedhia/Eklavya-Smart-Stand/tree/Yatharth-programs/dataset) for detailed explaination
### **Prerequisites:**
* Download [CIFAR 100 Kaggle](https://www.kaggle.com/datasets/fedesoriano/cifar100?select=file.txt) and upload it to/
 Google Drive
 Note: Extracted dataset images have been provided [here](https://github.com/YatharthDedhia/Eklavya-Smart-Stand/tree/Yatharth-programs/dataset).

### Import all required libraries

In [1]:
import pandas as pd
import numpy as np
import os

import cv2
import shutil

#from tqdm import tqdm
# tqdm doesn't work well in colab.
# This is the solution:
# https://stackoverflow.com/questions/41707229/tqdm-printing-to-newline
import tqdm.notebook as tq
#for i in tq.tqdm(...):

import matplotlib.pyplot as plt
%matplotlib inline

### Link colab to Google drive containing dataset files

In [2]:
#For using google drive:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Extract images and labels from Dataset

In [8]:
# Define paths to train meta and test dataset files in dataset directory/folder

path_train = '/content/drive/MyDrive/cifar-100-python/train'
path_meta = '/content/drive/MyDrive/cifar-100-python/meta'
path_test = '/content/drive/MyDrive/cifar-100-python/test'

In [9]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [14]:
train_dict = unpickle(path_train)
names_dict = unpickle(path_meta)

In [15]:
fine_labels_list = train_dict[b'fine_labels']
coarse_labels_list = train_dict[b'coarse_labels']

fine_label_names_list = names_dict[b'fine_label_names']
coarse_label_names_list = names_dict[b'coarse_label_names']

In [16]:
# Create the df_train dataframe
df_train = pd.DataFrame(fine_labels_list, columns=['fine_labels'])

# Create new columns
df_train['coarse_labels'] = coarse_labels_list
df_train['image_num'] = df_train.index + 100000

# Create the image_id column
def create_imageid(row):
    
    image_id = str(row['fine_labels']) + '_' + str(row['coarse_labels']) + '_' + str(row['image_num']) +'.jpg'
    
    return image_id

df_train['image_id'] = df_train.apply(create_imageid, axis=1)


# Create the fine and coarse label names columns

def create_finelabelname(x):
    
    # this returns bytes: b'apple'
    name = fine_label_names_list[x]
    
    # convert bytes to string: 'apple'
    name = name.decode("utf-8") 
    
    return name


def create_coarselabelname(x):
    
    # this returns bytes: b'apple'
    name = coarse_label_names_list[x]
    
    # convert bytes to string: 'apple'
    name = name.decode("utf-8") 
    
    return name


df_train['fine_label_names'] = df_train['fine_labels'].apply(create_finelabelname)

df_train['coarse_label_names'] = df_train['coarse_labels'].apply(create_coarselabelname)


# Remove unnecessary columns
df_train = df_train.drop('image_num', axis=1)

# Reorder the columns
cols = ['image_id', 'fine_label_names', 'fine_labels', 'coarse_label_names', 'coarse_labels']
df_train = df_train[cols]

df_train.head()

Unnamed: 0,image_id,fine_label_names,fine_labels,coarse_label_names,coarse_labels
0,19_11_100000.jpg,cattle,19,large_omnivores_and_herbivores,11
1,29_15_100001.jpg,dinosaur,29,reptiles,15
2,0_4_100002.jpg,apple,0,fruit_and_vegetables,4
3,11_14_100003.jpg,boy,11,people,14
4,1_1_100004.jpg,aquarium_fish,1,fish,1


In [17]:
test_dict = unpickle(path_test)

In [18]:
fine_labels_list = test_dict[b'fine_labels']
coarse_labels_list = test_dict[b'coarse_labels']

fine_label_names_list = names_dict[b'fine_label_names']
coarse_label_names_list = names_dict[b'coarse_label_names']

In [19]:
# Create the df_test dataframe
df_test = pd.DataFrame(fine_labels_list, columns=['fine_labels'])

# Create new columns
df_test['coarse_labels'] = coarse_labels_list
df_test['image_num'] = df_test.index + 200000

# Create the image_id column
def create_imageid(row):
    
    image_id = str(row['fine_labels']) + '_' + str(row['coarse_labels']) + '_' + str(row['image_num']) +'.jpg'
    
    return image_id

df_test['image_id'] = df_test.apply(create_imageid, axis=1)


# Create the fine and coarse label names columns

def create_finelabelname(x):
    
    # this returns bytes: b'apple'
    name = fine_label_names_list[x]
    
    # convert bytes to string: 'apple'
    name = name.decode("utf-8") 
    
    return name


def create_coarselabelname(x):
    
    # this returns bytes: b'apple'
    name = coarse_label_names_list[x]
    
    # convert bytes to string: 'apple'
    name = name.decode("utf-8") 
    
    return name


df_test['fine_label_names'] = df_test['fine_labels'].apply(create_finelabelname)

df_test['coarse_label_names'] = df_test['coarse_labels'].apply(create_coarselabelname)


# Remove unnecessary columns
df_test = df_test.drop('image_num', axis=1)

# Reorder the columns
cols = ['image_id', 'fine_label_names', 'fine_labels', 'coarse_label_names', 'coarse_labels']
df_test = df_test[cols]

df_test.head()

Unnamed: 0,image_id,fine_label_names,fine_labels,coarse_label_names,coarse_labels
0,49_10_200000.jpg,mountain,49,large_natural_outdoor_scenes,10
1,33_10_200001.jpg,forest,33,large_natural_outdoor_scenes,10
2,72_0_200002.jpg,seal,72,aquatic_mammals,0
3,51_4_200003.jpg,mushroom,51,fruit_and_vegetables,4
4,71_10_200004.jpg,sea,71,large_natural_outdoor_scenes,10


Make CSV file

In [20]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)

In [21]:
train_csv = pd.read_csv('/content/train.csv')
test_csv = pd.read_csv('/content/test.csv')

In [22]:
train_data = pd.DataFrame(train_csv)
test_data =  pd.DataFrame(test_csv)

#CLEAR

In [23]:
train_dict = unpickle(path_train)
train_matrix = train_dict[b'data']

test_dict = unpickle(path_test)
test_matrix = test_dict[b'data']

In [24]:
print(test_matrix.shape)
train_data.head()

(10000, 3072)


Unnamed: 0,image_id,fine_label_names,fine_labels,coarse_label_names,coarse_labels
0,19_11_100000.jpg,cattle,19,large_omnivores_and_herbivores,11
1,29_15_100001.jpg,dinosaur,29,reptiles,15
2,0_4_100002.jpg,apple,0,fruit_and_vegetables,4
3,11_14_100003.jpg,boy,11,people,14
4,1_1_100004.jpg,aquarium_fish,1,fish,1


## Filter out equal number of people and not people images from the whole dataset

In [25]:
# Create people directory
if os.path.isdir('people_images') == True:
  dir = '/content/people_images'
  for f in os.listdir(dir):
      os.remove(os.path.join(dir, f))

if os.path.isdir('people_images') == False:
    train_images = 'people_images'
    os.mkdir(train_images)


# Create not_people directory
if os.path.isdir('not_people_images') == True:
  dir = '/content/not_people_images'
  for f in os.listdir(dir):
      os.remove(os.path.join(dir, f))

else:
    train_images = 'not_people_images'
    os.mkdir(train_images)

In [26]:
#Combined dataframe of both test and train
combined_data = train_data.append(test_data)
combined_matrix = np.append(train_matrix,test_matrix)

In [27]:
random = combined_data[(combined_data['fine_labels']==0)].sample(31,replace=False)        # Initialize random dataframe
# select 31 random images from each class(fine label)
for i in range(1,100):
  label_index = combined_data[(combined_data['fine_labels']==i)].sample(31,replace=False)
  random = random.append(label_index)

In [28]:
# People dataframe from combined
people_data = combined_data[(combined_data['coarse_label_names']=='people')]

# Randomize people dataframe
people = people_data.sample(frac=1)
people.head()

Unnamed: 0,image_id,fine_label_names,fine_labels,coarse_label_names,coarse_labels
39356,11_14_139356.jpg,boy,11,people,14
13984,46_14_113984.jpg,man,46,people,14
38003,98_14_138003.jpg,woman,98,people,14
2717,11_14_102717.jpg,boy,11,people,14
8589,98_14_208589.jpg,woman,98,people,14


In [29]:
# Remove all the people elements 
not_people = random[random.coarse_label_names != 'people']

# add 55 random elements from not people to make 3000 images (equal to people)
not_people = not_people.append(not_people.sample(n=55))

#Randomize not_people
not_people = not_people.sample(frac=1)

#Convert and store not_people dataframe to csv
# not_people.to_csv('not_people.csv')
print(len(not_people))
not_people.head()

3000


Unnamed: 0,image_id,fine_label_names,fine_labels,coarse_label_names,coarse_labels
37610,23_10_137610.jpg,cloud,23,large_natural_outdoor_scenes,10
11317,87_5_111317.jpg,television,87,household_electrical_devices,5
8240,77_13_208240.jpg,snail,77,non-insect_invertebrates,13
36559,31_11_136559.jpg,elephant,31,large_omnivores_and_herbivores,11
28179,94_6_128179.jpg,wardrobe,94,household_furniture,6


In [30]:
count = 0
print(len(people))

3000


In [31]:
# Prepare train images
for i in range(0, train_matrix.shape[0]):
    if(train_data.loc[i,'image_id'] in people.values):
      
      # Get the image_id from the df_train dataframe
      image_id = train_data.loc[i, 'image_id']


      # Select an image
      row = train_matrix[i]

      # Extract each channel
      ch0 = row[0:1024] 
      ch1 = row[1024:2048]
      ch2 = row[2048:]

      # Reshape to 32x32
      ch0 = np.reshape(ch0, (32,32)) # red
      ch1 = np.reshape(ch1, (32,32)) # green
      ch2 = np.reshape(ch2, (32,32)) # blue

      # Stack the matrices along the channel axis
      image = np.dstack((ch0, ch1, ch2))

      
      # Save the image in the folder
      # that we created.
      fname = image_id
      dst = os.path.join('people_images', fname)
      
      # If cv2.COLOR_RGB2BGR is not used then the saved images appear blue.
      cv2.imwrite(dst, image)
      count = count + 1

# Not-People
    if(train_data.loc[i,'image_id'] in not_people.values):
      # Get the image_id from the df_train dataframe
      image_id = train_data.loc[i, 'image_id']


      # Select an image
      row = train_matrix[i]

      # Extract each channel
      ch0 = row[0:1024] 
      ch1 = row[1024:2048]
      ch2 = row[2048:]

      # Reshape to 32x32
      ch0 = np.reshape(ch0, (32,32)) # red
      ch1 = np.reshape(ch1, (32,32)) # green
      ch2 = np.reshape(ch2, (32,32)) # blue

      # Stack the matrices along the channel axis
      image = np.dstack((ch0, ch1, ch2))

      
      # Save the image in the folder
      # that we created.
      fname = image_id
      dst = os.path.join('not_people_images', fname)
      
      # If cv2.COLOR_RGB2BGR is not used then the saved images appear blue.
      cv2.imwrite(dst, image)
      count = count + 1

In [32]:
# Prepare test images
for i in range(0, test_matrix.shape[0]):
    if(test_data.loc[i,'image_id'] in people.values):
      
      # Get the image_id from the df_train dataframe
      image_id = test_data.loc[i, 'image_id']


      # Select an image
      row = test_matrix[i]

      # Extract each channel
      ch0 = row[0:1024] 
      ch1 = row[1024:2048]
      ch2 = row[2048:]

      # Reshape to 32x32
      ch0 = np.reshape(ch0, (32,32)) # red
      ch1 = np.reshape(ch1, (32,32)) # green
      ch2 = np.reshape(ch2, (32,32)) # blue

      # Stack the matrices along the channel axis
      image = np.dstack((ch0, ch1, ch2))

      
      # Save the image in the folder
      # that we created.
      fname = image_id
      dst = os.path.join('people_images', fname)
      
      # If cv2.COLOR_RGB2BGR is not used then the saved images appear blue.
      cv2.imwrite(dst, image)
      count = count + 1

# Not-People
    if(test_data.loc[i,'image_id'] in not_people.values):
      # Get the image_id from the df_train dataframe
      image_id = test_data.loc[i, 'image_id']


      # Select an image
      row = test_matrix[i]

      # Extract each channel
      ch0 = row[0:1024] 
      ch1 = row[1024:2048]
      ch2 = row[2048:]

      # Reshape to 32x32
      ch0 = np.reshape(ch0, (32,32)) # red
      ch1 = np.reshape(ch1, (32,32)) # green
      ch2 = np.reshape(ch2, (32,32)) # blue

      # Stack the matrices along the channel axis
      image = np.dstack((ch0, ch1, ch2))

      
      # Save the image in the folder
      # that we created.
      fname = image_id
      dst = os.path.join('not_people_images', fname)
      
      # If cv2.COLOR_RGB2BGR is not used then the saved images appear blue.
      cv2.imwrite(dst, image)
      count = count + 1

In [33]:
# Convert and store people dataframe to csv
# people.to_csv('people.csv')

## Save Dataset as zip file to download

In [34]:
!zip -r /content/people.zip /content/people_images
from google.colab import files
files.download("/content/people.zip")

!zip -r /content/not_people.zip /content/not_people_images
from google.colab import files
files.download("/content/not_people.zip")

  adding: content/people_images/ (stored 0%)
  adding: content/people_images/11_14_100599.jpg (deflated 12%)
  adding: content/people_images/98_14_111806.jpg (deflated 13%)
  adding: content/people_images/98_14_148443.jpg (deflated 14%)
  adding: content/people_images/11_14_133743.jpg (deflated 14%)
  adding: content/people_images/11_14_111325.jpg (deflated 12%)
  adding: content/people_images/11_14_201541.jpg (deflated 12%)
  adding: content/people_images/11_14_103253.jpg (deflated 12%)
  adding: content/people_images/2_14_141335.jpg (deflated 14%)
  adding: content/people_images/2_14_116187.jpg (deflated 13%)
  adding: content/people_images/46_14_115510.jpg (deflated 12%)
  adding: content/people_images/11_14_204141.jpg (deflated 15%)
  adding: content/people_images/46_14_107542.jpg (deflated 13%)
  adding: content/people_images/35_14_105650.jpg (deflated 15%)
  adding: content/people_images/11_14_104011.jpg (deflated 12%)
  adding: content/people_images/46_14_107630.jpg (deflated 14

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: content/not_people_images/ (stored 0%)
  adding: content/not_people_images/37_9_110147.jpg (deflated 13%)
  adding: content/not_people_images/39_5_142032.jpg (deflated 11%)
  adding: content/not_people_images/61_3_135601.jpg (deflated 15%)
  adding: content/not_people_images/8_18_208402.jpg (deflated 13%)
  adding: content/not_people_images/47_17_136013.jpg (deflated 13%)
  adding: content/not_people_images/6_7_121716.jpg (deflated 12%)
  adding: content/not_people_images/10_3_120052.jpg (deflated 14%)
  adding: content/not_people_images/77_13_127792.jpg (deflated 12%)
  adding: content/not_people_images/48_18_139848.jpg (deflated 13%)
  adding: content/not_people_images/3_8_201746.jpg (deflated 13%)
  adding: content/not_people_images/26_13_207387.jpg (deflated 13%)
  adding: content/not_people_images/30_0_139662.jpg (deflated 14%)
  adding: content/not_people_images/49_10_106747.jpg (deflated 15%)
  adding: content/not_people_images/96_17_102285.jpg (deflated 12%)
  adding:

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# Total number of images:
count

5945