This file is used to shuffle the data and split it into train, validation and test datasets.

In [1]:
# Import necessary libraries

import os 
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed NumPy's legacy global RNG. Any later call that draws from np.random without
# its own seed becomes repeatable on a fresh top-to-bottom run of the notebook.
np.random.seed(1)

In [2]:
# Root folder of the dataset; every immediate sub-folder holds the images of one class.
base_path = os.path.join(os.curdir, "dataset-resized")

# Build {class id: class folder name}.
# Ids are assigned by enumerating the class folders in sorted order, so they do not
# depend on the order in which the filesystem happens to return directory entries.
# Stray files in the dataset root and folders that contain no files are ignored and
# never consume an id. A missing dataset folder yields an empty mapping rather than
# an exception.
class_names = []
if os.path.isdir(base_path):
    for name in sorted(os.listdir(base_path)):
        folder = os.path.join(base_path, name)
        if os.path.isdir(folder) and any(
            os.path.isfile(os.path.join(folder, item)) for item in os.listdir(folder)
        ):
            class_names.append(name)

categories = dict(enumerate(class_names))

print("All Categories:")
for key, value in categories.items():
    print(key, "|", value)

All Categories:
0 | glass
1 | metal
2 | paper
3 | plastic


In [None]:
# NOTE(review): unexecuted scratch cell holding three bare integer literals. Nothing
# visible in this notebook refers to them, so this looks like a leftover that can be
# deleted — confirm before removing.
7746
79777
7231

In [3]:
#
def add_class_name_prefix(df, col_name, sep='\\'):
    r"""Prefix every filename in ``df[col_name]`` with its class folder.

    The class name is taken to be the part of the filename that precedes the
    first digit, e.g. ``glass279.jpg`` becomes ``glass\glass279.jpg`` with the
    default separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the filenames. The column is overwritten in place.
    col_name : str
        Name of the column that contains the bare filenames.
    sep : str, optional
        Separator placed between the class folder and the filename.
        Defaults to a backslash.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``col_name`` updated.

    Raises
    ------
    ValueError
        If a filename contains no digit, so no class prefix can be derived.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    first_digit = re.compile(r"\d")

    def _with_prefix(name):
        match = first_digit.search(name)
        if match is None:
            raise ValueError(
                f"cannot derive a class prefix from {name!r}: filename contains no digit"
            )
        return name[:match.start()] + sep + name

    df[col_name] = df[col_name].apply(_with_prefix)
    return df


# List containing all the filenames in the dataset
filenames_list = []

# Category id of each entry in filenames_list; each folder of the dataset holds one class of data
categories_list = []


for category, class_name in categories.items():
    # os.listdir returns entries in arbitrary order; sorting keeps the pre-shuffle
    # order (and therefore the seeded shuffle below) identical across machines.
    filenames = sorted(os.listdir(os.path.join(base_path, class_name)))
    filenames_list.extend(filenames)
    categories_list.extend([category] * len(filenames))


df = pd.DataFrame({
    'filename': filenames_list,
    'category': categories_list
})


df = add_class_name_prefix(df, 'filename')

# Shuffle the dataframe. The seed is passed explicitly so that re-running this cell
# on its own reproduces the same order instead of depending on how much of NumPy's
# global RNG stream has already been consumed.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)


print('number of elements = ' , len(df))
df.head(10)

number of elements =  4000


Unnamed: 0,filename,category
0,glass\glass279.jpg,0
1,metal\metal169.jpg,1
2,glass\glass648.jpg,0
3,paper\paper241.jpg,2
4,metal\metal250.jpg,1
5,paper\paper501.jpg,2
6,paper\paper943.jpg,2
7,glass\glass830.jpg,0
8,metal\metal34.jpg,1
9,glass\glass882.jpg,0


In [4]:
# Hold out 20% of the rows, then divide that hold-out into validation (70%) and test (30%).
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=1)
validate_df, test_df = train_test_split(holdout_df, test_size=0.3, random_state=1)

# Give every split a fresh 0..n-1 index.
train_df, validate_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, validate_df, test_df)
)

# Row counts of the training and validation splits.
total_train = len(train_df)
total_validate = len(validate_df)

print('train size =', total_train , '\nvalidation size =', total_validate, '\ntest size =', len(test_df))

train size = 3200 
validation size = 560 
test size = 240


In [5]:
# Write each split to its own CSV file in the working directory, without the index column.
for csv_name, split_df in (
    ("train.csv", train_df),
    ("validate.csv", validate_df),
    ("test.csv", test_df),
):
    split_df.to_csv(csv_name, index=False)