In [1]:
import os
import time
from datetime import datetime

In [2]:
# All sci-kit related imports 
import pandas as pd
import numpy as np

In [3]:
from threading import Thread
from multiprocessing import Process, cpu_count, Lock

In [4]:
from tqdm import tqdm

In [5]:
import cv2 

In [6]:
TRAIN_CSV_DIR = os.path.join(os.getcwd(),'guides\\isolated-dataset-csv\\IsolatedTrain.csv')
TEST_CSV_DIR = os.path.join(os.getcwd(),'guides\\isolated-dataset-csv\\IsolatedTest.csv')

In [7]:
TRAIN_CSV = pd.read_csv(TRAIN_CSV_DIR, usecols=["labels","directory"])
TEST_CSV = pd.read_csv(TEST_CSV_DIR, usecols=["labels","directory"])

In [8]:
TRAIN_CSV

Unnamed: 0,labels,directory
0,1,dataset\IsolatedTrain\1\bcc000000.bmp
1,1,dataset\IsolatedTrain\1\bcc000133.bmp
2,1,dataset\IsolatedTrain\1\bcc000134.bmp
3,1,dataset\IsolatedTrain\1\bcc000135.bmp
4,1,dataset\IsolatedTrain\1\bcc000136.bmp
...,...,...
34434,171,dataset\IsolatedTrain\171\bcc000067.bmp
34435,171,dataset\IsolatedTrain\171\bcc000068.bmp
34436,171,dataset\IsolatedTrain\171\bcc000069.bmp
34437,171,dataset\IsolatedTrain\171\bcc000071.bmp


In [9]:
class MultiProcessedLoader:
    def __init__(
        self,
        csv_file_dataframe,
        resize_dimensions=None,
    ):
        self.input = csv_file_dataframe.to_numpy()
        self.CPU_COUNT = cpu_count()
        self.splitted_array = np.array_split(self.input, self.CPU_COUNT*2)
        self._labels = []
        self._images = []
        self._directories = []
        
    
    def getItems(self):
        self._execute()
        return [self._labels,self._directories,self._images]
    
    def _execute(self):
        start = time.time()
        print("Multithreading Started")
        self._thread_runner()
        print("Time Taken to COMPLETE excecution {}".format(time.time() - start))

    def _thread_runner(self):
        thread_list = []
        for i in range(self.CPU_COUNT*2):
            name = "Thread {}".format(i)
            thread_list.append(
                Thread(
                    target=self.reader,
                    name=name,
                    args=(self.splitted_array[i], name, i),
                )
            )
            thread_list[i].start()

        for i, th in enumerate(thread_list):
            th.join()

    def reader(self, file, name, i):
        start = time.time()
        now = datetime.now().time()

        print("Starting {} at time {} \n".format(name, now))
        for i,f in enumerate(file):
            label = file[i, 0]
            result = file[i, 1]
            self._labels.append(label-1)
            self._directories.append(result.replace("\\","/"))
            self._images.append(cv2.resize(cv2.imread(result, cv2.COLOR_BGR2RGB),(224,224)))
                
        print("{} completed execution in {} s \n".format(name, time.time() - start))

In [10]:
TRAIN_FileLoader = MultiProcessedLoader(csv_file_dataframe=TRAIN_CSV)

In [11]:
TEST_FileLoader = MultiProcessedLoader(csv_file_dataframe=TEST_CSV)

In [12]:
training_data = TRAIN_FileLoader.getItems()

Multithreading Started
Starting Thread 0 at time 02:47:35.318532 

Starting Thread 1 at time 02:47:35.325512 

Starting Thread 2 at time 02:47:35.328504 

Starting Thread 3 at time 02:47:35.329502 

Starting Thread 4 at time 02:47:35.332493 

Starting Thread 5 at time 02:47:35.334487 

Starting Thread 6 at time 02:47:35.336483 

Starting Thread 7 at time 02:47:35.338478 

Thread 3 completed execution in 140.1401710510254 s 

Thread 1 completed execution in 141.99814367294312 s 

Thread 0 completed execution in 142.75194120407104 s 

Thread 5 completed execution in 142.85209774971008 s 

Thread 6 completed execution in 142.98588681221008 s 

Thread 2 completed execution in 143.14486384391785 s 

Thread 4 completed execution in 145.55478191375732 s 

Thread 7 completed execution in 145.98142290115356 s 

Time Taken to COMPLETE excecution 146.00336503982544


In [13]:
testing_data = TEST_FileLoader.getItems()

Multithreading Started
Starting Thread 0 at time 02:50:23.917145 

Starting Thread 1 at time 02:50:23.921134 

Starting Thread 2 at time 02:50:23.924125 

Starting Thread 3 at time 02:50:23.926121 

Starting Thread 4 at time 02:50:23.928115 

Starting Thread 5 at time 02:50:23.930113 

Starting Thread 6 at time 02:50:23.932106 

Starting Thread 7 at time 02:50:23.935096 

Thread 0 completed execution in 34.07504343986511 s 

Thread 3 completed execution in 35.04910469055176 s 

Thread 5 completed execution in 35.16315984725952 s 

Thread 2 completed execution in 35.94886779785156 s 

Thread 4 completed execution in 36.10015392303467 s 

Thread 1 completed execution in 36.13107180595398 s 

Thread 7 completed execution in 36.1649808883667 s 

Thread 6 completed execution in 36.209859132766724 s 

Time Taken to COMPLETE excecution 36.22581744194031


In [14]:
def data_tuple_to_dataframe(data):
    labels, directories, images = data
    dataset = pd.DataFrame()
    dataset['labels'] = labels
    dataset['images'] = images
    return dataset

In [15]:
training_dataframe = data_tuple_to_dataframe(training_data)
testing_dataframe = data_tuple_to_dataframe(testing_data)

In [16]:
training_dataframe

Unnamed: 0,labels,images
0,0,"[[[254, 254, 254], [254, 254, 254], [255, 255,..."
1,20,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,63,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,41,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,84,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
...,...,...
34434,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34435,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34436,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34437,170,"[[[254, 254, 254], [254, 254, 254], [255, 255,..."


In [17]:
testing_dataframe #34439

Unnamed: 0,labels,images
0,0,"[[[255, 255, 255], [254, 254, 254], [253, 253,..."
1,20,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,41,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,63,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,84,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
...,...,...
8515,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8516,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8517,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8518,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."


In [18]:
print(len(training_dataframe.iloc[0:15000]))
print(len(training_dataframe.iloc[15001:34439]))

15000
19438


In [20]:
training_dataframe.to_pickle('./pickle/isolated_train_data.pkl')

In [21]:
testing_dataframe.to_pickle('./pickle/isolated_test_data.pkl')

In [22]:
training_dataframe_unpickled_df = pd.read_pickle("./pickle/isolated_train_data.pkl")

In [23]:
testing_dataframe_unpickled_df = pd.read_pickle("./pickle/isolated_test_data.pkl")

In [24]:
training_dataframe_unpickled_df

Unnamed: 0,labels,images
0,0,"[[[254, 254, 254], [254, 254, 254], [255, 255,..."
1,20,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,63,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,41,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,84,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
...,...,...
34434,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34435,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34436,170,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
34437,170,"[[[254, 254, 254], [254, 254, 254], [255, 255,..."


In [25]:
testing_dataframe_unpickled_df

Unnamed: 0,labels,images
0,0,"[[[255, 255, 255], [254, 254, 254], [253, 253,..."
1,20,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,41,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,63,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,84,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
...,...,...
8515,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8516,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8517,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
8518,149,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
