In [1]:
import numpy as np
import pandas
from PIL import Image as pilImg
import os 
import cv2
from datetime import datetime
import matplotlib.pyplot as plt
import itertools

1. Підготовка Даних із завантаженого датасета із Англійскими словами
Source: https://www.robots.ox.ac.uk/~vgg/data/text/#sec-synth

Із file_path(шлях до txt файлу із описами картинок) отримує картинки у кількості number
Повертає тільки картинки з текстом розміром від 4 до 12 елементів

Takes the file path of images annotation txt file with the number of images names to be extracted
and returns the list of file names having label length <=12

In [2]:
def Extract_image_names(file_path, number):
    with open(file_path) as file:
        files = file.readlines()
        file.close()
        count = 0
        img_names = []
        for f in files:
            """
            Extract from 'SynthImageDataset./2425/1/115_Lube_45484.jpg 45484' only image lable - Lube
            """
            label = f.split('_')[1]
            if len(label) >= 4 and len(label) <= 12:
                img_names.append(f)
                count += 1
            if count == number:
                break
        images_names=['SynthImageDataset' + x.strip() for x in img_names]
        return images_names

In [3]:
train_images = Extract_image_names('SynthImageDataset/annotation_train.txt',200000)
train_images[:5]

['SynthImageDataset./2425/1/115_Lube_45484.jpg 45484',
 'SynthImageDataset./2425/1/114_Spencerian_73323.jpg 73323',
 'SynthImageDataset./2425/1/112_CARPENTER_11682.jpg 11682',
 'SynthImageDataset./2425/1/110_savannas_67969.jpg 67969',
 'SynthImageDataset./2425/1/109_unfix_82473.jpg 82473']

Очищує назву файлів забираючи із кінця цифри, що індикують номер картинки

In [4]:
#помістити всередину минулої функції
def clean_file_names(file_names):
    clean_files = []
    for file in file_names:
        main_folder, img_loc, extension = file.split('.')
        extension = extension.split(' ')[0]
        img_file = main_folder + img_loc + '.' + extension
        clean_files.append(img_file)
    return clean_files


train_cleaned = clean_file_names(train_images)

train_cleaned[:5]

['SynthImageDataset/2425/1/115_Lube_45484.jpg',
 'SynthImageDataset/2425/1/114_Spencerian_73323.jpg',
 'SynthImageDataset/2425/1/112_CARPENTER_11682.jpg',
 'SynthImageDataset/2425/1/110_savannas_67969.jpg',
 'SynthImageDataset/2425/1/109_unfix_82473.jpg']

In [5]:
train_data=pandas.DataFrame({'ImageName':train_cleaned})
train_data.head()

Unnamed: 0,ImageName
0,SynthImageDataset/2425/1/115_Lube_45484.jpg
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg
3,SynthImageDataset/2425/1/110_savannas_67969.jpg
4,SynthImageDataset/2425/1/109_unfix_82473.jpg


In [6]:
def extract_ground_truth(files):
    """
    Given the file names of images, extracts the Ground Truth Values and returns a list of Ground Truth Labels in All Capitals
    """
    txt_labels = []
    for file in files:
        ground_truth = file.split('_')[1]
        ground_truth = ground_truth.upper()
        txt_labels.append(ground_truth)
    return txt_labels   

Train_ground_truths = extract_ground_truth(train_cleaned)
#Add to csv Label for Images
train_data['Labels'] = Train_ground_truths
train_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/2425/1/115_Lube_45484.jpg,LUBE
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg,SPENCERIAN
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg,CARPENTER
3,SynthImageDataset/2425/1/110_savannas_67969.jpg,SAVANNAS
4,SynthImageDataset/2425/1/109_unfix_82473.jpg,UNFIX


Create Train Data csv file

In [7]:
train_data.to_csv('Train_data.csv')

Validation Data

In [8]:
Validation_images = Extract_image_names('SynthImageDataset/annotation_val.txt',12000)
val_cleaned = clean_file_names(Validation_images)
val_data = pandas.DataFrame({'ImageName':val_cleaned})
Val_ground_truths = extract_ground_truth(val_cleaned)
val_data['Labels'] = Val_ground_truths
val_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/2697/6/466_MONIKER_49537.jpg,MONIKER
1,SynthImageDataset/2697/6/464_FIRESTORM_29099.jpg,FIRESTORM
2,SynthImageDataset/2697/6/462_Repurchases_64997...,REPURCHASES
3,SynthImageDataset/2697/6/461_PIGTAIL_57575.jpg,PIGTAIL
4,SynthImageDataset/2697/6/460_landladies_43270.jpg,LANDLADIES


In [9]:
val_data.to_csv('Validation_data.csv')

Test Data

In [10]:
test_images = Extract_image_names('SynthImageDataset/annotation_test.txt',15000)
test_cleaned = clean_file_names(test_images)
test_data = pandas.DataFrame({'ImageName':test_cleaned})
test_ground_truths = extract_ground_truth(test_cleaned)
test_data['Labels'] = test_ground_truths
test_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/3000/7/182_slinking_71711.jpg,SLINKING
1,SynthImageDataset/3000/7/181_REMODELERS_64541.jpg,REMODELERS
2,SynthImageDataset/3000/7/180_Chronographs_1353...,CHRONOGRAPHS
3,SynthImageDataset/3000/7/179_Impeaching_38222.jpg,IMPEACHING
4,SynthImageDataset/3000/7/177_Loots_45256.jpg,LOOTS


In [11]:
test_data.to_csv('Test_data.csv')

Image Processing

In [12]:
def img_store_single_channel(destination_folder, files):
    """
    Takes the images in a folder, distination folder path and 
    converts the image to single channel gray scale,
    stores the image in the destination folder and returns image destination list
    """
    start = datetime.now()
    destination_list = []
    count = 1
    for file in files:
        #Removing the extra folder structures
        Name = file.split('/')[3]
        img = Name.split('_')[1]
        destination = destination_folder + str(count) + '_' + img + '.jpg'
        cv_img = cv2.imread(file)
        #So extracting image from any 1 channel gives a single channel Grayscale image
        cv_img_sc = cv_img[:, :, 1]
        cv2.imwrite(destination, cv_img_sc)
        destination_list.append(destination)
        count += 1
#         if count%10000==0:
#             print("Processed Images: ",count)
    print('Time Taken for Processing: ', datetime.now() - start)
    return destination_list

In [17]:
val_data = pandas.read_csv('Validation_data.csv')
val_files = val_data['ImageName'].values
val_files[0]
cv_img = cv2.imread(val_files[0])
# Displaying the image
cv2.imshow('image', cv_img)
cv2.waitKey()
cv_img_sc = cv_img[:, :, 1]
cv2.imwrite(r'Val_data\0_.jpg', cv_img_sc)

False

Processing Validation Data

In [13]:
val_data = pandas.read_csv('Validation_data.csv')
val_data.drop(['Unnamed: 0'], axis = 1, inplace = True)
val_files = val_data['ImageName'].values
val_dest = img_store_single_channel('Val_data/', val_files)

: 