In [42]:
import os
import pandas as pd
from tqdm import tqdm
from shutil import copyfile as copy

df = pd.read_csv('final.csv')
df.head()

Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000000.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000001.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000002.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000003.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000004.jpg


In [44]:
df.columns

Index(['tags', 'path'], dtype='object')

### 1. Remove Unnecessary Columns

In [47]:
del df['img_link']
del df['img_id']
del df['likes']
del df['comments']

df.head()

KeyError: 'img_link'

In [49]:
for col in ['img_link', 'img_id', 'likes']:
    if col in df.columns:
        del df[col]
df.head()

Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000000.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000001.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000002.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000003.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000004.jpg


In [51]:
#2. Finding all the tags
for tags in df['tags'][:20]:
    print([tag.strip() for tag in tags.split(',')])


['Clouds', 'Sky', 'Atmosphere', 'Blue Sky']
['Bird', 'Ornithology', 'Hummingbird']
['Sea', 'Rainbow', 'Rainfall', 'Subtropical']
['Cherry Blossoms', 'Road', 'Japan', 'Sakura']
['Cape Marguerite', 'Flower', 'Plant']
['Relaxing Rest Under The Tree']
['Cottontail Rabbit', 'Wild Rabbit', 'Grass']
['Crocus', 'Flowers', 'Spring', 'Plant']
['Coffee', 'Café', 'Vacation', 'Drink', 'Table']
['Waterfall', 'Fall', 'Epic', 'Nature', 'Light']
['Field', 'Morning', 'Sunrise', 'Dawn', 'Nature']
['Bird', 'Robin', 'Birdwatching', 'Animal']
['Seedlings', 'Soil', 'Spring Flowers', 'Bloom']
['Road', 'Highway', 'Countryside', 'Hills']
['Cityscape', 'Monochrome', 'Bridge', 'City']
['Sakura', 'Cherry Blossoms', 'Spring', 'Water']
['Background', 'Easter', 'Eggs', 'Template', 'Art']
['Bird', 'Indian Chat', 'Ornithology', 'Species']
['Town', 'Night', 'Winter', 'Village', 'Season']
['Flower', 'Viola', 'Pansy', 'Blossom', 'Bloom']


In [53]:
unique_tags = set()
for tags in df['tags'][:20]:
    print([tag.strip() for tag in tags.split(',')])



['Clouds', 'Sky', 'Atmosphere', 'Blue Sky']
['Bird', 'Ornithology', 'Hummingbird']
['Sea', 'Rainbow', 'Rainfall', 'Subtropical']
['Cherry Blossoms', 'Road', 'Japan', 'Sakura']
['Cape Marguerite', 'Flower', 'Plant']
['Relaxing Rest Under The Tree']
['Cottontail Rabbit', 'Wild Rabbit', 'Grass']
['Crocus', 'Flowers', 'Spring', 'Plant']
['Coffee', 'Café', 'Vacation', 'Drink', 'Table']
['Waterfall', 'Fall', 'Epic', 'Nature', 'Light']
['Field', 'Morning', 'Sunrise', 'Dawn', 'Nature']
['Bird', 'Robin', 'Birdwatching', 'Animal']
['Seedlings', 'Soil', 'Spring Flowers', 'Bloom']
['Road', 'Highway', 'Countryside', 'Hills']
['Cityscape', 'Monochrome', 'Bridge', 'City']
['Sakura', 'Cherry Blossoms', 'Spring', 'Water']
['Background', 'Easter', 'Eggs', 'Template', 'Art']
['Bird', 'Indian Chat', 'Ornithology', 'Species']
['Town', 'Night', 'Winter', 'Village', 'Season']
['Flower', 'Viola', 'Pansy', 'Blossom', 'Bloom']


In [55]:
unique_tags = set()
unique_tags.add('Cloud')
unique_tags.add('Bloom')
unique_tags.add('Village')
unique_tags.add('Winter')
unique_tags.add('Pansy')
unique_tags

{'Bloom', 'Cloud', 'Pansy', 'Village', 'Winter'}

### 2. Finding all the tags

In [58]:
t = []

for tags in df['tags'][:20]:
    
    t += [tag.strip() for tag in tags.split(',')]
print(len(t))
print(len(set(t)))
tags = list(set(t))

80
69


In [60]:
t = []

for tags in df['tags']:
    if isinstance(tags, str):   # skip NaN / floats
        t += [tag.strip() for tag in tags.split(',')]


In [61]:
df['tags'] = df['tags'].fillna('')

t = []

for tags in df['tags']:
    t += [tag.strip() for tag in tags.split(',')]


In [63]:
df = df.dropna(subset=['tags'])

t = []

for tags in df['tags']:
    t += [tag.strip() for tag in tags.split(',')]


In [65]:
t = []

for tags in df['tags'].dropna():
    t.extend(tag.strip() for tag in tags.split(','))
df.head()

Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000000.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000001.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000002.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000003.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000004.jpg


### 3. Creating Folders for each Tag

In [67]:
for tag in tqdm(tags):
    
    try:
        os.mkdir('Dataset/' + tag)
    except:
        pass

100%|██████████| 39/39 [00:00<00:00, 5976.76it/s]


### 4. Saving Images in Specific Folders

In [72]:
for data in df.values[:5]:

    tags = data[0]
    tags = ['Dataset/' + tag.strip() + '/' for tag in tags.split(',')]

    src = data[1]
    for i in tags:
        dst = i + src.split('/')[-1]

        print(src)
        print(dst)
    print('------')
    #break    

Imgs/A00000000.jpg
Dataset/Clouds/A00000000.jpg
Imgs/A00000000.jpg
Dataset/Sky/A00000000.jpg
Imgs/A00000000.jpg
Dataset/Atmosphere/A00000000.jpg
Imgs/A00000000.jpg
Dataset/Blue Sky/A00000000.jpg
------
Imgs/A00000001.jpg
Dataset/Bird/A00000001.jpg
Imgs/A00000001.jpg
Dataset/Ornithology/A00000001.jpg
Imgs/A00000001.jpg
Dataset/Hummingbird/A00000001.jpg
------
Imgs/A00000002.jpg
Dataset/Sea/A00000002.jpg
Imgs/A00000002.jpg
Dataset/Rainbow/A00000002.jpg
Imgs/A00000002.jpg
Dataset/Rainfall/A00000002.jpg
Imgs/A00000002.jpg
Dataset/Subtropical/A00000002.jpg
------
Imgs/A00000003.jpg
Dataset/Cherry Blossoms/A00000003.jpg
Imgs/A00000003.jpg
Dataset/Road/A00000003.jpg
Imgs/A00000003.jpg
Dataset/Japan/A00000003.jpg
Imgs/A00000003.jpg
Dataset/Sakura/A00000003.jpg
------
Imgs/A00000004.jpg
Dataset/Cape Marguerite/A00000004.jpg
Imgs/A00000004.jpg
Dataset/Flower/A00000004.jpg
Imgs/A00000004.jpg
Dataset/Plant/A00000004.jpg
------


In [74]:
error = 0

for data in tqdm(df.values):
    
    tags = data[0]
    tags = ['Dataset/' + tag.strip() + '/' for tag in tags.split(',')]
    
    src = data[1]
    
    for i in tags:
        
        dst = i + src.split('/')[-1]
        
        try:
            copy(src, dst)
        except:
            error += 1

100%|██████████| 205866/205866 [01:59<00:00, 1718.32it/s]


### 5. Checking Number of Folders

In [81]:
folders = os.listdir('Dataset')

print(len(folders))

15


### 6. Checking number of Images in Each Folder

In [84]:
for folder in folders[:10]:
    print(folder)
    print(len(os.listdir('Dataset/')))

,
15
a
15
c
15
e
15
g
15
i
15
l
15
m
15
n
15
o
15


In [125]:
folder_ = []
freq    = []

for folder in tqdm(folders):
    
    try:
        freq.append(len(os.listdir('Dataset/' + folder)))
        folder_.append(folder)

    except:
        pass

100%|██████████████████████████████████| 64010/64010 [00:01<00:00, 33199.12it/s]


### 7. Top 10 Folders with most number of Images

In [132]:
df_ = pd.DataFrame()

df_['folder'] = folder_
df_['freq']   = freq

df_.sort_values(by = 'freq', ascending = False).head(10)

Unnamed: 0,folder,freq
29522,Nature,29598
18068,Flower,19423
46010,Animal,10040
14111,Plant,8514
38982,Bird,7894
50386,Landscape,7539
13600,Water,7305
22399,Flowers,6868
55534,Sea,6431
29188,Bloom,6194


### 8. Top 10 Folders with least number of Images

In [134]:
df_.sort_values(by = 'freq', ascending = True).head(10)

Unnamed: 0,folder,freq
31999,Karroo,1
37018,Snow Woman,1
37019,Don'T Like,1
37021,Arboreal,1
37023,Nandine,1
37024,Girl With Mobile,1
37025,Topkapä± Palace,1
37027,Ekh,1
37028,Fischer-Art In Sebnitz,1
37029,Five Grain Rice,1


### 9. How many folders are having images more than 50

In [140]:
df_[df_['freq'] >= 50]

Unnamed: 0,folder,freq
36,Usa,444
55,Dried,95
61,Volkswagen,53
88,Hunter,54
109,Porsche,52
...,...,...
63909,Frogs,69
63923,Flow,551
63944,Poultry,332
63967,Police,61


### 10. Removing the Folders with less than 50 Images

In [156]:
for i in tqdm(df_[df_['freq'] < 50]['folder']):
    
    
    src = 'Dataset/' + i
    dst = 'Temp/' + i
    
    shutil.move(src, dst)    

100%|██████████████████████████████████| 62028/62028 [00:05<00:00, 10766.93it/s]


In [163]:
df = pd.read_csv('final.csv')

del df['img_link']
del df['img_id']
del df['likes']
del df['comments']

df.to_csv('final.csv', index = False)