# This notebook combines image names with scraped data into a single DataFrame and CSV file

In [1]:
import pandas as pd
import numpy as np
import os, fnmatch
import re

In [2]:
# Gather the .jpg file names found in the scrape folder, then order them alphabetically
file_list = fnmatch.filter(os.listdir('scrape_images/'), '*.jpg')
file_list.sort()

In [3]:
# Peek at the first five collected file names
file_list[0:5]

['agaricusarvensis1.jpg',
 'agaricusarvensis2.jpg',
 'agaricusarvensis3.jpg',
 'agaricusarvensis4.jpg',
 'agaricusarvensis5.jpg']

In [4]:
# Drop OS / Jupyter housekeeping entries from the file list if they are present.
# Each name is attempted independently so a missing '.DS_Store' no longer
# prevents '.ipynb_checkpoints' from being removed, and only the expected
# ValueError (entry not in list) is caught instead of every exception.
missing_entry = False
for unwanted in ('.DS_Store', '.ipynb_checkpoints'):
    try:
        file_list.remove(unwanted)
    except ValueError:
        missing_entry = True

# Keep the original single notification when at least one entry was absent
if missing_entry:
    print('File did not exist to be removed')

File did not exist to be removed


In [5]:
# One row per image file, held in a single 'image_name' column
file_df = pd.DataFrame({'image_name': file_list})

In [6]:
# Show the first five rows of the image-name frame
file_df.head(5)

Unnamed: 0,image_name
0,agaricusarvensis1.jpg
1,agaricusarvensis2.jpg
2,agaricusarvensis3.jpg
3,agaricusarvensis4.jpg
4,agaricusarvensis5.jpg


In [7]:
# Load the scraped mushroom data from its CSV file
mush_df = pd.read_csv(filepath_or_buffer='CSVs/my_mushrooms.csv')

In [8]:
# (rows, columns) of the loaded mushroom data
mush_df.shape

(139, 11)

In [9]:
# Show the first five rows of the mushroom data
mush_df.head(5)

Unnamed: 0,family,location,min_cap_cm,max_cap_cm,min_stem_hgt_cm,max_stem_hgt_cm,min_stem_diam_cm,max_stem_diam_cm,edibility,long_desc,name
0,Agaricaceae,"North America, Europe",8.0,20.0,8.0,10.0,2.0,3.0,Edible and good,"Agaricus arvensis, commonly known as the horse...",agaricus_arvensis
1,Agaricaceae,"North America, Europe",10.0,20.0,10.0,20.0,2.0,4.0,Edible and good,"Agaricus augustus, also known as the prince, i...",agaricus_augustus
2,Polyporaceae,"North America, Europe",7.0,18.0,3.0,7.0,1.0,3.0,Inedible,"From above, this pale orange polypore looks li...",albatrellus_confluens
3,Polyporaceae,"North America, Europe",7.0,18.0,3.0,7.0,1.0,3.0,Edible,"Albatrellus ovinus, also known as Sheep Polypo...",albatrellus_ovinus
4,Pluteaceae,"North America, Europe",7.0,12.0,8.0,13.0,1.5,2.0,Inedible,"The cap of this large, grayish brown, fleshy ...",amanita_ceciliae


In [16]:
# Create a short name column to use as the merge/join key.
# regex=False makes the underscore removal a literal replacement regardless of
# the pandas version's default, and the key is lower-cased so it is normalised
# the same way as the lower-cased file-name key it is joined against.
mush_df['short_name'] = (
    mush_df['name']
    .str.replace('_', '', regex=False)
    .str.lower()
)

In [10]:
# Build the join key: the leading run of non-digit characters of each file name
file_df['short_name'] = [
    re.search(r'[^\d]*', fname).group()
    for fname in file_df["image_name"]
]

In [12]:
# Separate the image number from the file name in case it is needed later.
# str.extract returns the first run of digits/hyphens and yields NaN when a
# file name contains none, instead of raising AttributeError on a failed
# re.search(...).group() call.
file_df['image_num'] = file_df["image_name"].str.extract(r'([\d-]+)', expand=False)

In [13]:
# Lower-case every string value in the file dataframe so the merge key cannot
# mismatch on letter case. DataFrame.applymap is deprecated in recent pandas,
# so map each column element-wise via Series.map, which exists in all versions.
# Non-string values are passed through unchanged.
file_df = file_df.apply(
    lambda col: col.map(lambda s: s.lower() if isinstance(s, str) else s)
)

In [15]:
# Check the derived short_name / image_num columns on the first five rows
file_df.head(n=5)

Unnamed: 0,image_name,short_name,image_num
0,agaricusarvensis1.jpg,agaricusarvensis,1
1,agaricusarvensis2.jpg,agaricusarvensis,2
2,agaricusarvensis3.jpg,agaricusarvensis,3
3,agaricusarvensis4.jpg,agaricusarvensis,4
4,agaricusarvensis5.jpg,agaricusarvensis,5


In [18]:
# Check the new short_name column on the first five mushroom rows
mush_df.head(n=5)

Unnamed: 0,family,location,min_cap_cm,max_cap_cm,min_stem_hgt_cm,max_stem_hgt_cm,min_stem_diam_cm,max_stem_diam_cm,edibility,long_desc,name,short_name
0,Agaricaceae,"North America, Europe",8.0,20.0,8.0,10.0,2.0,3.0,Edible and good,"Agaricus arvensis, commonly known as the horse...",agaricus_arvensis,agaricusarvensis
1,Agaricaceae,"North America, Europe",10.0,20.0,10.0,20.0,2.0,4.0,Edible and good,"Agaricus augustus, also known as the prince, i...",agaricus_augustus,agaricusaugustus
2,Polyporaceae,"North America, Europe",7.0,18.0,3.0,7.0,1.0,3.0,Inedible,"From above, this pale orange polypore looks li...",albatrellus_confluens,albatrellusconfluens
3,Polyporaceae,"North America, Europe",7.0,18.0,3.0,7.0,1.0,3.0,Edible,"Albatrellus ovinus, also known as Sheep Polypo...",albatrellus_ovinus,albatrellusovinus
4,Pluteaceae,"North America, Europe",7.0,12.0,8.0,13.0,1.5,2.0,Inedible,"The cap of this large, grayish brown, fleshy ...",amanita_ceciliae,amanitaceciliae


In [19]:
# Join each image row to its mushroom record on the shared short_name key.
# This is an inner join, so rows without a matching key on the other side are dropped.
final_df = pd.merge(left=file_df, right=mush_df, how='inner', on='short_name')

In [20]:
# Inspect the image/key columns of the merged frame
final_df.loc[:, ['image_name', 'short_name', 'image_num', 'name']]

Unnamed: 0,image_name,short_name,image_num,name
0,agaricusarvensis1.jpg,agaricusarvensis,1,agaricus_arvensis
1,agaricusarvensis2.jpg,agaricusarvensis,2,agaricus_arvensis
2,agaricusarvensis3.jpg,agaricusarvensis,3,agaricus_arvensis
3,agaricusarvensis4.jpg,agaricusarvensis,4,agaricus_arvensis
4,agaricusarvensis5.jpg,agaricusarvensis,5,agaricus_arvensis
...,...,...,...,...
622,tylopilusfelleus3.jpg,tylopilusfelleus,3,tylopilus_felleus
623,tylopilusfelleus4.jpg,tylopilusfelleus,4,tylopilus_felleus
624,tylopilusfelleus5.jpg,tylopilusfelleus,5,tylopilus_felleus
625,tylopilusfelleus6.jpg,tylopilusfelleus,6,tylopilus_felleus


In [21]:
# Write the merged image + mushroom data to CSV without the row index.
# The path has no placeholders, so a plain string literal is used instead of an f-string.
final_df.to_csv('CSVs/all_pic_and_mush.csv', index=False)