# Prepare Data

If you ran _Download available data.ipynb_ you end up with a bunch of directories, tar and zip files in your data directory.  
In this notebook we'll extract and clean the data so you can use it for your model.  
The end result is a big csv file with all file paths and labels

In [138]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.vision import *
from fastai.basics import *
import shutil
from PIL import Image
from tqdm import tqdm

In [80]:
path = Path('/home/jupyter/data')

## Import data


### UPMC Food-101

#### Get data

In [3]:
! mkdir -p {path}/UPMC_Food101

In [4]:
! tar -C {path}/UPMC_Food101 -zxf {path}/UPMC_Food101.tar.gz

tar (child): /home/jupyter/data/UPMC_Food101.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [5]:
! rm -rf {path}/UPMC_Food101/texts_html
! rm -rf {path}/UPMC_Food101/texts_txt

Remove tar file if you want to save space on your machine

In [18]:
! rm -rf {path}/UPMC_Food101.tar.gz

#### Process data

In [81]:
def create_img_df(subdir, path):
    cats = [str(x).split('/')[-1] for x in (path/subdir).ls()]
    files = []; labels = []

    for cat in cats:
        tmp = [cat + '/' +  str(x).split('/')[-1] for x in (path/subdir/cat).ls() if x.match('*.jpg')]
        files += tmp; labels += [cat]*len(tmp)
    
    df = pd.DataFrame({'file':files, 'label':labels})
    df['file'] = df['file'].apply(lambda x: subdir+'/'+x)
    return df

In [82]:
img_dir = 'UPMC_Food101/images'
train = create_img_df(img_dir+'/train', path)
test = create_img_df(img_dir+'/test', path)
train['test'], test['test'] = False, True
upmc_labels = pd.concat([train, test], axis=0)

In [83]:
upmc_labels.head()

Unnamed: 0,file,label,test
0,UPMC_Food101/images/train/paella/paella_488.jpg,paella,False
1,UPMC_Food101/images/train/paella/paella_408.jpg,paella,False
2,UPMC_Food101/images/train/paella/paella_380.jpg,paella,False
3,UPMC_Food101/images/train/paella/paella_394.jpg,paella,False
4,UPMC_Food101/images/train/paella/paella_482.jpg,paella,False


#### write to csv 

In [84]:
upmc_labels.to_csv(path/'UPMC_Food101/upmc101.csv', index=False)

### UEC 256

#### Get data

In [45]:
! mkdir -p path/'UEC_256'

In [46]:
! unzip -qq {path}/'dataset256.zip' -d {path}/'UEC_256'

#### Process data

In [85]:
img_dir = path/'UEC_256/UECFOOD256'

In [86]:
category_fn = path/'UEC_256/UECFOOD256/category.txt'

In [87]:
cats = pd.read_csv(category_fn, sep='\t')
id2cat = {i:n for i, n in zip(cats['id'], cats['name'])}
cat2id = {n:i for i, n in zip(cats['id'], cats['name'])}

In [88]:
files, labels = [], []

for i in id2cat.keys():
    i_dir = str(i)
    subf = img_dir/i_dir
    tmp = [subf/str(x).split('/')[-1] for x in (path/subf).ls() if x.match('*.jpg')]
    files += tmp; labels += [id2cat[i]]*len(tmp)

In [89]:
uec256 = pd.DataFrame({'file':files, 'label':labels})
uec256.head()

Unnamed: 0,file,label
0,/home/jupyter/data/UEC_256/UECFOOD256/1/14885.jpg,rice
1,/home/jupyter/data/UEC_256/UECFOOD256/1/68.jpg,rice
2,/home/jupyter/data/UEC_256/UECFOOD256/1/14263.jpg,rice
3,/home/jupyter/data/UEC_256/UECFOOD256/1/9112.jpg,rice
4,/home/jupyter/data/UEC_256/UECFOOD256/1/14350.jpg,rice


#### Write to csv

In [90]:
uec256.to_csv(path/'UEC_256/uec256.csv', index=False)

### Google Images Old


#### Get data

In [91]:
dir_google_single =  path/'downloaded_imgs/train_single'
dir_google_double =  path/'downloaded_imgs/train_double'

In [92]:
labs_google_single = pd.read_csv(path/'downloaded_imgs/labels_single.csv')
labs_google_double = pd.read_csv(path/'downloaded_imgs/labels_double.csv')

#### Process data

In [93]:
labs_google_single.rename({'name':'file'}, axis=1, copy=False, inplace=True)
labs_google_double.rename({'name':'file'}, axis=1, copy=False, inplace=True)

In [94]:
def format_lab(x):
    labs = x.split()
    labs = [lab.replace('_', ' ').strip() for lab in labs]
    return ';'.join(labs)

In [95]:
labs_google_single['label'] = labs_google_single['label'].apply(format_lab)
labs_google_double['label'] = labs_google_double['label'].apply(format_lab)

labs_google_single['file'] = labs_google_single['file'].apply(lambda x: dir_google_single/x)
labs_google_double['file'] = labs_google_double['file'].apply(lambda x: dir_google_double/x)

In [96]:
labs_google_single.head()

Unnamed: 0,file,label
0,/home/jupyter/data/downloaded_imgs/train_singl...,Apple
1,/home/jupyter/data/downloaded_imgs/train_singl...,Apple
2,/home/jupyter/data/downloaded_imgs/train_singl...,Apple
3,/home/jupyter/data/downloaded_imgs/train_singl...,Apple
4,/home/jupyter/data/downloaded_imgs/train_singl...,Apple


In [97]:
labs_google_double.head()

Unnamed: 0,file,label
0,/home/jupyter/data/downloaded_imgs/train_doubl...,Apple;Apple Slices
1,/home/jupyter/data/downloaded_imgs/train_doubl...,Batavia Salad;Salad
2,/home/jupyter/data/downloaded_imgs/train_doubl...,Boiled Potatoes;Cookies
3,/home/jupyter/data/downloaded_imgs/train_doubl...,Bread;Bread Rolls
4,/home/jupyter/data/downloaded_imgs/train_doubl...,Bread Rolls;Pasta


#### Write to CSV

In [98]:
out_google_single = path/'downloaded_imgs/labels_google_img_single.csv'
out_google_double = path/'downloaded_imgs/labels_google_img_double.csv'

In [99]:
labs_google_single.to_csv(out_google_single, index=False, header=True)
labs_google_double.to_csv(out_google_double, index=False, header=True)

### Google images new

#### Get data

In [100]:
! unzip -qq {path}/'new_google_imgs2.zip' -d {path}/'new_google_imgs'

replace /home/jupyter/data/new_google_imgs/new_google_imgs/googleimg_single/googleimg_labels_single_3.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


#### Process data

In [101]:
new_img_path = '{}/new_google_imgs/new_google_imgs/'.format(path)
single_map = 'googleimg_single/'
double_map = 'googleimg_double/'
single_fig_out ='googleimg_single_total/'
double_fig_out ='googleimg_double_total/'

In [102]:
single_csv = [x for x in os.listdir(new_img_path + single_map) if x.endswith('.csv')]
double_csv = [x for x in os.listdir(new_img_path + double_map) if x.endswith('.csv')]
single_maps = [x for x in os.listdir(new_img_path + single_map) if os.path.isdir(new_img_path + single_map + x)]
double_maps = [x for x in os.listdir(new_img_path + double_map) if os.path.isdir(new_img_path + double_map + x)]

In [103]:
def concatenate_csv(csv_list,path):
    df = pd.DataFrame({'file':[], 'label':[]})
    for csv in csv_list:
        temp_df = pd.read_csv(path + csv)
        df = pd.concat([df,temp_df], axis = 0)
    return df    
    
def move_files (in_list,base_path,out_path):
    out_path = base_path + out_path
    if not os.path.isdir(out_path):
        os.makedirs(out_path)
    for folder in in_list:
        in_path = base_path + folder
        for file in os.listdir(in_path):
            if os.path.isfile(in_path + '/' + file):
                shutil.move(in_path + '/' + file, out_path + '/' +file)        

In [104]:
move_files(single_maps,new_img_path + single_map,single_fig_out)
move_files(double_maps,new_img_path + double_map,double_fig_out)
newgoogleimg_single_df = concatenate_csv(single_csv,new_img_path + single_map)
newgoogleimg_double_df = concatenate_csv(double_csv,new_img_path + double_map)
newgoogleimg_single_df['file'] =newgoogleimg_single_df['file'].apply(lambda x:new_img_path + single_map + single_fig_out + '/' +x)
newgoogleimg_double_df['file'] =newgoogleimg_double_df['file'].apply(lambda x:new_img_path + double_map + double_fig_out + '/' +x)
newgoogleimg_single_df['label'] = newgoogleimg_single_df['label'].apply(format_lab)
newgoogleimg_double_df['label'] = newgoogleimg_double_df['label'].apply(format_lab)

In [105]:
newgoogleimg_single_df.head()

Unnamed: 0,file,label
0,/home/jupyter/data/new_google_imgs/new_google_...,peas
1,/home/jupyter/data/new_google_imgs/new_google_...,peas
2,/home/jupyter/data/new_google_imgs/new_google_...,peas
3,/home/jupyter/data/new_google_imgs/new_google_...,peas
4,/home/jupyter/data/new_google_imgs/new_google_...,peas


In [106]:
newgoogleimg_double_df.head()

Unnamed: 0,file,label
0,/home/jupyter/data/new_google_imgs/new_google_...,chicken;salad
1,/home/jupyter/data/new_google_imgs/new_google_...,chicken;salad
2,/home/jupyter/data/new_google_imgs/new_google_...,chicken;salad
3,/home/jupyter/data/new_google_imgs/new_google_...,chicken;salad
4,/home/jupyter/data/new_google_imgs/new_google_...,chicken;salad


#### write to CSV

In [107]:
newgoogleimg_single_df.to_csv(new_img_path + single_map +  'single_total.csv', index=False, header=True)
newgoogleimg_double_df.to_csv(new_img_path + single_map +  'double_total.csv', index=False, header=True)

### Food Recipe

#### Get data
__Note__: These two files are not added Google Storage yet. Download them via Slack and make sure they are in your path directory

In [108]:
dir_food_recipe = path/'Food_Recipe'
labs_food_recipe = pd.read_csv(path/'food_recipe_paths.csv')
labs_food_recipe_trans = pd.read_csv(path/'german_to_english.csv')

#### Process data

In [109]:
labs_food_recipe['file'] = labs_food_recipe['short_path'].apply(lambda x: x.replace('Food_Recipe/', ''))

Translate german categories to English

In [110]:
trans_dict = {labs_food_recipe_trans.iloc[i, 0]:labs_food_recipe_trans.iloc[i, 1] for i in 
              range(len(labs_food_recipe_trans.index))}

Manual Correction to translation dictionary

In [111]:
trans_dict['brotaufstrich'] = 'spread'
trans_dict['spaghettisalat'] = 'spaghetti salad'
trans_dict['käsesuppe'] = 'cheese soup'
trans_dict['curry'] = 'curry'
trans_dict['tomatensuppe'] = 'tomato soup'
trans_dict['cannelloni'] = 'cannelloni'
trans_dict['kasseler'] = 'smoked and salted pork'
trans_dict['maultaschen'] = 'pasta dough filled with meat, spinach or onions'
trans_dict['zwiebelkuchen'] = 'onion cake'

In [112]:
labs_food_recipe['eng_cat'] = labs_food_recipe['category'].apply(lambda x: trans_dict[x])

Paste & risotto more categories than it should (like deserts, couscous, etc.)  
Same for the 'Vegetables category'

In [113]:
labs_food_recipe.loc[labs_food_recipe['reduced_category'] == 'Pasta & risotto', 'reduced_category'] = ''
labs_food_recipe.loc[labs_food_recipe['reduced_category'] == 'Vegetables', 'reduced_category'] = ''

Some categories are 'error'. Let's get rid of these

In [114]:
labs_food_recipe.loc[labs_food_recipe['eng_cat'] == 'Error', 'eng_cat'] = ''

Some categories say 'no_cat'. Drop rows where both category columns contain 'no_cat'  
If only one column contains it, replace with empty string

In [115]:
labs_food_recipe = labs_food_recipe[(labs_food_recipe['reduced_category'] != 'no_cat') |
                                   (labs_food_recipe['category'] != 'no_cat')]

labs_food_recipe.loc[labs_food_recipe['eng_cat'] == 'No_cat', 'eng_cat'] = ''
labs_food_recipe.loc[labs_food_recipe['reduced_category'] == 'no_cat', 'reduced_category'] = ''

Replace _ with spaces; replace & with and

In [116]:
labs_food_recipe['reduced_category'] = labs_food_recipe['reduced_category'].apply(lambda x: 
                                                                                  x.replace('_', ' ').
                                                                                  replace('&', 'and')) 
labs_food_recipe['eng_cat'] = labs_food_recipe['eng_cat'].apply(lambda x: x.replace('_', ' ').
                                                                replace('&', 'and')) 

Add Label

In [117]:
labs_food_recipe['label'] = '<start>'+ labs_food_recipe['reduced_category'] + ';' + labs_food_recipe['eng_cat']+'<end>'
labs_food_recipe['label'] = labs_food_recipe['label'].apply(lambda x: x.replace('<start>;', '').
                                                                        replace(';<end>', '').
                                                                        replace('<start>', '').
                                                                        replace('<end>', ''))

labs_food_recipe['label'] = labs_food_recipe['label'].apply(lambda x: x.lower())

labs_food_recipe = labs_food_recipe[['file', 'label', 'recipe_id']]
labs_food_recipe.reset_index(drop=True, inplace=True)

In [118]:
labs_food_recipe['file'] = labs_food_recipe['file'].apply(lambda x: dir_food_recipe/x)

Drop `recipe_id` column

In [119]:
labs_food_recipe.drop('recipe_id', axis=1, inplace=True)

In [120]:
labs_food_recipe.head()

Unnamed: 0,file,label
0,/home/jupyter/data/Food_Recipe/000/recipe-1007...,penne
1,/home/jupyter/data/Food_Recipe/005/recipe-1007...,penne
2,/home/jupyter/data/Food_Recipe/012/recipe-1007...,penne
3,/home/jupyter/data/Food_Recipe/043/recipe-1007...,penne
4,/home/jupyter/data/Food_Recipe/200/recipe-1007...,penne


#### Write to CSV

In [121]:
labs_food_recipe.to_csv(path/'Food_Recipe/labels_food_recipe.csv', index=False, header=True)

### Food-101

#### Get data


In [122]:
def get_lab_df(txt_path):
    
    labs = pd.read_csv(txt_path, header = None, sep='\t')
    labs[1] = labs[0].apply(lambda x: x.split('/')[0])  # Add label
    labs.columns = ['file', 'label']
    
    return labs

For food101, training and testing image names are in separate text files.  
We put in in 1 data frame, with an indicatory column separating train/test examples

In [123]:
fns_train_food101 = path/'food-101/meta/train.txt'
fns_test_food101 = path/'food-101/meta/test.txt'

In [124]:
dir_food101 =  path/'food-101/images'

#### Process data 

In [125]:
train_labs = get_lab_df(fns_train_food101)
test_labs = get_lab_df(fns_test_food101)

In [126]:
train_labs['test'] = False
test_labs['test'] = True
labs_food101 = pd.concat([train_labs, test_labs], axis = 0)

In [127]:
labs_food101['file'] = labs_food101['file'].apply(lambda x: x+'.jpg')
labs_food101['label'] = labs_food101['label'].apply(format_lab)

In [128]:
labs_food101['file'] = labs_food101['file'].apply(lambda x: dir_food101/x)

In [129]:
labs_food101.head()

Unnamed: 0,file,label,test
0,/home/jupyter/data/food-101/images/apple_pie/1...,apple pie,False
1,/home/jupyter/data/food-101/images/apple_pie/1...,apple pie,False
2,/home/jupyter/data/food-101/images/apple_pie/1...,apple pie,False
3,/home/jupyter/data/food-101/images/apple_pie/1...,apple pie,False
4,/home/jupyter/data/food-101/images/apple_pie/1...,apple pie,False


#### Write to CSV

In [130]:
labs_food101.to_csv(path/'food-101/labels_food101.csv', index=False, header=True)

## Generate combined dataframe and clean data

In [177]:
labels = [uec256, upmc_labels, labs_food101, labs_google_double, labs_google_single,newgoogleimg_single_df,
          newgoogleimg_double_df, labs_food_recipe]


In [178]:
list(map(lambda x: len(x), labels))


[31395, 90704, 101000, 12358, 7044, 50172, 19049, 492228]

In [179]:
labels = pd.concat(labels, axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Change some classes

In [180]:
labels['label'] = labels['label'].apply(lambda x: x.replace("chicken-'n'-egg on rice", 'chicken egg rice')
                                                   .replace('baby guling', 'roast baby pork')
                                                   .replace('trunip pudding', 'turnip pudding'))

labels['label'] = labels['label'].apply(lambda x: x.replace("_", " "))
labels['test'] = True
labels['valid'] = True

In [181]:
sys.getsizeof(labels) / (1024**2)


143.32507991790771

clean data

In [182]:
def check_imgs_exist(df):
    drop_list = []
    for count, row in tqdm(df.iterrows()):
        try:
            img = Image.open(row['file'])
        except (IOError, SyntaxError, UnboundLocalError) as e: 
            drop_list.append(count)            
    df_red = df.drop(index = drop_list)
    return df_red

In [183]:
labels = labels[~labels.label.isna()]
len(labels), len(labels[labels.label.isna()])
labels = check_imgs_exist(labels)

803950it [02:28, 5406.42it/s] 


In [189]:
labels.isnull().values.any()

False

Create dataset with reduced classes

In [190]:
def create_dict(df):
    class_dict = {}
    for itemcount,row in class_red.iterrows():
        class_dict[row['original']] = row['reduced']
    return class_dict    

def reduce_classes(row,class_dict):
    split_food_item = row.split(';')
    new_food_item = ''
    for count, food_item in enumerate(split_food_item):
        try:
            food_tf = class_dict[food_item]
        except:
            food_tf = food_item
        if count == 0:    
            new_food_item += food_tf
        else:
            new_food_item += ';' + food_tf
    return new_food_item        
                

In [191]:
class_red = pd.read_csv(str(path) + '/food_classes_orig_red.csv')
class_dict = create_dict(class_red)
labels_red = labels.copy()
labels_red['label'] = labels_red['label'].apply(lambda x: reduce_classes(x,class_dict))
labels_red = labels_red[labels_red.label != 'invalid']

## Write data to csv

In [192]:
labels_red.to_csv(path/'food_label_concat_new_red.csv', index=False)
labels.to_csv(path/'food_label_concat_new.csv', index=False)

In [193]:
labels_red.isnull().values.any()

False