In [1]:
# imports

import os
import pandas as pd
import numpy as np
import cv2

from PIL import Image

from sklearn.model_selection import train_test_split

# Fix NumPy's global random state so any random draws are repeatable
np.random.seed(seed=42)

In [3]:
# Load the Yelp photo metadata; lines=True reads the file as one JSON record per line
photo_json_path = '../../../data/yelp/photo.json'
photos = pd.read_json(photo_json_path, lines=True)

In [4]:
# Preview the first five rows of the freshly loaded metadata
photos.head(n=5)

Unnamed: 0,caption,photo_id,business_id,label
0,,MllA1nNpcp1kDteVg6OGUw,rcaPajgKOJC2vo_l3xa42A,inside
1,,YjxBE88Bf6CmTEF2LP1UNA,Kn23LDd740SBVJ7mum0fwg,inside
2,,1f7izSjM0WjkDRIVbPy1yw,ZkGDCVKSdf8m76cnnalL-A,food
3,,NcSlcDTEEeOaixotOPk-rA,bF8gv7k_rwZtiDLP2ZB04w,inside
4,,5IiIo5UKEW0lWqZ6sWrY_A,50Anorn0DJXFhBr9a9_gHQ,inside


In [5]:
# Count how many photos carry each label (most frequent first)

photos.loc[:, 'label'].value_counts()

food       114874
inside      52448
drink       18121
outside     11534
menu         3023
Name: label, dtype: int64

In [6]:
# Remove the caption and business ID columns:
# classification should rely strictly on the content of the image itself

unused_columns = ['caption', 'business_id']
photos = photos.drop(unused_columns, axis='columns')

In [7]:
# Confirm that only photo_id and label remain
photos.head(n=5)

Unnamed: 0,photo_id,label
0,MllA1nNpcp1kDteVg6OGUw,inside
1,YjxBE88Bf6CmTEF2LP1UNA,inside
2,1f7izSjM0WjkDRIVbPy1yw,food
3,NcSlcDTEEeOaixotOPk-rA,inside
4,5IiIo5UKEW0lWqZ6sWrY_A,inside


In [8]:
# Encode the string labels as integer class ids
# (ids follow the descending class frequencies: food, inside, drink, outside, menu)

label_to_id = {'food': 0, 'inside': 1, 'drink': 2, 'outside': 3, 'menu': 4}

# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would silently replace
# all labels with NaN. Only encode while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(photos['label']):
    photos['label'] = photos['label'].map(label_to_id)

In [9]:
# Check that the labels are now integer class ids
photos.head(n=5)

Unnamed: 0,photo_id,label
0,MllA1nNpcp1kDteVg6OGUw,1
1,YjxBE88Bf6CmTEF2LP1UNA,1
2,1f7izSjM0WjkDRIVbPy1yw,0
3,NcSlcDTEEeOaixotOPk-rA,1
4,5IiIo5UKEW0lWqZ6sWrY_A,1


In [10]:
# Share of each class, as a fraction of all photos (values sum to 1)

label_shares = photos['label'].value_counts(normalize=True)
label_shares

0    0.574370
1    0.262240
2    0.090605
3    0.057670
4    0.015115
Name: label, dtype: float64

In [11]:
# Save the revised CSV - kept commented out so the existing file is not overwritten

# photos.to_csv('../../../data/yelp/photos_with_labels.csv', index = False)

In [14]:
# Build one DataFrame per class by filtering on the integer label
# (0 = food, 1 = inside, 2 = drink, 3 = outside, 4 = menu)

food, inside, drink, outside, menu = (
    photos[photos['label'] == class_id] for class_id in range(5)
)

In [15]:
# Save the individual CSVs - kept commented out so the existing files are not overwritten

# food.to_csv('../../../data/yelp/food_with_labels.csv', index = False)
# inside.to_csv('../../../data/yelp/inside_with_labels.csv', index = False)
# drink.to_csv('../../../data/yelp/drink_with_labels.csv', index = False)
# outside.to_csv('../../../data/yelp/outside_with_labels.csv', index = False)
# menu.to_csv('../../../data/yelp/menu_with_labels.csv', index = False)