# PreProcess Metadata

## Imports

In [None]:
import os
import json
import csv
import re
from tqdm import tqdm


## Definitions

In [None]:
# Detect whether the notebook runs inside Google Colab: the flag is True only
# when both Colab imports succeed, and False on any failure.
try:
  import google.colab
  from google.colab import drive
except Exception:
  # Best effort: any import problem means "not on Colab".
  use_google_drive = False
else:
  use_google_drive = True


In [None]:
# Pick the folder that holds the metadata files. On Colab the data lives on
# the mounted Google Drive; otherwise a local "metadata" folder is used.
if use_google_drive:
  metadata_folder = "/content/gdrive/MyDrive/ColabData/amazon/"

  drive.mount("/content/gdrive")
else:
  # os.path.join with an empty last component appends the platform's path
  # separator, so this stays "metadata\\" on Windows and becomes "metadata/"
  # elsewhere instead of hard-coding the Windows separator.
  metadata_folder = os.path.join("metadata", "")

# Both CSV files sit directly inside the metadata folder.
metadata_source = os.path.join(metadata_folder, "metadata.csv")
images_source = os.path.join(metadata_folder, "images.csv")


In [None]:
# Lookup of image id -> image path, filled while reading the images CSV.
images = {}
# Running counters keyed by product type (and by product type + color).
product_numbers = {}
product_color_numbers = {}

# Color classes that are accepted as-is; every other extracted class
# (e.g. "other" or the "notdefined" fallback) is treated as not valid.
valid_colors = ["multicolor", "black", "white", "gray", "red", "green", "blue", "orange", "purple", "yellow", "pink", "brown"]

# Color class -> patterns that map a free-text color name onto that class.
# Patterns are matched case-insensitively; several entries look like common
# misspellings or non-English names (e.g. "sliver", "bule", "blau").
colors = {
    "multicolor": ["multi"],
    "other": ["other"],
    "black": ["black", "asphalt", "caviar", "graphite", "Schwarz"],
    "white": ["white", "ivory"],
    "gray": ["gray", "grey", "chrome", "silver", "steel", "charcoal", "nickel", "aluminium", "anthracite", "ash", "dove", "fog",
             "iron", "pewter", "platinum", "slate", "sliver", "smoke", "stainless"],
    "red": ["red", "rose", "bordeaux", "burgundy", "maroon", "merlot", "autumn", "berry", "brick", "burgandy", "cherry",
            "garnet", "mahogany", "maron"],
    "green": ["green", "mint", "olive", "alligator", "aloe", "cadet", "emerald", "lagoon", "lemongrass", "sage", "seafoam", "sod",
              "teal", "turquoise"],
    "blue": ["blue", "navy", "aqua", "denim", "azure", "blau", "bule", "sapphire", "sky"],
    "orange": ["orange", "fire", "flame", "fawn", "pumpkin", "rust"],
    "purple": ["purple", "amethyst", "fuchsia", "heather", "lavender", "lilac", "magenta"],
    "yellow": ["yellow", "gold", "amber", "brass", "butter", "canary", "citrine", "flax"],
    "pink": ["pink", "blush", "champagne", "linen"],
    # NOTE: "biege" and "braun" must be separate entries; without the comma
    # Python concatenates adjacent string literals into "biegebraun".
    "brown": ["brown", "beige", "biege", "braun", "bronze", "camel", "caramel", "sand", "tan", "walnut", "acorn",
              "antique", "barnwood", "chestnut", "chocolate", "cognac", "ecru", "hemp", "khaki", "oak", "saddle", "taupe", "wenge"],
}


## Helper Functions

In [None]:
def get_value(obj, field):
  """Return the value stored under ``field`` in ``obj``, or ``None`` if absent.

  ``obj`` is expected to be a dict (the callers pass parsed JSON objects).
  The lookup no longer touches ``obj['item_id']`` first, so objects without
  an ``item_id`` key do not raise ``KeyError``.
  """
  return obj.get(field)

def find_value(values):
  """Pick the preferred ``"value"`` out of a list of value entries.

  Returns the ``"value"`` of the first entry that either has no
  ``"language_tag"`` key or whose language tag starts with ``"en_"``.
  Returns ``""`` when ``values`` is ``None`` or no entry qualifies.
  """
  if values is None:
    return ""

  preferred = (
      entry["value"]
      for entry in values
      if "language_tag" not in entry.keys()
      or str(entry["language_tag"]).startswith("en_")
  )
  return next(preferred, "")

def iterate_value(values, index):
  """Return ``values[index]``, or ``""`` when there is no such element.

  Only a plain ``list`` longer than ``index`` yields an element; ``None``,
  any other type, or a list that is too short gives the empty string.
  """
  # None is covered by the type check: it is not a list either.
  if type(values) is not list or len(values) <= index:
    return ""

  return values[index]

def extract_value(value: str, options=None, notdefined="notdefined"):
  """Classify ``value`` into one of the categories described by ``options``.

  Args:
    value: Free-text value to classify; ``None`` yields ``notdefined``.
    options: Mapping of category name -> list of patterns. Each pattern is
      handed to ``re.search`` with ``re.IGNORECASE``, so it is treated as a
      regular expression and may match anywhere inside ``value``.
      Defaults to no categories.
    notdefined: Result returned when ``value`` is ``None`` or nothing matches.

  Returns:
    The single matching category name, ``"multicolor"`` as soon as a second,
    different category also matches, or ``notdefined`` when none matches.
  """
  if value is None:
    return notdefined

  # Avoid a shared mutable default; an omitted mapping means "no categories".
  if options is None:
    options = {}

  found = None

  for category, patterns in options.items():
    # One hit per category is enough; further hits in the same category
    # cannot change the outcome.
    if not any(re.search(pattern, value, re.IGNORECASE) for pattern in patterns):
      continue

    if found is not None and found != category:
      return "multicolor"
    found = category

  return notdefined if found is None else found

def get_next_number(numbers: dict, class1, class2):
  """Increment and return the counter for the ``class1``/``class2`` pair.

  The counter lives in ``numbers`` under the key ``f"{class1}{class2}"``;
  the first call for a pair returns 1. ``numbers`` is modified in place.
  """
  counter_key = f"{class1}{class2}"
  numbers[counter_key] = numbers.get(counter_key, 0) + 1
  return numbers[counter_key]


## Processing Metadata

In [None]:
# Build the image lookup: first CSV column -> "images/small/" followed by the
# fourth column without its last four characters (presumably the file
# extension -- verify against images.csv). Every row is stored, including a
# header row if the file has one.
# Files are opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(images_source, newline='', encoding='utf-8') as imagesfile:
  for row in csv.reader(imagesfile, delimiter=','):
    images[row[0]] = "images/small/" + row[3][0:-4]


# Write one CSV row per (item, image): the main image with sort index 0 and up
# to five entries of 'other_image_id' with sort indices 1-5.
with open(metadata_source, 'w', newline='', encoding='utf-8') as csvfile:
  csvheader = ["item_id", "color", "extracted-color", "valid-color", "product_type", "main_image_id", "country", "product_image_sort"]
  csvwriter = csv.writer(csvfile, dialect='excel')
  csvwriter.writerow(csvheader)

  for file in tqdm(os.listdir(metadata_folder), desc="Files"):
    # Only the JSON listing files are processed; the CSV files live in the
    # same folder and are skipped here.
    if not file.endswith(".json"):
      continue

    with open(os.path.join(metadata_folder, file), 'r', encoding='utf-8') as jsonfile:
      # Each line holds one JSON object. readlines() is kept so tqdm knows
      # the total number of lines.
      for line in tqdm(jsonfile.readlines(), desc="Lines"):
        obj = json.loads(line)

        color = find_value(get_value(obj, 'color')).strip().replace("\n", "").replace("\t", "")
        extracted_color = extract_value(color, options=colors)
        valid_color = extracted_color if (extracted_color in valid_colors) else "not-valid"
        image_id = get_value(obj, 'main_image_id')
        product_type = find_value(get_value(obj, 'product_type'))

        # Items without a main image are skipped entirely.
        if image_id is None or image_id == "":
          continue

        item_id = get_value(obj, 'item_id')
        country = get_value(obj, 'country')
        other_images = get_value(obj, 'other_image_id')

        # (sort index, image id) pairs; a missing additional image shows up
        # as "" and is skipped below without shifting the later indices.
        candidates = [(0, image_id)]
        candidates.extend((i + 1, iterate_value(other_images, i)) for i in range(5))

        for sort_index, candidate in candidates:
          if candidate == "":
            continue

          # The counters are advanced once per written row. The per-color
          # counter goes into product_color_numbers (it previously landed in
          # product_numbers, leaving product_color_numbers unused). The
          # returned numbers are not written to the CSV.
          product_number = get_next_number(product_numbers, product_type, None)
          product_color_number = get_next_number(product_color_numbers, product_type, valid_color)

          # images[candidate] raises KeyError when the id is not present in
          # the images CSV.
          csvwriter.writerow(
              [item_id,
               color,
               extracted_color,
               valid_color,
               product_type,
               images[candidate],
               country,
               sort_index])
