In [None]:
import requests
from datetime import datetime, timedelta
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(nb_workers = 16, progress_bar=True)
from tqdm import tqdm
from glob import glob
from pathlib import Path
import ast
import json
import os
import requests
from PIL import Image
from sklearn.model_selection import train_test_split

from io import BytesIO
tqdm.pandas()

### Call the data.gov.sg API to fetch the trademark image URLs

In [None]:
# Lodgement-date window to scrape; both endpoints are included.
start_date = datetime(2009, 1, 1)
end_date = datetime(2017, 12, 31)  # inclusive

# Endpoint that is queried with a `lodgement_date` parameter.
url = "https://api.data.gov.sg/v1/technology/ipos/trademarks"

In [None]:

def get_data_for_date(row):
    """Fetch the trademark records lodged on a single date.

    Parameters
    ----------
    row : mapping
        Must provide ``row["date_str"]``; the value is sent as the
        ``lodgement_date`` query parameter to the module-level ``url``.

    Returns
    -------
    list of tuple, or None
        One ``(wordsInMark, chineseCharacter, url, descrOfDevice)`` tuple per
        usable item in the response. ``None`` is returned when the request
        fails, the status code is not 200, or the response has no items.

    Notes
    -----
    Items whose ``markIndex`` or ``documents`` entry is missing, ``None`` or
    empty are skipped individually, so one malformed item no longer discards
    every other record lodged on the same date.
    """
    date_str = row["date_str"]
    params = {"lodgement_date": date_str}
    try:
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            return None

        items = response.json()["items"]
        if not items:
            return None

        item_data = []
        for item in items:
            try:
                mark = item["markIndex"][0]
                record = (
                    mark["wordsInMark"],
                    mark["chineseCharacter"],
                    item["documents"][0]["url"],
                    mark["descrOfDevice"],
                )
            except (KeyError, IndexError, TypeError):
                # Malformed item (None/empty markIndex or documents, or a
                # missing field): drop this item only, keep the rest.
                continue
            item_data.append(record)
        return item_data
    except Exception:
        # Best effort: a failed day must not abort the whole parallel run.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so the scrape can still be interrupted.
        return None
    

def expand(row):
    """Unpack ``row['data_points']`` into a 4-tuple of its first four entries.

    The entries are read by position (0 through 3), so ``data_points`` must be
    an indexable sequence holding at least four elements.
    """
    fields = row['data_points']
    return tuple(fields[position] for position in range(4))


In [None]:
# Every calendar day from start_date to end_date (inclusive) as "YYYY-MM-DD".
day_count = (end_date - start_date).days + 1
all_dates = [
    (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range(day_count)
]

# One row per day; get_data_for_date is applied to each row in parallel.
df = pd.DataFrame({"date_str": all_dates})
df["data_points"] = df.parallel_apply(get_data_for_date, axis=1)

In [None]:
# Keep only the dates that produced a non-empty list of records.
df = df.loc[df["data_points"].notna()]
df = df.loc[df["data_points"].apply(len) > 0]

# One row per record instead of one row per date.
df = df.explode("data_points")

# Spread each record tuple over four named columns.
record_columns = ['wordsInMark', 'chineseCharacter', 'url', 'descrOfDevice']
df[record_columns] = df.apply(expand, axis=1, result_type="expand")

# The raw tuple column is no longer needed; renumber rows and persist.
df = df.drop(columns=["data_points"]).reset_index(drop=True)
df.to_csv("./trademark_data_p3.csv", index=False)

### Download images

In [None]:
# Directory that receives the downloaded images; created if it is missing.
base_path = Path("./images")
base_path.mkdir(parents=True, exist_ok=True)

In [None]:
def download_images(row):
    """Download the image at ``row["url"]`` into ``base_path``.

    Parameters
    ----------
    row : mapping
        Must provide ``row["url"]``; the last ``/``-separated segment of the
        URL is used as the local file name.

    Returns
    -------
    bool
        ``True`` if the file already exists locally or was downloaded and
        re-saved successfully; ``False`` on any failure (bad URL value, HTTP
        error, undecodable image, write error).

    Notes
    -----
    The image is written to a temporary sibling file and moved into place
    with ``os.replace`` only after the save succeeds. A failed save therefore
    never leaves a truncated file behind that a later run would mistake for a
    completed download.
    """
    tmp_path = None
    try:
        image_url = row["url"]
        file_name = image_url.split("/")[-1]
        if not file_name:
            # URL ends with "/": the target would be the directory itself,
            # which always "exists" and would be reported as downloaded.
            return False

        target_path = os.path.join(str(base_path), file_name)
        if os.path.exists(target_path):
            return True

        response = requests.get(image_url, timeout=10)
        response.raise_for_status()

        # Keep the original extension at the end so the image library can
        # still infer the output format; the pid keeps workers apart.
        tmp_path = os.path.join(str(base_path), f".{os.getpid()}.{file_name}")
        with Image.open(BytesIO(response.content)) as img:
            img.save(tmp_path, optimize=True, quality=70)
        os.replace(tmp_path, target_path)
        return True
    except Exception:
        # Best effort per row; clean up any half-written temporary file.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return False

In [None]:
# NOTE(review): this reads "./tradmark_data.csv", while the scraping cell in
# this notebook writes "./trademark_data_p3.csv" — confirm the input file
# name is intended (possibly a manually merged file) and not a typo.
trademark = pd.read_csv("./tradmark_data.csv")
trademark["downloaded"] = trademark.parallel_apply(download_images, axis=1)

# Keep only the rows whose image is available locally. The filter is computed
# once and the CSV is written without the index column, matching the other
# writes of "trademark_with_images.csv" in this notebook.
trademark = trademark.loc[trademark["downloaded"]]
trademark.to_csv("trademark_with_images.csv", index=False)

# Missing text fields become empty strings (new frame, no in-place mutation
# of a filtered slice).
trademark = trademark.fillna("")

### Create train/test datasets

In [None]:
def get_answer(row):
    """Serialise the three label fields of ``row`` into a JSON object string.

    The keys appear in the order ``wordsInMark``, ``chineseCharacter``,
    ``descrOfDevice``; any other fields of ``row`` are ignored.
    """
    label_keys = ("wordsInMark", "chineseCharacter", "descrOfDevice")
    return json.dumps({key: row[key] for key in label_keys})

def get_size(row):
    """Return the width of the image stored at ``row["image_path"]``.

    The image is opened, the first component of its ``size`` is read, and the
    handle is closed again before returning.
    """
    handle = Image.open(row["image_path"])
    try:
        return handle.size[0]
    finally:
        handle.close()

In [None]:
# Attach the JSON label string to every row, then persist the labelled table.
trademark = trademark.assign(answer=trademark.apply(get_answer, axis=1))
trademark.to_csv("trademark_with_images.csv", index=False)

In [None]:
# Reload the labelled table; empty strings come back from CSV as NaN, so the
# missing-value handling and the JSON labels are rebuilt here.
trademark = pd.read_csv("trademark_with_images.csv")
trademark = trademark.loc[trademark["url"].notna()]
trademark = trademark.fillna("")
trademark["answer"] = trademark.apply(get_answer, axis=1)

# Local path of each image: "./images/" + last segment of the URL.
trademark["image_path"] = trademark["url"].apply(lambda x: f"./images/{x.split('/')[-1]}")

# Exclude TIFF files by extension. The pattern is anchored to the end of the
# path and case-insensitive, so names that merely contain "tif" are kept and
# ".TIF" / ".tiff" files are excluded as well.
is_tiff = trademark["image_path"].str.contains(r"\.tiff?$", case=False, regex=True)
trademark = trademark.loc[~is_tiff]

trademark["img_width"] = trademark.parallel_apply(get_size, axis=1)

In [None]:
# Keep only images whose width is strictly below 2400.
narrow_enough = trademark["img_width"].lt(2400)
trademark = trademark.loc[narrow_enough]

In [None]:
# 80/20 split with a fixed seed so the partition is reproducible.
train, test = train_test_split(trademark, test_size=0.2, random_state=42)

# Persist both partitions and the full filtered table, all without the index.
for frame, csv_path in (
    (train, "trademark_train.csv"),
    (test, "trademark_test.csv"),
    (trademark, "trademark_with_images.csv"),
):
    frame.to_csv(csv_path, index=False)