In [3]:
# Environment setup for the notebook: two pinned OpenCV wheels (the code below
# uses cv.xfeatures2d, which presumably comes from the contrib wheel), the HTTP
# and HTML-parsing libraries, and system packages installed via apt.
# NOTE(review): requests and beautifulsoup4 are installed without a version pin.
! pip3 install opencv-python==3.4.2.17
! pip3 install opencv-contrib-python==3.4.2.17
! pip3 install requests 
! pip3 install beautifulsoup4
! apt install -y libsm6 libxext6 libxrender-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsm6 is already the newest version (2:1.2.2-1).
libxext6 is already the newest version (2:1.3.3-1).
libxrender-dev is already the newest version (1:0.9.10-1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [4]:
# Elasticsearch Python client used by the indexing and search cells.
# NOTE(review): the version is not pinned, and index_data() passes doc_type=,
# which newer client releases may no longer accept — consider pinning; TODO confirm.
! pip3 install elasticsearch



In [5]:
import http
import os
import pprint
import re
import urllib
import urllib.request

#opencv-contrib-python==3.4.2.17
#opencv-python==3.4.2.17
import cv2 as cv
import numpy as np
import requests
from bs4 import BeautifulSoup
from bs4 import element
from elasticsearch import Elasticsearch

In [6]:
def create_index(es,name):
    """Create the Elasticsearch index ``name`` unless it already exists.

    Args:
        es: Elasticsearch client; only ``es.indices.exists`` and
            ``es.indices.create`` are called.
        name: Name of the index to create.

    Returns:
        None. A confirmation line is printed when the creation response is
        acknowledged.
    """
    # Keyword arguments are accepted by every client generation, whereas
    # newer client releases accept keyword arguments only.
    if es.indices.exists(index=name):
        return

    response = es.indices.create(index=name)

    # The response does not necessarily echo the index name back, so fall
    # back to the requested name instead of failing with KeyError.
    if response.get("acknowledged"):
        print("to create index is successful : index name = '{}'".format(response.get("index", name)))

In [7]:
def index_data(es,index,data,doc_type,id):
    """Index one document through ``es.index`` and return the client's response.

    Args:
        es: Elasticsearch client.
        index: Target index name.
        data: Document body.
        doc_type: Document type forwarded to the client.
        id: Document id.
    """
    request = {
        "index": index,
        "doc_type": doc_type,
        "body": data,
        "id": id,
    }
    return es.index(**request)

In [8]:
# METHOD #1: OpenCV, NumPy, and urllib
def url_to_image(url):
    """Download ``url`` and decode the payload into an OpenCV colour image.

    Args:
        url: Address of the image to download.

    Returns:
        The result of ``cv.imdecode(..., cv.IMREAD_COLOR)`` for the downloaded
        bytes. NOTE(review): imdecode presumably yields None for data it
        cannot decode — callers should be prepared for that.
    """
    # The context manager closes the HTTP response even when reading fails.
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()

    # Expose the downloaded bytes as a flat uint8 array, the input form
    # cv.imdecode works on.
    encoded = np.frombuffer(payload, dtype=np.uint8)
    return cv.imdecode(encoded, cv.IMREAD_COLOR)

In [9]:
def img_url2hash(url):
    """Describe the image at ``url`` as a list of hex-encoded BRIEF descriptors.

    SIFT detects the keypoints, BRIEF computes one descriptor per keypoint,
    and the raw bytes of every descriptor row are rendered as a hex string.

    Args:
        url: Address of the image to download and describe.

    Returns:
        A list of hex strings, one per descriptor; empty when no descriptor
        could be computed.

    Raises:
        ValueError: If the downloaded data could not be decoded into an image.
    """
    img = url_to_image(url)
    if img is None:
        raise ValueError("could not decode an image from '{}'".format(url))

    detector = cv.xfeatures2d.SIFT_create()
    # BRIEF is used only as the descriptor extractor.
    extractor = cv.xfeatures2d.BriefDescriptorExtractor_create()

    keypoints = detector.detect(img, None)
    _, descriptors = extractor.compute(img, keypoints)

    # Without any describable keypoint there is no descriptor matrix to
    # iterate over.
    if descriptors is None:
        return []

    return [row.tobytes().hex() for row in descriptors]


# Display-category numbers (the value used for ``dispNo``) mapped to their names.
category_map = dict([
    (1011010000, "TV/영상가전"),
])

In [10]:
# Site root; relative product links are appended to it.
himark_url = 'http://www.e-himart.co.kr'
# Category listing endpoint; the display-category number goes right after it.
category_endpoint = himark_url + '/app/display/showDisplayCategory?dispNo='
# Page selector suffix; the page number is filled in with str.format.
page_counting_param = '#pageCount={}'

In [11]:
def add_title_info(title_div,product_info):
    """Store the cleaned product title under ``product_info['title']``.

    Every character other than ASCII letters, digits, Hangul, square
    brackets and spaces is replaced by a space, then the result is trimmed.

    Args:
        title_div: Element whose ``h2`` child carries the title text.
        product_info: Dict that is updated in place.
    """
    # Raw string: "\[" and "\]" are invalid escapes in a normal literal.
    # The range ends at the last Hangul syllable so that syllables near the
    # end of the block (for example "힘") are kept rather than blanked out.
    str_filter = re.compile(r'[^0-9a-zA-Zㄱ-힣\[\] ]')
    product_info['title'] = str_filter.sub(' ', title_div.h2.text).strip()

In [12]:
def add_promote_info(title_div,product_info):
    """Store the cleaned promotion text under ``product_info['promote']``.

    The promotion text is optional: when ``title_div`` has no ``div`` child,
    ``product_info`` is left untouched.

    Args:
        title_div: Element whose optional ``div`` child carries the text.
        product_info: Dict that is updated in place.
    """
    if not title_div.div:
        return

    # Raw string: "\[" and "\]" are invalid escapes in a normal literal.
    # The range ends at the last Hangul syllable so that syllables near the
    # end of the block (for example "힘") are kept rather than blanked out.
    str_filter = re.compile(r'[^0-9a-zA-Zㄱ-힣\[\] ]')
    product_info['promote'] = str_filter.sub(' ', title_div.div.text).strip()

In [13]:
def add_title_and_promote_info(soup, product_info):
    """Fill in the title and the optional promotion text of a product.

    Both values are read from the first ``div`` with class ``prdName``.

    Args:
        soup: Parsed product page.
        product_info: Dict that is updated in place.
    """
    selector = {"class": "prdName"}
    name_block = soup.find("div", attrs=selector)
    add_title_info(name_block, product_info)
    add_promote_info(name_block, product_info)

In [14]:
def add_model_name_info(soup,product_info):
    """Store the trimmed model name under ``product_info['model_name']``.

    The name is taken from the ``div`` with id ``divModelName`` when present,
    otherwise from the ``span`` with class ``foL``. When neither exists,
    ``product_info`` is left untouched.

    Args:
        soup: Parsed product page.
        product_info: Dict that is updated in place.
    """
    # Candidate locations in priority order; each one is searched only once.
    candidates = (
        ("div", {"id": "divModelName"}),
        ("span", {"class": "foL"}),
    )
    for tag_name, attrs in candidates:
        node = soup.find(tag_name, attrs=attrs)
        if node:
            product_info['model_name'] = node.text.strip()
            return

In [15]:
def add_price_info(soup,product_info):
    """Store the two prices shown in the page's price area.

    The first ``span.price`` inside ``li.priceArea`` becomes ``sale_price``
    and the second becomes ``advantage_price``; thousands separators are
    removed before the text is converted to ``int``.

    Args:
        soup: Parsed product page.
        product_info: Dict that is updated in place.
    """
    # Prices
    price_spans = soup.find("li", attrs={"class": "priceArea"}).find_all("span", attrs={"class": "price"})
    for position, key in enumerate(('sale_price', 'advantage_price')):
        digits = price_spans[position].text.replace(',', '')
        product_info[key] = int(digits)

In [16]:
def add_star_point(soup,product_info):
    """Store the star rating under ``product_info['star_point']`` as a float.

    The rating is optional: when the page has no ``div`` with class ``gmL``,
    ``product_info`` is left untouched.

    Args:
        soup: Parsed product page.
        product_info: Dict that is updated in place.
    """
    # Star rating is optional; look the element up once and reuse it.
    rating_node = soup.find("div", attrs={"class": "gmL"})
    if rating_node:
        product_info['star_point'] = float(rating_node.text)


In [17]:
def add_img_info(soup,product_info):
    """Store the main image link and its descriptor hashes.

    The ``src`` of the ``img`` with id ``imgGoodsBigImage`` is saved as
    ``img_link``; the hashes computed from that image are saved as
    ``img_hashs``.

    Args:
        soup: Parsed product page.
        product_info: Dict that is updated in place.
    """
    # Main product image
    big_image = soup.find("img", attrs={"id": "imgGoodsBigImage"})
    source_url = big_image["src"]
    product_info['img_link'] = source_url
    product_info['img_hashs'] = img_url2hash(source_url)

In [18]:
def add_product_info_by_soup(soup,product_id):
    """Scrape every supported field of a product page into a new dict.

    Args:
        soup: Parsed product page.
        product_id: Accepted for the caller's convenience; not used here.

    Returns:
        Dict filled by the title/promotion, model name, price, star rating
        and image helpers, in that order.
    """
    collected = dict()
    add_title_and_promote_info(soup, collected)
    add_model_name_info(soup, collected)
    add_price_info(soup, collected)
    add_star_point(soup, collected)
    add_img_info(soup, collected)
    return collected

In [19]:
def get_product_info_url(url, timeout=10):
    """Download a product page and scrape it into a dict.

    Args:
        url: Product page address; the product id is the value that follows
            the first ``=`` in it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The scraped product dict including ``product_id``, or None when the
        server does not answer with HTTP 200.

    Raises:
        IndexError: If ``url`` contains no ``=``.
        ValueError: If the extracted id is not an integer.
    """
    # Cut the id at the next "=", "&" or "#" so that extra query parameters
    # or a fragment after the id do not break the integer conversion.
    id_text = re.split('[=&#]', url.split('=', 1)[1], maxsplit=1)[0]
    product_id = int(id_text)

    # Without a timeout a stalled server would block the crawl indefinitely.
    req = requests.get(url, timeout=timeout)
    if req.status_code != http.HTTPStatus.OK:
        return None

    soup = BeautifulSoup(req.text, 'html.parser')
    product_info = add_product_info_by_soup(soup,product_id)
    product_info['product_id'] = product_id
    return product_info

In [20]:
def index_each_element(es,product_list):
    """Scrape and index every product linked from a listing element.

    Children that are not plain tags, or that lack a ``div`` containing an
    ``a``, are skipped. Each remaining link is resolved against
    ``himark_url``, scraped, stored in the ``himart`` index and the indexing
    response is printed.

    Args:
        es: Elasticsearch client.
        product_list: Iterable of child nodes of the listing element.
    """
    for entry in product_list:
        if type(entry) != element.Tag:
            continue
        if not (entry.div and entry.div.a):
            continue

        detail_url = himark_url+entry.div.a['href']
        scraped = get_product_info_url(detail_url)
        if not scraped:
            continue

        outcome = index_data(es,'himart',scraped,'product',scraped['product_id'])
        print(outcome)

In [21]:
def index_himart_by_page(es,url,page_num, timeout=10):
    """Download one category listing page and index the products it links to.

    Args:
        es: Elasticsearch client.
        url: Category listing address.
        page_num: Page number substituted into ``page_counting_param``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Nothing is indexed when the server does not answer with
        HTTP 200 or when the page has no ``ul`` with id ``productList``.
    """
    # NOTE(review): the page selector is appended as a URL fragment ("#...").
    # Fragments are normally not sent to the server, so confirm that
    # page_num really changes which listing is returned.
    req = requests.get(url+page_counting_param.format(page_num), timeout=timeout)
    if req.status_code != http.HTTPStatus.OK:
        return

    # Parse the HTML source with Python's built-in html.parser.
    soup = BeautifulSoup(req.text, 'html.parser')
    product_list = soup.find("ul", attrs={"id": "productList"})
    if product_list is None:
        # Iterating a missing listing would raise TypeError further down.
        print("no product list found : url = '{}'".format(url))
        return

    index_each_element(es,product_list)

In [24]:
# Connection settings may be overridden through the environment so that real
# credentials never have to be written into the notebook; the defaults keep
# the original local setup.
es = Elasticsearch(
    os.environ.get("ES_HOST", "localhost:9200"),
    http_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "changeme"),
    ),
)


In [None]:
# Crawl and index page 1 of the category whose dispNo is 1011010000.
index_himart_by_page(es, '{}{}'.format(category_endpoint, 1011010000), 1)

## 이미지 서치

In [31]:
def search_image(url, max_hashes=10):
    """Search the ``himart`` index for products whose image matches ``url``.

    The image is turned into descriptor hashes and the first ``max_hashes``
    of them are sent as one ``match`` query against ``img_hashs``. The
    module-level ``es`` client is used for the search.

    Args:
        url: Address of the query image.
        max_hashes: Number of leading hashes put into the query; ``None``
            uses all of them.

    Returns:
        The response of ``es.search``.
    """
    hash_list = img_url2hash(url)

    # Use the leading hashes. The earlier loop only ever added the single
    # hash at position 11 and produced an empty query for shorter lists.
    hash_query = " ".join(hash_list[:max_hashes])

    query_body = {
        'query':{
            'match':{
                "img_hashs": hash_query
            }
        }
    }

    return es.search(index="himart", body=query_body)

In [32]:
# Query the index with a sample product photo and show the raw hits.
res = search_image(
    'http://static2.e-himart.co.kr/contents/goods/00/01/65/04/32/0001650432__UN65NU7180FXKR__M_450_450.jpg'
)
print(res['hits']['hits'])

TypeError: 'int' object is not iterable