In [30]:
import sys
import json
import fastavro  # 0.17.9
import os
import pandas as pd
from multiprocessing import Pool
import logging
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Register the lv1-lv3 model directories with the SparkContext so they are
# shipped with the job; directories require recursive=True.
for model_dir in (
    "data/model/fake/lv1/",
    "data/model/fake/lv2/",
    "data/model/fake/lv3/",
):
    sc.addFile(model_dir, recursive=True)

# Register the two dictionary files (plain files, so no recursion needed).
for dict_file in (
    "data/dictionary/dict_token.txt",
    "data/dictionary/dict_garbage.txt",
):
    sc.addFile(dict_file)

In [31]:
from fasttext_pyspark import const, predict

# Configure the root logger with the message/date formats taken from the
# project's `const` module.
FORMAT = const.LOG_MSG_FORMAT
logging.basicConfig(format=FORMAT, datefmt=const.LOG_DATE_FORMAT)

# Logger for this notebook; its threshold is set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Placeholders for the three model levels; nothing in this cell loads them.
model_lv1 = model_lv2 = model_lv3 = None

In [32]:
from fasttext_pyspark.predict import predict

# Columns copied verbatim from each Avro record, paired with the Spark type
# used for them. Keeping name and type together means the row extraction and
# the schema below cannot drift apart.
AVRO_COLUMNS = [
    ("itemId", IntegerType()),
    ("productId", IntegerType()),
    ("categoryCode", StringType()),
    ("originalAttr", StringType()),
    ("normalizedAttr", StringType()),
    ("excludeType", StringType()),
    ("categoryCodeLv1", StringType()),
    ("categoryNameLv1", StringType()),
]

# Reset the name first so that a failed load cannot leave a DataFrame from an
# earlier run of this cell bound to it.
dataFrame = None

# Read every record of the Avro part file into plain Python rows. Each row
# holds the columns above plus a trailing constant 0 for `isCleansed`.
with open(const.get_cqi_input_file_path() + 'part-00000-ef5244bc-86ef-49a1-91e0-cc95a8f5cd95-c000.avro', 'rb') as fo:
    reader = fastavro.reader(fo)
    lst = [
        [record[name] for name, _ in AVRO_COLUMNS] + [0]
        for record in reader
    ]

# The file is fully consumed at this point, so the schema and the DataFrame
# are built after the handle has been closed.
cSchema = StructType(
    [StructField(name, dtype) for name, dtype in AVRO_COLUMNS]
    + [StructField("isCleansed", IntegerType())]
)
dataFrame = spark.createDataFrame(lst, cSchema)

dataFrame.show()

+-------+---------+------------+--------------------+--------------------+-----------+---------------+---------------+----------+
| itemId|productId|categoryCode|        originalAttr|      normalizedAttr|excludeType|categoryCodeLv1|categoryNameLv1|isCleansed|
+-------+---------+------------+--------------------+--------------------+-----------+---------------+---------------+----------+
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|
|9096629|    11938|       63901|녹스 자연이좋은 3겹데코 롤화장...|녹스 자연이좋은 3겹데코 롤화장...|     RETAIL|          63897|           생활용품|         0|
|9096629|    11938|       63901|녹스 자연이좋은 3겹데코 롤화장...|녹스 자연이좋은 3겹데코 롤화장...|     RETAIL|    

In [33]:
# Return type of the prediction UDF: a struct with two string fields.
# NOTE(review): the recorded output of this cell shows `ml_lv1_score` as null
# on every visible row. Possibly the second value returned by `predict` does
# not match the declared StringType, or `predict` returns None for it --
# confirm against fasttext_pyspark.predict.
schema = (
    StructType()
    .add('ml_lv1_catecode', StringType())
    .add('ml_lv1_score', StringType())
)

# Wrap `predict` as a Spark UDF and apply it to three columns of each row,
# storing the returned struct in a new `finalresult` column.
udf_predict = udf(predict, returnType=schema)
udf_predict_result = dataFrame.withColumn(
    "finalresult",
    udf_predict('normalizedAttr', 'productId', 'itemId'),
)
udf_predict_result.show()

+-------+---------+------------+--------------------+--------------------+-----------+---------------+---------------+----------+------------+
| itemId|productId|categoryCode|        originalAttr|      normalizedAttr|excludeType|categoryCodeLv1|categoryNameLv1|isCleansed| finalresult|
+-------+---------+------------+--------------------+--------------------+-----------+---------------+---------------+----------+------------+
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|[80285,null]|
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|[80285,null]|
|  33307|    11276|       65872|메라독 퓨어 구디스낵 애견간식 ...|메라독 퓨어 구디스낵 애견간식 ...|     RETAIL|          65799|        반려/애완용품|         0|[80285,null]|
|9096629|    11938|       63901|녹스 자연이좋은 3겹데코 롤화장...|녹스 자연이좋은 3겹데코 롤화장...|     RETAIL|          63897|           생활용품|         0|[69182,null]|