In [139]:
import eigen_fastText
from fastText import load_model
from jieba import Tokenizer

In [46]:
class Item(object):
    """
    Taobao product-title category classifier.

    Wraps a model loaded through ``eigen_fastText.load_model`` and applies the
    text clean-up the model input needs (punctuation removal, stop-character
    removal, jieba word segmentation) before prediction.

    Categories include (label strings kept verbatim; call
    ``self.model.get_labels()`` for the list the loaded model actually uses):
    ['__label__全身护理|口腔护理','__label__全身护理|洗发',
    
     '__label__内衣|保暖内衣', '__label__内衣|女式内裤','__label__内衣|家居', '__label__内衣|情趣内衣', '__label__内衣|文胸',
     '__label__内衣|男式内裤','__label__内衣|睡衣','__label__内衣|背心', '__label__内衣|连裤袜',
     
     '__label__女装|T恤','__label__女装|中老年装','__label__女装|休闲裤','__label__女装|半身裙', '__label__女装|卫衣',
     '__label__女装|大码装','__label__女装|大衣', '__label__女装|打底裤','__label__女装|牛仔裤', '__label__女装|羽绒服',
     '__label__女装|衬衫','__label__女装|西服', '__label__女装|连衣裙', '__label__女装|针织衫', '__label__女装|风衣', '__label__女装|马甲',
     
     '__label__女鞋|休闲鞋', '__label__女鞋|凉鞋', '__label__女鞋|单鞋', '__label__女鞋|女靴', '__label__女鞋|高跟鞋',
     
     '__label__护肤|化妆水/爽肤水', '__label__护肤|洁面', '__label__护肤|眼部护理', '__label__护肤|精华', '__label__护肤|面膜',
     '__label__护肤|面霜',
     
     '__label__男装|POLO衫', '__label__男装|T恤', '__label__男装|休闲裤', '__label__男装|卫衣', '__label__男装|夹克',
     '__label__男装|牛仔裤', '__label__男装|短裤', '__label__男装|羽绒服', '__label__男装|衬衫', '__label__男装|针织衫',
     
     '__label__男鞋|休闲鞋', '__label__男鞋|正装鞋',
     
     '__label__童装|功能鞋', '__label__童装|单鞋', '__label__童装|运动鞋',
     
     '__label__运动女鞋|训练鞋', '__label__运动女鞋|跑步鞋', '__label__运动男鞋|篮球鞋', '__label__运动男鞋|训练鞋', '__label__运动男鞋|跑步鞋',
     
     '__label__配饰|其他配件', '__label__配饰|腰带']
    """
    def __init__(self, model_path = None):
        """
        Load the classifier.

        :param model_path: path of the model file; when None the default
            '/data/share/model_clothes_others.bin' is used.
        """
        # Individual characters stripped from every title before segmentation.
        self.punct = set(u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐､﹒﹔﹕﹖﹗﹚/﹜﹞！），．：；？｜｝︴︶︸︺︼︾﹀﹂﹄﹏､～￠々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖（［｛￡￥〝︵︷︹︻︽︿﹁﹃\t﹙+﹛﹝（｛“‘-—_…''')
        # Stop "words" are matched per character, not per segmented word.
        self.stopwords = set('的和呢了吧')
        if model_path is None:
            model_path = '/data/share/model_clothes_others.bin'
        self.model = eigen_fastText.load_model(model_path)

    def preprocess(self, text):
        """
        Clean one title or a collection of titles.

        :param text: a str, or an iterable of str.
        :return: the cleaned, space-separated string for a str input,
            otherwise a list with one cleaned string per element.
        """
        if isinstance(text, str):
            return self.preprocess_str(text)
        return [self.preprocess_str(t) for t in text]

    def preprocess_str(self, text):
        """
        Run the full clean-up pipeline on a single string: punctuation
        removal, stop-character removal, then word segmentation.
        """
        text = self.rmpunct(text)
        text = self.rmstopwords(text)
        text = self.splitwords(text)
        return text

    def predict(self, text, k = 1, prob = True):
        """
        Predict the top-k labels for a title or a list of titles.

        :param text: a non-empty str, or a non-empty list of str.
        :param k: number of labels requested from the model per title.
        :param prob: when truthy, also return the probabilities.
        :return: for a str input ``(labels, probabilities)`` (or just
            ``labels`` when ``prob`` is falsy); for a list input the same
            structure with one entry per title.
        :raises ValueError: if ``text`` is None or empty.
        """
        # Check None explicitly: len(None) would raise TypeError instead of
        # the ValueError this guard is meant to produce.
        if text is None or len(text) == 0:
            raise ValueError(
            "Text should not be None or empty")
        if isinstance(text, list):
            labels = []
            probas = []
            for cleaned in self.preprocess(text):
                label, proba = self.model.predict(cleaned, k)
                labels.append(label)
                probas.append(proba)
            return (labels, probas) if prob else labels
        label, proba = self.model.predict(self.preprocess(text), k)
        return (label, proba) if prob else label

    def rmpunct(self, text):
        """
        Remove every character listed in ``self.punct`` from the string.
        """
        return ''.join(ch for ch in text if ch not in self.punct)

    def rmstopwords(self, text):
        """
        Remove every character listed in ``self.stopwords`` from the string.
        """
        return ''.join(ch for ch in text if ch not in self.stopwords)

    def splitwords(self, text):
        """
        Segment the string with jieba and join the pieces with single spaces.
        """
        # The notebook's import cell only pulls in `Tokenizer`, so the bare
        # name `jieba` was undefined here; import the module locally.
        import jieba
        return ' '.join(jieba.cut(text))
                        
                   

In [91]:
model = Item()
# Two sample listing titles: a car phone holder and a cotton-linen dress.
sample_titles = [
    '车载手机架汽车手机支架车用导航支撑架磁性吸盘式出风口万能通用',
    '茉莉和扶苏原创"墨渊棕色版"复古棉麻连衣裙森系高腰长裙',
]
print(model.predict(sample_titles, k=2))

([('__label__汽车用品', '__label__手机/手机配件'), ('__label__女装|连衣裙', '__label__女装|半身裙')], [array([1.00000811e+00, 1.11995787e-05]), array([1.00000906e+00, 1.10001511e-05])])


In [90]:
# Top-3 predictions for a men's knitwear title and a women's dress title.
model.predict(
    [
        '冬季情侣高领毛衣男韩版潮流个性宽松羊毛针织衫男士外套加厚线衣',
        '喜黛姿2018新款女装春装小香风名媛气质显瘦修身蕾丝连衣裙a字裙',
    ],
    k=3,
)

([('__label__男装|针织衫', '__label__男装|卫衣', '__label__内衣|保暖内衣'),
  ('__label__女装|连衣裙', '__label__女装|半身裙', '__label__女装|衬衫')],
 [array([9.99980569e-01, 2.11532370e-05, 1.68814859e-05]),
  array([9.99753296e-01, 2.45982694e-04, 2.06450422e-05])])

In [89]:
# Display the label strings exposed by the wrapped model object.
model.model.get_labels()

['__label__女装|连衣裙',
 '__label__女装|衬衫',
 '__label__运动男鞋|训练鞋',
 '__label__女装|针织衫',
 '__label__女鞋|单鞋',
 '__label__运动女鞋|训练鞋',
 '__label__女装|T恤',
 '__label__童装|运动鞋',
 '__label__男装|衬衫',
 '__label__童装|功能鞋',
 '__label__童装|单鞋',
 '__label__女装|大衣',
 '__label__内衣|文胸',
 '__label__女装|半身裙',
 '__label__运动男鞋|跑步鞋',
 '__label__女装|休闲裤',
 '__label__男鞋|休闲鞋',
 '__label__男装|夹克',
 '__label__男装|T恤',
 '__label__女装|牛仔裤',
 '__label__女装|风衣',
 '__label__男装|休闲裤',
 '__label__护肤|面膜',
 '__label__内衣|女式内裤',
 '__label__男装|针织衫',
 '__label__女装|卫衣',
 '__label__箱包',
 '__label__数码',
 '__label__游戏',
 '__label__运动户外',
 '__label__珠宝钟表',
 '__label__汽车用品',
 '__label__美食特产',
 '__label__医疗保健',
 '__label__电脑/办公',
 '__label__家用电器',
 '__label__家居家纺',
 '__label__手机/手机配件',
 '__label__家具建材',
 '__label__日用百货',
 '__label__文化娱乐',
 '__label__母婴用品',
 '__label__运动男鞋|篮球鞋',
 '__label__女鞋|女靴',
 '__label__男装|牛仔裤',
 '__label__运动女鞋|跑步鞋',
 '__label__内衣|睡衣',
 '__label__男鞋|正装鞋',
 '__label__全身护理|口腔护理',
 '__label__男装|卫衣',
 '__label__内衣|保暖内衣',
 '__label__女装|

In [67]:
# Pair each query's label tuple with its probability array.
# NOTE(review): `labels` and `probs` are not assigned in any visible cell; this
# relies on leftover kernel state and will fail after Restart & Run All.
list(zip(labels, probs))

[(('__label__游戏', '__label__文化娱乐', '__label__电脑/办公'),
  array([9.95380580e-01, 3.41625558e-03, 7.60550553e-04])),
 (('__label__男装|T恤', '__label__运动户外', '__label__男装|卫衣'),
  array([9.99994874e-01, 2.47645767e-05, 1.02627673e-05]))]

In [69]:
 # For each query index, print its (label, probability) pairs.
 for i, (label, prob) in enumerate(zip(labels, probs)):
     pairs = list(zip(label, prob))
     print(i, pairs)

0 [('__label__游戏', 0.9953805804252625), ('__label__文化娱乐', 0.003416255582123995), ('__label__电脑/办公', 0.0007605505525134504)]
1 [('__label__男装|T恤', 0.9999948740005493), ('__label__运动户外', 2.4764576664892957e-05), ('__label__男装|卫衣', 1.0262767318636179e-05)]


In [84]:
# Toy check: max() with a key function returns the dict holding the largest 'prob'.
predict = [
    {'prob' : 1, 'name': 'zhuyuhe'},
    {'prob': 2, 'name': 'shenjiajia'},
]
max(predict, key=lambda entry: entry['prob'])

{'name': 'shenjiajia', 'prob': 2}

In [140]:
class TaobaoCommodityCategory():
    """
    Service-style wrapper that classifies Taobao listing titles.

    Loads a model through ``eigen_fastText.load_model``, cleans incoming
    titles (punctuation removal, stop-character removal, jieba segmentation)
    and returns the model's top-k labels as plain dictionaries.
    """
    def __init__(self, config):
        """
        :param config: mapping with an optional 'model_path' entry; None or an
            empty mapping selects '/data/share/model_clothes_others.bin'.
        """
        config = config or {}
        model_path = config.get('model_path', '/data/share/model_clothes_others.bin')
        self.model = eigen_fastText.load_model(model_path)
        labels = self.model.get_labels()
        # Number of labels known to the model.
        self.k = len(labels)
        # Label string <-> integer id, ids following the order of get_labels().
        self.category2id = {name:i for i, name in enumerate(labels)}
        self.id2category = {v:k for k, v in self.category2id.items()}
        # Individual characters stripped from every title before segmentation.
        self.punct = set(u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐､﹒﹔﹕﹖﹗﹚/﹜﹞！），．：；？｜｝︴︶︸︺︼︾﹀﹂﹄﹏､～￠々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖（［｛￡￥〝︵︷︹︻︽︿﹁﹃\t﹙+﹛﹝（｛“‘-—_…''')
        # Stop "words" are matched per character, not per segmented word.
        self.stopwords = set('的和呢了吧')
        # Created on first use by _get_tokenizer().
        self._tokenizer = None

    def _get_tokenizer(self):
        """Return this instance's jieba Tokenizer, creating it on first use."""
        tokenizer = getattr(self, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = self._tokenizer = Tokenizer()
        return tokenizer

    def preprocess(self, text):
        """
        Clean one title or a sequence of titles.

        :param text: a str, or a list/tuple of items.
        :return: the cleaned string for a str input; a list with one cleaned
            entry per element for a list/tuple; an empty list for any other
            input type.
        """
        if isinstance(text, str):
            return self.preprocess_str(text)
        if isinstance(text, (list, tuple)):
            return [self.preprocess_str(t) for t in text]
        return []

    def preprocess_str(self, text):
        """
        Run the clean-up pipeline on a single item: punctuation removal,
        stop-character removal, then word segmentation.
        """
        text = self.rmpunct(text)
        text = self.rmstopwords(text)
        text = self.splitwords(text)
        return text

    def rmpunct(self, text):
        """
        Remove every character listed in ``self.punct``; non-str input is
        returned unchanged.
        """
        if not isinstance(text, str):
            return text
        return ''.join(ch for ch in text if ch not in self.punct)

    def rmstopwords(self, text):
        """
        Remove every character listed in ``self.stopwords``; non-str input is
        returned unchanged.
        """
        if not isinstance(text, str):
            return text
        return ''.join(ch for ch in text if ch not in self.stopwords)

    def splitwords(self, text):
        """
        Segment the string with jieba and join the pieces with single spaces;
        non-str input is returned unchanged.
        """
        if not isinstance(text, str):
            return text
        # Reuse one Tokenizer: building a new one per call makes every call
        # pay for loading the segmentation dictionary again.
        return ' '.join(self._get_tokenizer().cut(text))

    def classification(self, query, k = 3, version = 0, **kwargs):
        """
        Predict the top-k categories for a title or a list of titles.

        :param query: a str, or a list of str.
        :param k: number of labels requested from the model per title.
        :param version: value echoed back in each result's 'version' field.
        :param kwargs: accepted and ignored.
        :return: None for an empty/falsy query. Otherwise a dict
            ``{"query", "version", "predict"}`` for a single title, or a list
            of such dicts for a list input. Each 'predict' entry holds
            'category' (integer id), 'prob', and 'name' (the label with
            '__label__' removed and '|' replaced by '_').
        """
        if not query:
            return None

        is_list = isinstance(query, list)
        queries = query if is_list else [query]

        tokenize_query = self.preprocess(queries)
        labels, probs = self.model.predict(tokenize_query, k)

        res = []
        for raw_query, label, prob in zip(queries, labels, probs):
            predict = [
                {
                    "category": self.category2id[name],
                    "prob": p,
                    "name": name.replace('__label__', '').replace('|', '_'),
                }
                for name, p in zip(label, prob)
            ]
            res.append({"query": raw_query, "version": version, "predict": predict})

        return res if is_list else res[0]
        

In [141]:
config = {'model_path': "/data/share/model_clothes_others_0209.bin"}
model = TaobaoCommodityCategory(config)

# Sanity check: the highest-probability prediction for a dress title
# should be the women's dress category.
query1 = "喜黛姿2018新款女装春装小香风名媛气质显瘦修身蕾丝连衣裙a字裙"
predict1 = model.classification(query1)
top1 = max(predict1['predict'], key = lambda entry: entry['prob'])
top1['name'] == '女装_连衣裙'

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.688 seconds.
DEBUG:jieba:Loading model cost 0.688 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


True

In [116]:
# Classify a men's knitwear listing title with the default k and version.
query2 = "冬季情侣高领毛衣男韩版潮流个性宽松羊毛针织衫男士外套加厚线衣"
predict2 = model.classification(query2)
# Bare last expression so the notebook renders the result dict.
predict2

{'predict': [{'category': 24, 'name': '男装_针织衫', 'prob': 0.9999386072158813},
  {'category': 17, 'name': '男装_夹克', 'prob': 3.9851820474723354e-05},
  {'category': 3, 'name': '女装_针织衫', 'prob': 2.735344423854258e-05}],
 'query': '冬季情侣高领毛衣男韩版潮流个性宽松羊毛针织衫男士外套加厚线衣',
 'version': 0}

In [118]:
# print() already converts its argument with str(); shows how the object prints.
print(model)

<__main__.TaobaoCommodityCategory object at 0x7f655d393a20>


In [131]:
# Break an OSS URI into its '/'-separated parts.
oss_uri = 'oss://modelzoo/fashion/model_sku.bin'
items = oss_uri.split('/')

In [132]:
# Skip the first three '/'-separated parts (scheme, empty segment and,
# presumably, the bucket name) and re-join the remainder.
prefix = '/'.join(items[3:])

In [133]:
# Display the re-joined path.
prefix

'fashion/model_sku.bin'

In [134]:
# Call the wrapped model directly on an already space-separated title,
# skipping the class's own preprocessing.
pretokenized_title = '喜黛姿 20 18 新款 女装 春装 小香风 名媛气质 显瘦 修身 蕾丝连衣裙 a字裙'
model.model.predict(pretokenized_title, 3)

(('__label__女装|牛仔裤', '__label__女装|休闲裤', '__label__女装|西服'),
 array([0.20534591, 0.15200402, 0.1284312 ]))

In [None]:
['喜黛姿 20 18 新款 女装 春装 小香风 名媛气质 显瘦 修身 蕾丝连衣裙 a字裙']
ENV: 调试环境
['冬季 情侣 高领毛衣 男 韩版 潮流 个性 宽松 羊毛 针织衫 男士 外套 加厚 线衣']
ENV: 调试环境
['车载 手机 架 汽车 手机 支架 车用 导航 支撑架 磁性 吸盘式 出风口 万能 通用']
ENV: 调试环境

In [None]:
['喜黛姿 2018 新款 女装 春装 小 香风 名媛 气质 显瘦 修身 蕾丝 连衣裙 a 字 裙']
ENV: 调试环境
['冬季 情侣 高领 毛衣 男 韩版 潮流 个性 宽松 羊毛 针织衫 男士 外套 加厚 线衣']
ENV: 调试环境
['车载 手机 架 汽车 手机 支架 车用 导航 支撑架 磁性 吸盘式 出风口 万能 通用']
ENV: 调试环境

In [143]:
import requests
test = '喜黛姿2018新款女装春装小香风名媛气质显瘦修身蕾丝连衣裙a字裙'
# Send the same title to the remote classification endpoint and show its JSON reply.
# requests applies no timeout by default, so set one to keep the cell from
# hanging indefinitely if the service does not answer.
requests.post(
    "https://alpha-surreal.aidigger.com/api/v1/classification/category/sku/0",
    json={"query": test},
    timeout=10,
).json()

{'predict': [{'intent': 0, 'name': '女装_连衣裙', 'prob': 0.9996693134307861},
  {'intent': 12, 'name': '女装_半身裙', 'prob': 0.0003374156658537686},
  {'intent': 1, 'name': '女装_衬衫', 'prob': 1.5870700735831633e-05}],
 'query': '喜黛姿2018新款女装春装小香风名媛气质显瘦修身蕾丝连衣裙a字裙',
 'version': '0'}