In [8]:
! pip install openpyxl gensim fasttext lightgbm multipledispatch razdel nmslib

In [9]:
# Folder with the input files required to train the model.
path = "/kaggle/input/hacksai-3/"
# Folder that later cells write the training corpora and trained fastText models to.
temp = "/kaggle/working/"

# Mapping from the column headers in the source file to the column names used in the code.
columns = dict(
    zip(
        ('Коды ТН ВЭД ЕАЭС', 'Группа продукции', 'Общее наименование продукции'),
        ('ved_code', 'product_group', 'product_name'),
    )
)

In [10]:
import pandas as pd
import numpy as np
import fasttext
import razdel
import os

import fasttext
from gensim.models import FastText

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, average_precision_score
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
import joblib

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [11]:
# Read the raw dataset, switch to the short column names from `columns`,
# drop rows without a product name and keep only the three columns used below.
data = (
    pd.read_excel(os.path.join(path, 'dataset.xlsx'))
    .rename(columns=columns)
    .dropna(subset=['product_name'])
    [['ved_code', 'product_group', 'product_name']]
)

Предобработка текстов

In [12]:
import re
# Load the stop-word list (one entry per line) used by delete_stopwords.
# The file is opened in a `with` block so the handle is closed deterministically,
# and decoded explicitly as UTF-8 instead of relying on the platform default.
with open(os.path.join(path, 'stopwords-ru.txt'), 'r', encoding='utf-8') as stopwords_file:
    russian_stopwords = stopwords_file.read().split('\n')

def delete_stopwords(s):
    """Split `s` into words and drop those found in `russian_stopwords`.

    Parentheses, whitespace characters and literal '+' signs are first replaced
    with spaces (inside the character class the '+' is a plain character, not a
    quantifier). The stop-word comparison is done on the lower-cased word, but
    the words are returned with their original casing.

    Returns:
        list[str]: the remaining words, in their original order.
    """
    spaced = re.sub(r'[()\s+]', u' ', s)
    kept_words = []
    for word in spaced.split():
        if word.lower() in russian_stopwords:
            continue
        kept_words.append(word)
    return kept_words

# Characters (in addition to whitespace) that delete_punctuation replaces with a space.
_PUNCTUATION_SYMBOLS = [
    '\t', '!', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '\\', '®',
    '/', '~', '«', '\xad', '¯', '°', '`', '±', '²', '³', '·', 'º', '»', ':', ';', '<', '=', '?', '@',
    'É', 'Ó', 'Ö', '×', 'Ø', 'Ü', 'ä', 'é', 'ö', '÷', 'İ', 'Š', '˂', '˚', '̆', 'Ι', 'Λ', '[', '\\', ']', '_', '`',
    '\u200e', '‐', '–', '—', '‘', '’', '“', '”', '•', '…', '‧', '⁰', '₂', '℃', '№', '™',
    'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', '↑', '−', '∞', '≤', '\uf0d2', '️', '（', '）', '，',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
]

# Each symbol is escaped on its own before being placed in the character class.
# Without escaping, a listed backslash acts as an escape prefix for the next symbol
# (so backslashes themselves were never matched) and '-' only worked because its
# neighbours happened to form a valid range.
_PUNCTUATION_RE = re.compile(
    '[' + ''.join(re.escape(symbol) for symbol in _PUNCTUATION_SYMBOLS) + r'\s]'
)


def delete_punctuation(s):
    """Replace every punctuation mark, special symbol, digit and whitespace character with a space.

    Each matching character becomes exactly one space; runs are not collapsed,
    so the result has the same length as the input.

    Args:
        s (str): text to clean.

    Returns:
        str: `s` with all characters from `_PUNCTUATION_SYMBOLS` and all whitespace
        characters replaced by ' '.
    """
    return _PUNCTUATION_RE.sub(u' ', s)

def tokenize_with_razdel(text):
    """Return the `.text` of every token that `razdel.tokenize` yields for `text`, as a list."""
    return [piece.text for piece in razdel.tokenize(text)]

In [13]:
# Cleaned product name: lower-case, strip punctuation/digits, drop stop words,
# then glue the remaining words back together with single spaces.
data['clean_product_name'] = (
    data['product_name']
    .str.lower()
    .apply(lambda name: ' '.join(delete_stopwords(delete_punctuation(name))))
)

Подгрузка словаря по ved

In [None]:
# Load the VED dictionary from the input folder; the CSV is semicolon-separated.
ved_dict = pd.read_csv(os.path.join(path, 'ved_dict.csv'), sep=';')

In [None]:
# Build one text corpus: the NAIM1..NAIM4 columns of the VED dictionary and the
# product groups are cleaned the same way as the product names, and the already
# cleaned product names are appended at the end.
all_text_that_we_have = pd.DataFrame()
all_text_that_we_have['clean_text'] = [
    ' '.join(delete_stopwords(delete_punctuation(raw_text)))
    for text_column in (
        ved_dict['NAIM1'],
        ved_dict['NAIM2'],
        ved_dict['NAIM3'],
        ved_dict['NAIM4'],
        data['product_group'],
    )
    for raw_text in text_column.dropna().str.lower()
] + data['clean_product_name'].tolist()

all_text_that_we_have = all_text_that_we_have.dropna()

In [None]:
# Preview the first rows of the assembled corpus.
all_text_that_we_have['clean_text'].head()

## Обучение fasttext

In [None]:
import fasttext
from gensim.models import FastText

version1

In [None]:
%%time
# Train fastText v1 on the full corpus (VED dictionary + product groups + product names).
# The corpus is written to the same path under `temp` that the trainer reads from
# (previously it was written relative to the current directory), and the file is
# closed before training starts so that all data is flushed to disk.
corpus_path_v1 = os.path.join(temp, 'all_text.txt')
with open(corpus_path_v1, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write('\n'.join(all_text_that_we_have['clean_text'].tolist()))
fb_model_v1 = fasttext.train_unsupervised(corpus_path_v1,
                                       dim=150, ws=5, minCount=5, neg=15,
                                       minn=3, maxn=15, wordNgrams=2, lr=0.15, epoch=35, loss='hs',
                                       model='skipgram', verbose=3)
fb_model_v1.save_model(os.path.join(temp, 'fb_model_v1.bin'))

version2

In [None]:
%%time
# Train fastText v2 on the cleaned product names only.
# The corpus file is closed before training so that all data is flushed to disk;
# the path is built once instead of through a redundant nested os.path.join.
corpus_path_v2 = os.path.join(temp, 'all_text_v2.txt')
with open(corpus_path_v2, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write('\n'.join(data['clean_product_name'].tolist()))
fb_model_v2 = fasttext.train_unsupervised(corpus_path_v2,
                                       dim=150, ws=5, minCount=5, neg=15,
                                       minn=3, maxn=15, wordNgrams=2, lr=0.15, epoch=25, loss='hs',
                                       model='skipgram', verbose=3)
fb_model_v2.save_model(os.path.join(temp, 'fb_model_v2.bin'))

In [None]:
# Load the v2 fastText model from the input folder (`path`), not from `temp` where the
# training cell saves it.
# NOTE(review): presumably fb_model_v2.bin from an earlier run was uploaded with the input data — confirm.
ft_model_v2 = fasttext.load_model(os.path.join(path, 'fb_model_v2.bin'))

version3

In [None]:
# Train fastText v3 on the raw (uncleaned) product names.
# The corpus file is closed before training so that all data is flushed to disk;
# the path is built once instead of through a redundant nested os.path.join.
corpus_path_v3 = os.path.join(temp, 'all_text_v3.txt')
with open(corpus_path_v3, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write('\n'.join(data['product_name'].tolist()))
fb_model_v3 = fasttext.train_unsupervised(corpus_path_v3,
                                       dim=150, ws=5, minCount=5, neg=15,
                                       minn=3, maxn=15, wordNgrams=2, lr=0.15, epoch=25, loss='hs',
                                       model='skipgram', verbose=3)
fb_model_v3.save_model(os.path.join(temp, 'fb_model_v3.bin'))

## Готовим датасет и целевую переменную ved1

Берем все строчки без дублей и пропусков в столбце Коды ТН ВЭД ЕАЭС

In [14]:
# Keep only rows that have both a VED code and a cleaned product name.
# De-duplication is intentionally disabled (the trailing call is commented out).
data = data.dropna(subset=['ved_code', 'clean_product_name'])#.drop_duplicates()

In [15]:
# Drop rows whose VED code or cleaned product name is an empty string.
data = data[(data['ved_code'] != '') & (data['clean_product_name'] != '')]

Превращаем одну строку с несколькими кодами в несколько строк с одним кодом

In [16]:
# Split cells holding several "; "-separated codes into lists, drop repeated codes
# within a cell, then emit one row per code.
# dict.fromkeys removes duplicates while keeping the original order. list(set(...))
# would order string codes by their hash, which is randomized per interpreter run,
# so the row order (and every seeded split made from it later) would not be reproducible.
data['ved_code'] = (
    data['ved_code']
    .fillna('')
    .astype(str)
    .str.split("; ")
    .apply(lambda codes: list(dict.fromkeys(codes)))
)
data = data.explode(column='ved_code')

In [17]:
# Remove rows whose code is an empty string after the split/explode step.
data = data[data['ved_code']!='']



GRUPPA(2) + TOV_POZ(2) + SUB_POZ(6)

In [18]:
# Row/column count before the digits-only filter.
data.shape

In [19]:
# Keep only codes that consist solely of digits, then show the new shape.
data = data[data['ved_code'].str.isdigit()]
data.shape

In [20]:
# Cut the code into its three parts: first two characters, next two, and the rest.
data['ved1'] = data['ved_code'].str.slice(stop=2)
data['ved2'] = data['ved_code'].str.slice(start=2, stop=4)
data['ved3'] = data['ved_code'].str.slice(start=4)

In [21]:
# Keep only rows where none of the three code parts is an empty string.
data = data[(data[['ved1', 'ved2', 'ved3']] != '').all(axis=1)]

In [22]:
# Preview the first three rows with the new ved1/ved2/ved3 columns.
data.head(3)

In [23]:
# Frequency tables for each code part, most frequent value first.
# NOTE(review): get_ved_by_n reads an 'index' column from a frame like these;
# value_counts().reset_index() only produces a column with that name on pandas < 2.0 — confirm the installed version.
cnt_ved1 = data['ved1'].value_counts().reset_index()
cnt_ved2 = data['ved2'].value_counts().reset_index()
cnt_ved3 = data['ved3'].value_counts().reset_index()

## ved1

In [24]:
from sklearn.model_selection import train_test_split
import numpy as np
from lightgbm import LGBMClassifier
import joblib

In [None]:
# Hold out 30% of the rows for testing, then carve 10% of the remaining
# training rows off as a validation set (both splits seeded with 42).
train_ved1, test_ved1 = train_test_split(data[['clean_product_name', 'ved1']], test_size=0.3, random_state=42)
train_ved1, val_ved1 = train_test_split(train_ved1[['clean_product_name', 'ved1']], test_size=0.10, random_state=42)

In [None]:
%%time
# Embed every cleaned product name with the v2 fastText model, one matrix per split.
train_vectors, val_vectors, test_vectors = (
    np.array([ft_model_v2.get_sentence_vector(text) for text in split_frame['clean_product_name']])
    for split_frame in (train_ved1, val_ved1, test_ved1)
)

In [None]:
# Hyper-parameters for the LightGBM classifier that predicts ved1 from sentence vectors.
lgbm_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'learning_rate': 0.02,
    'n_jobs': 7,
    'random_state':42,
    'first_metric_only':True,
    'is_unbalance':True
}

# Fit on the training vectors, monitoring the validation split and stopping
# once the score has not improved for 15 rounds.
# NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords are not accepted by
# newer LightGBM releases (callbacks are used instead) — confirm against the installed version.
model = LGBMClassifier(**lgbm_params)
model.fit(train_vectors, train_ved1['ved1'], verbose=True, eval_set=(val_vectors, val_ved1['ved1']),  early_stopping_rounds = 15)

In [None]:
from sklearn import metrics

# Predict ved1 for the held-out test vectors and print per-class precision/recall/F1.
y_predicted = model.predict(test_vectors)
print(metrics.classification_report(test_ved1['ved1'], y_predicted))

## ML не катит, попробуем поиск похожих для ved1, ved2, ved3

Сплитим на train и тест

In [None]:
# Preview the frame before building the nearest-neighbour baseline.
data.head()

In [71]:
# Keep rows where all three code parts are non-empty and ved3 is exactly six characters long.
data = data[
    (data[['ved1', 'ved2', 'ved3']] != '').all(axis=1)
    & (data['ved3'].str.len() == 6)
]

In [73]:
import gensim
import fasttext
# Renumber the rows 0..n-1 and expose that numbering as an 'index' column;
# it is used below as the id of each vector in the nearest-neighbour index.
data = data.reset_index(drop=True).reset_index()


# Reload the v2 fastText model from the input folder and split 80/20 (seed 42).
ft_model_v2 = fasttext.load_model(os.path.join(path, 'fb_model_v2.bin'))
train, test = train_test_split(data[['clean_product_name', 'ved1', 'ved2', 'ved3', 'index']], test_size=0.20, random_state=42)
# The whole training split is indexed; de-duplication by name is disabled.
indexed_data = train#.drop_duplicates(subset = ['clean_product_name'])
indexed_data_vectors  = np.array([ft_model_v2.get_sentence_vector(text) for text in indexed_data['clean_product_name']])
test_vectors = np.array([ft_model_v2.get_sentence_vector(text) for text in test['clean_product_name']])

# Row id -> [ved1, ved2, ved3] of the indexed training row.
indexed_data_dict = dict(zip(indexed_data['index'].values, indexed_data[['ved1', 'ved2', 'ved3']].values.tolist()))
len(indexed_data_vectors), len(test_vectors)

Строим индекс по трейну

In [74]:
import nmslib
import numpy

from statistics import mode

In [75]:
# Initialize a new NMSLIB index with method 'napp' (not HNSW) in the cosine-similarity
# space, add the training vectors under their row ids, and build it.
index = nmslib.init(method='napp', space='cosinesimil')
index.addDataPointBatch(data=indexed_data_vectors[:], ids=indexed_data['index'].values)
index.createIndex(print_progress=True)

In [76]:
def get_metrics(test_1):
    """Print micro- and weighted-averaged F1 scores for ved1, ved2 and ved3.

    Args:
        test_1: mapping/frame holding the true labels under 'ved1', 'ved2', 'ved3'
            and the corresponding predictions under the integer keys 0, 1, 2.
    """
    for prediction_key, level in enumerate(('ved1', 'ved2', 'ved3')):
        print('-'*10, level)
        for averaging in ('micro', 'weighted'):
            score = metrics.f1_score(test_1[level], test_1[prediction_key], average=averaging)
            print(averaging + ':', score)

def get_ved_by_n(neighbours, ved_cnt, indexed_data_dict):
    """Pick one label for a query from the labels of its nearest neighbours.

    The label that occurs strictly most often among the neighbours wins. When
    there is no single most frequent label, the first row of `ved_cnt` whose
    'index' value is one of the neighbours' labels is returned instead.

    The tie is detected explicitly rather than by catching an exception from
    `statistics.mode`: since Python 3.8 `mode` no longer raises on ties (it
    returns the first mode seen), which made the `ved_cnt` fallback unreachable,
    and the bare `except:` also hid unrelated errors.

    Args:
        neighbours: iterable of neighbour ids (keys of `indexed_data_dict`).
        ved_cnt: frame with an 'index' column of labels; the row order decides ties.
            Presumably one of the value-count tables (most frequent first) — confirm at the call site.
        indexed_data_dict: mapping from neighbour id to its (hashable) label.

    Returns:
        The selected label.

    Raises:
        KeyError: a neighbour id is missing from `indexed_data_dict`.
        IndexError: no label could be selected (e.g. `neighbours` is empty or no
            neighbour label appears in `ved_cnt`).
    """
    from collections import Counter

    labels = [indexed_data_dict[xi] for xi in neighbours]
    label_counts = Counter(labels)
    if label_counts:
        best_count = max(label_counts.values())
        leaders = [label for label, count in label_counts.items() if count == best_count]
        if len(leaders) == 1:
            return leaders[0]
    # Tie (or nothing to vote on): defer to the order of the frequency table.
    return ved_cnt[ved_cnt['index'].isin(labels)].head(1)['index'].tolist()[0]
      


Считаем 1 соседа для каждого примера из теста

In [77]:
%%time
# Query the single nearest training vector for every test vector.
neighbours = index.knnQueryBatch(test_vectors, k=1, num_threads=10)
# Per query the result holds two entries: position 0 is stored as the matched
# row id, position 1 as the distance.
nearest = np.array(neighbours)
test['index'] = nearest[:, 0].reshape(-1)
test['distance'] = nearest[:, 1].reshape(-1)
# Expand the neighbour's [ved1, ved2, ved3] into prediction columns 0, 1, 2.
predicted_codes = test['index'].map(indexed_data_dict).apply(pd.Series)
test_1 = test.join(predicted_codes)

In [43]:
from sklearn import metrics

# F1 scores of the 1-nearest-neighbour predictions on cleaned text.
# NOTE(review): the next cell repeats this call verbatim; the execution counts suggest an older run.
get_metrics(test_1)

In [78]:
from sklearn import metrics

# F1 scores of the 1-nearest-neighbour predictions on cleaned text.
get_metrics(test_1)

In [80]:
# Persist the index to 'index_ved', relative to the current working directory.
nmslib.saveIndex(index, 'index_ved')

In [81]:
# Persist the row id -> [ved1, ved2, ved3] mapping that accompanies the saved index.
joblib.dump(indexed_data_dict, 'index_map.pkl')

## Неочищенный текст:

In [None]:
# Same nearest-neighbour setup as before, but on the raw product names embedded with the v3 model.
# NOTE(review): fb_model_v3 exists only if the v3 training cell was run in this kernel — it is never loaded from disk here.
train, test = train_test_split(data[['product_name', 'ved1', 'ved2', 'ved3', 'index']], test_size=0.20, random_state=42)
# The whole training split is indexed; de-duplication by name is disabled.
indexed_data = train#.drop_duplicates(subset = ['clean_product_name'])
indexed_data_vectors  = np.array([fb_model_v3.get_sentence_vector(text) for text in indexed_data['product_name']])
test_vectors = np.array([fb_model_v3.get_sentence_vector(text) for text in test['product_name']])

# Row id -> [ved1, ved2, ved3] of the indexed training row.
indexed_data_dict = dict(zip(indexed_data['index'].values, indexed_data[['ved1', 'ved2', 'ved3']].values.tolist()))
len(indexed_data_vectors), len(test_vectors)

In [None]:
# Initialize a new NMSLIB index with method 'napp' (not HNSW) in the cosine-similarity
# space, add the raw-text training vectors under their row ids, and build it.
index = nmslib.init(method='napp', space='cosinesimil')
index.addDataPointBatch(data=indexed_data_vectors[:], ids=indexed_data['index'].values)
index.createIndex(print_progress=True)

In [None]:
%%time
# Query the single nearest training vector for every raw-text test vector.
neighbours = index.knnQueryBatch(test_vectors, k=1, num_threads=10)
# Per query the result holds two entries: position 0 is stored as the matched
# row id, position 1 as the distance.
nearest = np.array(neighbours)
test['index'] = nearest[:, 0].reshape(-1)
test['distance'] = nearest[:, 1].reshape(-1)
# Expand the neighbour's [ved1, ved2, ved3] into prediction columns 0, 1, 2.
predicted_codes = test['index'].map(indexed_data_dict).apply(pd.Series)
test_2 = test.join(predicted_codes)

In [None]:
# F1 scores of the 1-nearest-neighbour predictions on raw (uncleaned) text.
get_metrics(test_2)

In [None]:
  # Persist the raw-text index as 'index' inside the working folder `temp`.
  # NOTE(review): this cell carries a stray two-space indent; IPython tolerates it, a plain .py export would not.
  nmslib.saveIndex(index, os.path.join(temp, 'index'))

## ved1, ved2, ved3

In [35]:
import tqdm
import gc
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import scipy as sp
from sklearn import metrics 

def tokenize_with_razdel(text):
    """Return the `.text` of every token that `razdel.tokenize` yields for `text`, as a list.

    Same definition as the one in the text-preprocessing cell earlier in the notebook.
    """
    return [piece.text for piece in razdel.tokenize(text)]

def evaluate_vectorizer(vectorizer, train, test):
    """Train the ved3 classifier and print F1 scores of the ved1 -> ved2 -> ved3 cascade.

    The text vectorizer and the ved1/ved2 classifiers are loaded from pickles in
    `path`; only the ved3 `LinearSVC` is fitted here and dumped to 'model_ved3.pkl'.
    Each stage receives the text features plus the codes of the previous stages
    (true codes while training, predicted codes at test time).

    Args:
        vectorizer: NOTE(review): currently ignored — it is overwritten by the pickled
            vectorizer on the first line of the body.
        train: frame with 'clean_product_name', 'ved1', 'ved2' and 'ved3' columns.
        test: frame with the same columns; it is modified in place — predictions
            are stored in columns 0, 1 and 2 before `get_metrics(test)` is called.
    """
    # NOTE(review): pickles are loaded from the input folder; only load files you trust.
    vectorizer = joblib.load(os.path.join(path, 'vectorizer.pkl'))
    train_vectors = vectorizer.transform(train['clean_product_name'])
    
    # Only the third-stage classifier is created; the first two are loaded below.
#     clf = LinearSVC(random_state=42)
#     clf2 = LinearSVC(random_state=42)
    clf3 = LinearSVC(random_state=42)

    
    # Stage 1 (ved1): training is commented out, the saved model is loaded instead.
#     clf.fit(train_vectors, train['ved1'])
#     print('ved1 done')
#     joblib.dump(clf, 'model_ved1.pkl')
    clf = joblib.load(path+'model_ved1.pkl')

    
    # Stage 2 (ved2): training is commented out, the saved model is loaded instead.
#     train_ved2 = sp.sparse.hstack([train_vectors, train['ved1'].astype(int).values.reshape(-1,1)]) 
#     clf2.fit(train_ved2,  train['ved2'])
#     print('ved2 done')
#     joblib.dump(clf2, 'model_ved2.pkl')
    clf2 = joblib.load(path+'model_ved2.pkl')

    
    # Stage 3 (ved3): text features + true ved1 + true ved2 as two extra integer columns.
    train_ved3 = sp.sparse.hstack([train_vectors, train['ved1'].astype(int).values.reshape(-1,1), train['ved2'].astype(int).values.reshape(-1,1)]) 
    clf3.fit(train_ved3,  train['ved3'])
    joblib.dump(clf3, 'model_ved3.pkl')
    print('model3_read')
    
    # Inference cascade: each stage is fed the predictions of the previous ones.
    test_vectors = vectorizer.transform(test['clean_product_name'])
    test[0] = clf.predict(test_vectors)
    test_ved2 = sp.sparse.hstack([test_vectors,  test[0].astype(int).values.reshape(-1,1)])
    test[1] = clf2.predict(test_ved2)
    test_ved3 =  sp.sparse.hstack([test_vectors, test[0].astype(int).values.reshape(-1,1), test[1].astype(int).values.reshape(-1,1)])
    test[2] = clf3.predict(test_ved3)

    get_metrics(test)   

In [36]:
# 80/20 split (seed 42) for the TF-IDF + LinearSVC cascade.
train, test = train_test_split(data[['clean_product_name', 'ved1', 'ved2', 'ved3']], test_size=0.20, random_state=42)
# NOTE(review): the TfidfVectorizer built here is passed in but evaluate_vectorizer replaces it with a pickled one.
evaluate_vectorizer(TfidfVectorizer(min_df=2, tokenizer=tokenize_with_razdel), train, test)