In [None]:
!pip install openpyxl textdistance

In [None]:
%matplotlib inline
import fasttext
import matplotlib.pyplot as plt
import numpy as np 
from nltk.corpus import stopwords
import pandas as pd 
import pickle
import re
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Rebind `stopwords` from the imported NLTK corpus module to the Russian stop-word
# collection; the text-cleaning helpers below test membership against this global.
stopwords = stopwords.words("russian")

In [None]:
# Input file locations. All three files sit in the same "hacksai-3" dataset, so every
# path is derived from one absolute root instead of mixing absolute paths with a
# working-directory-relative one ("../input/...").
INPUT_DIR = "/kaggle/input/hacksai-3"
VENDORS_DICT_CUSTOM_PATH = f"{INPUT_DIR}/ved_dict.csv"
DATASET_PATH = f"{INPUT_DIR}/dataset.xlsx"
REGLAMENT_PATH = f"{INPUT_DIR}/reglament.csv"

# Read data

In [None]:
# Load the custom TN VED dictionary and drop rows that have no VED code.
ved_dict = pd.read_csv(VENDORS_DICT_CUSTOM_PATH, sep=";") \
    .dropna(subset=["VED"])

# Turn every code column into integer-like strings ("101", not 101.0), leaving
# missing values untouched. The column is cast to object first: writing strings
# straight into a numeric column through .loc is a deprecated "incompatible dtype"
# assignment in recent pandas.
# NOTE(review): going through int drops any leading zeros of a code — confirm this
# matches how the codes appear in the dataset they are later matched against.
for code_col in ["GRUPPA", "TOV_POZ", "SUB_POZ", "VED", "RAZDEL"]:
    codes = ved_dict[code_col].astype(object)
    present = codes.notna()
    codes[present] = codes[present].astype(int).astype(str)
    ved_dict[code_col] = codes


ved_dict.head(5)

In [None]:
# Reference table of technical regulations: cp1251-encoded, ";"-separated, no header
# row, so the two column names are assigned explicitly.
reglament = pd.read_csv(REGLAMENT_PATH, sep=";", encoding="cp1251", header=None) \
    .set_axis(["regulations_id", "regulations_name"], axis=1)
reglament.head(5)

In [None]:
# Raw product dataset read from the .xlsx file (openpyxl is installed in the first cell).
df = pd.read_excel(DATASET_PATH)

df.head(5)

In [None]:
# Keep only the first of any rows that are identical in every column except the first.
payload_columns = df.columns[1:]
is_repeat = df[payload_columns].duplicated()
df = df.loc[~is_repeat].reset_index(drop=True)
# Remove the " Продукция" marker from the product number.
df["Номер продукции"] = df["Номер продукции"].str.replace(" Продукция", "")

In [None]:
# Drop rows that have no value in the "Коды ТН ВЭД ЕАЭС" (TN VED codes) column.
df = df.dropna(subset=["Коды ТН ВЭД ЕАЭС"]).reset_index(drop=True)

In [None]:
# Turn the "; "-separated code string into a list of unique codes.
# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# list(set(...)) order depended on string-hash randomisation, so the row order
# produced by the later explode() — and therefore the seeded train/test split —
# could differ between kernel restarts.
df["Коды ТН ВЭД ЕАЭС"] = df["Коды ТН ВЭД ЕАЭС"].astype(str) \
    .str.split("; ") \
    .apply(lambda codes: list(dict.fromkeys(codes)))

In [None]:
# Drop rows that have no value in the "Технические регламенты" (technical regulations) column.
df = df.dropna(subset=["Технические регламенты"]).reset_index(drop=True)

In [None]:
# Split the "; "-separated regulations into a list of unique, whitespace-trimmed entries.
# dict.fromkeys keeps first-seen order, so the result is identical on every run;
# list(set(...)) ordering changed with string-hash randomisation between kernel restarts.
df["Технические регламенты"] = df["Технические регламенты"].str.split("; ") \
    .apply(lambda items: list(dict.fromkeys(item.strip() for item in items)))

In [None]:
# Drop rows that have no value in the "Группа продукции" (product group) column.
df = df.dropna(subset=["Группа продукции"]).reset_index(drop=True)

In [None]:
# Split the ";"-separated product groups into a list of unique, whitespace-trimmed entries.
# dict.fromkeys keeps first-seen order, so the result is identical on every run;
# list(set(...)) ordering changed with string-hash randomisation between kernel restarts.
df["Группа продукции"] = df["Группа продукции"].str.split(";") \
    .apply(lambda items: list(dict.fromkeys(item.strip() for item in items)))

In [None]:
# Working subset: the first three columns of df (picked by position) plus the product name.
# NOTE(review): presumably the first three columns include "Коды ТН ВЭД ЕАЭС" and
# "Технические регламенты", since both are exploded in the next cell — confirm against the sheet layout.
data = df[df.columns[:3].tolist() + ["Общее наименование продукции"]]

In [None]:
# One row per (TN VED code, technical regulation) pair: explode each list column in
# turn, dropping rows that contain any missing value after each step.
per_code = data.explode("Коды ТН ВЭД ЕАЭС").dropna()
per_regulation = per_code.explode("Технические регламенты").dropna()
data = per_regulation.reset_index(drop=True)

In [None]:
# Keep only the rows whose code is present in the VED column of the dictionary.
has_known_code = data["Коды ТН ВЭД ЕАЭС"].isin(ved_dict["VED"])
data = data.loc[has_known_code].reset_index(drop=True)

In [None]:
# Random peek at the dictionary (no random_state is passed, so the rows differ between runs).
ved_dict.sample(5)

In [None]:
# First rows of the exploded, dictionary-filtered frame.
data.head(5)

In [None]:
# Attach the dictionary's NAIM1 column to every row via its TN VED code.
code_names = ved_dict[["VED", "NAIM1"]].rename(columns={"VED": "Коды ТН ВЭД ЕАЭС"})
data = data.merge(code_names, on="Коды ТН ВЭД ЕАЭС", how="inner")

In [None]:
# Split each regulation string at the space that follows a "/<4 digits>" group:
# the first piece is the regulation id, the second its name.
# The regex is applied once per row (it used to run twice), and a string without that
# separator now yields a missing name instead of raising IndexError.
regulation_parts = data["Технические регламенты"].apply(
    lambda text: re.split(r"(?<=/[\d]{4}) ", text)
)
data["regulations_id"] = regulation_parts.str[0]
data["regulations_name"] = regulation_parts.str[1]

# Predict the technical regulation ID from the product name

In [None]:
def delete_stopwords(s):
    """Remove stop words from a string.

    Parentheses, "+" and whitespace are treated as separators. A word is dropped
    when its lower-cased form is found in the module-level `stopwords` collection;
    the surviving words keep their original case and are joined with single spaces.

    Args:
        s - input string.
    Returns:
        (str)
    """
    tokens = re.sub(r'[()\s+]', u' ', s).split()
    kept = filter(lambda token: token.lower() not in stopwords, tokens)
    return ' '.join(kept).strip()

# Characters that are replaced by a space when cleaning product names.
# NOTE(review): the input is lower-cased before matching, so the upper-case entries
# (e.g. 'Ó', 'Ü', 'Λ') can never match — confirm whether their lower-case forms were intended.
_PUNCTUATION_SYMBOLS = [
    '\t', '!', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '\\', '®',
    '/', '~', '«', '\xad', '¯', '°', '`', '±', '²', '³', '·', 'º', '»', ':', ';', '<', '=', '?', '@',
    'É', 'Ó', 'Ö', '×', 'Ø', 'Ü', 'ä', 'é', 'ö', '÷', 'İ', 'Š', '˂', '˚', '̆', 'Ι', 'Λ', '[', '\\', ']', '_', '`',
    '\u200e', '‐', '–', '—', '‘', '’', '“', '”', '•', '…', '‧', '⁰', '₂', '℃', '№', '™',
    'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', '↑', '−', '∞', '≤', '\uf0d2', '️', '（', '）', '，',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
]
# Every symbol is passed through re.escape before it goes into the character class.
# Pasted in raw, each '\\' entry acted as an escape prefix for the character after it,
# so literal backslashes were never removed, and the bare '[' triggered a
# "possible nested set" FutureWarning. The class also matches whitespace and "+".
_PUNCTUATION_RE = re.compile(
    r'[{}\s+]'.format(''.join(re.escape(symbol) for symbol in _PUNCTUATION_SYMBOLS))
)


def delete_punctuation(s):
    """Lower-case a string, blank out punctuation/digits/symbols, then drop stop words.

    Args:
        s - input string.
    Returns:
        (str) cleaned text as returned by delete_stopwords.
    """
    s = s.lower()
    return delete_stopwords(_PUNCTUATION_RE.sub(u' ', s.replace('\xad', ' ')))

In [None]:
# Distinct combinations of regulation (full string, id, name), product name and
# dictionary description; the trailing expression shows how many remain.
feature_columns = [
    "Технические регламенты", "regulations_id", "regulations_name",
    "Общее наименование продукции", "NAIM1",
]
X = data[feature_columns].drop_duplicates().reset_index(drop=True)
X.shape

In [None]:
# Modelling table: one row per distinct (regulation id, product name) pair,
# plus a cleaned version of the product name that the classifier is trained on.
pair_columns = ["regulations_id", "Общее наименование продукции"]
X_simple = X[pair_columns].drop_duplicates().reset_index(drop=True)

raw_names = X_simple["Общее наименование продукции"]
X_simple["clean_Общее наименование продукции"] = raw_names.apply(delete_punctuation)

In [None]:
# Spread of class sizes: number of distinct product names per regulation id.
# The counts are passed as `x=` explicitly; depending on the seaborn version, a bare
# positional argument is read either as `x` or as `data`, which changes the plot.
ax = sns.boxplot(x=X_simple["regulations_id"].value_counts())
ax.set(title="Product names per regulation id", xlabel="rows per regulation id")
plt.show()

In [None]:
# 80/20 split of cleaned product names against regulation ids, stratified by
# regulation id and seeded with random_state=42.
features = X_simple["clean_Общее наименование продукции"]
targets = X_simple["regulations_id"]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=.2, stratify=targets, random_state=42
)

### Augment dataset

In [None]:
# Oversample rare classes with word-order permutations of their training texts.
MIN_CLASS_SIZE = 1000     # classes with fewer training rows than this get augmented
N_SHUFFLE_ROUNDS = 10     # number of augmentation passes
shuffle_rng = np.random.default_rng(42)  # seeded so the augmented set is reproducible


def shuffle_words(text):
    """Return `text` with its whitespace-separated words in a random order."""
    return ' '.join(shuffle_rng.permutation(text.split()))


for _ in range(N_SHUFFLE_ROUNDS):
    # Class sizes are recomputed each round because earlier rounds add rows.
    class_sizes = y_train.value_counts()
    rare_classes = class_sizes[class_sizes < MIN_CLASS_SIZE].index.tolist()
    # Sources are always rows of X_simple whose index is in the training split,
    # so test rows are never used for augmentation.
    low_sample = X_simple[
        X_simple["regulations_id"].isin(rare_classes)
        & X_simple.index.isin(X_train.index)
    ]

    # Build the augmented texts as a new Series instead of assigning into the
    # filtered slice (which raised SettingWithCopyWarning).
    shuffled = low_sample["clean_Общее наименование продукции"].apply(shuffle_words)

    X_train = pd.concat([X_train, shuffled], axis=0)
    y_train = pd.concat([y_train, low_sample["regulations_id"]], axis=0)

    # Drop texts that already exist, keeping labels aligned with the texts.
    m = ~X_train.duplicated()
    X_train = X_train[m]
    y_train = y_train[m]

In [None]:
# Oversample rare classes with random word subsets (in random order) of their training texts.
MIN_CLASS_SIZE = 1000      # classes with fewer training rows than this get augmented
N_SUBSAMPLE_ROUNDS = 10    # number of augmentation passes
subsample_rng = np.random.default_rng(42)  # seeded so the augmented set is reproducible


def sample_words(text):
    """Return a random non-empty subset of the words of `text`, in random order.

    The subset size is drawn from [0, n_words) and raised to at least 1. An empty
    text is returned as '' — the previous inline version raised ValueError there
    because it called randint(0, 0).
    """
    words = text.split()
    if not words:
        return ''
    n_keep = max(1, int(subsample_rng.integers(0, len(words))))
    return ' '.join(subsample_rng.choice(words, size=n_keep, replace=False))


for _ in range(N_SUBSAMPLE_ROUNDS):
    # Class sizes are recomputed each round because earlier rounds add rows.
    class_sizes = y_train.value_counts()
    rare_classes = class_sizes[class_sizes < MIN_CLASS_SIZE].index.tolist()
    # Sources are always rows of X_simple whose index is in the training split,
    # so test rows are never used for augmentation.
    low_sample = X_simple[
        X_simple["regulations_id"].isin(rare_classes)
        & X_simple.index.isin(X_train.index)
    ]

    # Build the augmented texts as a new Series instead of assigning into the
    # filtered slice (which raised SettingWithCopyWarning).
    subsampled = low_sample["clean_Общее наименование продукции"].apply(sample_words)

    X_train = pd.concat([X_train, subsampled], axis=0)
    y_train = pd.concat([y_train, low_sample["regulations_id"]], axis=0)

    # Drop texts that already exist, keeping labels aligned with the texts.
    m = ~X_train.duplicated()
    X_train = X_train[m]
    y_train = y_train[m]

In [None]:
# Spread of class sizes in the training labels after augmentation.
# The counts are passed as `x=` explicitly; depending on the seaborn version, a bare
# positional argument is read either as `x` or as `data`, which changes the plot.
ax = sns.boxplot(x=y_train.value_counts())
ax.set(title="Training rows per regulation id", xlabel="rows per regulation id")
plt.show()

In [None]:
# Write the training set as one "__label__<id> <text>" line per row (spaces in the
# id become underscores so the label stays a single token), then train the
# supervised fastText classifier on that file.
with open("fasttext_train.txt", "w", encoding="utf-8") as train_file:
    for position, text in tqdm(enumerate(X_train)):
        label = y_train.iloc[position].replace(' ', '_')
        train_file.write(f"__label__{label} {text}\n")

model = fasttext.train_supervised(input="fasttext_train.txt", minCount=5)

In [None]:
# Predict labels for the held-out texts; y_preds[0] is read in the next cell as
# one sequence of predicted labels per test row.
y_preds = model.predict(X_test.tolist())

In [None]:
# Convert each top prediction back to a regulation id: drop the 9-character
# "__label__" prefix and turn underscores back into spaces, then score against y_test.
predicted_ids = [labels[0][9:].replace('_', ' ') for labels in y_preds[0]]
print(classification_report(y_test, predicted_ids))

In [None]:
# Lookup used at inference time: regulations_id -> full "Технические регламенты" string.
# NOTE: this rebinds `ved_dict`, which until here was the TN VED dictionary DataFrame.
# Earlier cells that use it as a DataFrame (the isin filter, the merge) cannot be
# re-run after this cell without reloading the dictionary first.
# NOTE(review): if one regulations_id maps to several full strings, presumably only
# one of them survives in the dict — confirm that is acceptable.
ved_dict = X[["regulations_id", "Технические регламенты"]].drop_duplicates() \
    .set_index("regulations_id")["Технические регламенты"].to_dict()

In [None]:
# Persist the trained classifier and the regulation lookup for the inference code below.
model.save_model("ved_predictor.model")

# The lookup dict is pickled with the highest protocol the running Python supports.
with open("ved_dict.pickle", "wb") as handle:
    pickle.dump(ved_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import fasttext
import pickle
import re


def load_model(path):
    """ Load a saved fastText classifier.

    Args:
        path - location of the saved model file.
    Returns:
        the object returned by fasttext.load_model for that file.
    """
    classifier = fasttext.load_model(path)
    return classifier


def load_stopwords(path):
    """ Load stop words from a text file with one word per line.

    The file is opened in a `with` block so the handle is always closed (it used
    to be left open), and it is decoded as UTF-8 explicitly rather than with the
    platform default, matching how this notebook writes its own text files.

    Args:
        path - path to the stop-word file.
    Returns:
        (list of str) the file content split on newlines; a trailing newline in
        the file produces a final empty string.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().split("\n")


def load_ved_dict(path):
    """ Read the pickled lookup dictionary from disk.

    The file is unpickled as-is, so it must come from a trusted source
    (unpickling can execute arbitrary code).

    Args:
        path - path to the pickle file.
    Returns:
        the unpickled object.
    """
    with open(path, "rb") as pickled:
        return pickle.load(pickled)


def delete_stopwords(s):
    """ Delete stop words from a string.

    Parentheses, "+" and whitespace are treated as separators. A word is dropped
    when its lower-cased form is found in the module-level `stopwords` collection;
    the surviving words keep their original case and are joined with single spaces.

    Args:
        s - input string.
    Returns:
        (str)
    """
    tokens = re.sub(r'[()\s+]', u' ', s).split()
    kept = filter(lambda token: token.lower() not in stopwords, tokens)
    return ' '.join(kept).strip()


# Characters that are replaced by a space when cleaning product names.
# NOTE(review): the input is lower-cased before matching, so the upper-case entries
# (e.g. 'Ó', 'Ü', 'Λ') can never match — confirm whether their lower-case forms were intended.
_PUNCTUATION_SYMBOLS = [
    '\t', '!', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '\\', '®',
    '/', '~', '«', '\xad', '¯', '°', '`', '±', '²', '³', '·', 'º', '»', ':', ';', '<', '=', '?', '@',
    'É', 'Ó', 'Ö', '×', 'Ø', 'Ü', 'ä', 'é', 'ö', '÷', 'İ', 'Š', '˂', '˚', '̆', 'Ι', 'Λ', '[', '\\', ']', '_', '`',
    '\u200e', '‐', '–', '—', '‘', '’', '“', '”', '•', '…', '‧', '⁰', '₂', '℃', '№', '™',
    'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', '↑', '−', '∞', '≤', '\uf0d2', '️', '（', '）', '，',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
]
# Every symbol is passed through re.escape before it goes into the character class.
# Pasted in raw, each '\\' entry acted as an escape prefix for the character after it,
# so literal backslashes were never removed, and the bare '[' triggered a
# "possible nested set" FutureWarning. The class also matches whitespace and "+".
_PUNCTUATION_RE = re.compile(
    r'[{}\s+]'.format(''.join(re.escape(symbol) for symbol in _PUNCTUATION_SYMBOLS))
)


def preprocess(s):
    """ Lower-case a string, blank out punctuation/digits/symbols, then drop stop words.

    This is the inference-time counterpart of delete_punctuation, which cleans the
    training texts; the two should be kept in sync.

    Args:
        s - input string.
    Returns:
        (str) cleaned text as returned by delete_stopwords.
    """
    s = s.lower()
    return delete_stopwords(_PUNCTUATION_RE.sub(u' ', s.replace('\xad', ' ')))


def predict_ved(s):
    """ Predict a label for a product name and look it up in `ved_dict`.

    The text is cleaned with preprocess, classified with the module-level `model`,
    and the top label (minus its "__label__" prefix, underscores turned back into
    spaces) is used as the key into the module-level `ved_dict`. In this notebook
    that dict maps a regulation id to the full regulation string.

    Args:
        s - input string.
    Returns:
        the `ved_dict` entry for the predicted label, or None if the label is not a key.
    """
    cleaned = preprocess(s)
    predicted_labels = model.predict(cleaned)[0]
    top_label = predicted_labels[0]
    lookup_key = top_label[len("__label__"):].replace('_', ' ')
    return ved_dict.get(lookup_key)

In [None]:
%%time
# Timed smoke test of the inference helper. It relies on `model`, `ved_dict` and
# `stopwords` already existing as globals from the cells above; the load_* helpers
# defined in the previous cell are not called here.
predict_ved("нижнее белье и ночнушнки пижамы")