In [37]:
%cd /content/drive/MyDrive/NLP Made

/content/drive/MyDrive/NLP Made


Вручную проставлены теги на основе столбца "category" - это категория, которую проставляет сам ресторан. И называет ее как угодно


In [38]:
!pip install pymystem3==0.1.10 -q
#!pip install gensim --upgrade -q

In [None]:
import numpy as np
import pandas as pd
import html
from pymystem3 import Mystem


ALLOWED_ALPHABET=list(map(chr, range(ord('а'), ord('я') + 1)))
ALLOWED_ALPHABET.extend(map(chr, range(ord('a'), ord('z') + 1)))
ALLOWED_ALPHABET.extend(list(map(str.upper, ALLOWED_ALPHABET)))
ALLOWED_ALPHABET = set(ALLOWED_ALPHABET)


def normalize(sentence):
    def validate_char(c):
        return c in ALLOWED_ALPHABET
    
    sentence = ''.join(map(lambda c: c if validate_char(c) else ' ', sentence.lower()))
    return ''.join(sentence).strip()

def collect(row, word_to_lemma, count, stemmer, correct_spelling):
    # Заменить html сущности на символы
    row = html.unescape(row)

    # Оставить только символы Аа_Яя Aa_Zz и привести в нижний регистр
    normalized = normalize(row)

    # Разделить предложения на слова по ' ', убрать пустые элементы, заменить опечатки
    list_row = list()
    for word in normalized.split(' '):
        if word != '':
            correct_form = correct_spelling[word]
            # Здесь одно слово может быть разбито на 2, "напримертакое" -> "например такое". Их нужно собрать
            for part in correct_form.split(' '):
                list_row.append(part)

    # Лемматизация слов и сохранение пары слово:лемма в словарь для ускорения работы.
    lemmatized = list()
    for word in list_row:
        if word not in word_to_lemma:
            word_to_lemma[word] = stemmer.lemmatize(word.strip())[0]
        lemma = word_to_lemma[word]

        if lemma in count:
            count[lemma] += 1
        else:
            count[lemma] = 1
        lemmatized.append(lemma)
    
    return ' '.join(lemmatized)

def keep_common(sent, counts, dictionary_threshold, unk_token, keep_order):
    common_words = [unk_token if counts[word] < dictionary_threshold else word for word in sent.split()]

    # Если отсортировать слова, то уберется много дубликатов в корпусе
    if not keep_order:
        common_words = sorted(common_words)
    
    return ' '.join(common_words)

def clean_price(data):
    # Много строк без информации и большой ценой
    data['name_dish'] = data['name_dish'].where(data.name_dish != 'Третий набор в подарок')
    data.dropna(inplace = True)

    # Дороже 100к
    data.price = data.price.apply(lambda x: x/1000 if x>100000 else x)

    # Дороже 15к и не сеты
    idx = data.loc[(data.price > 15000) & ~(data.category.isin(['Рыба и морепродукты','Морепродукты', 'Сеты', 'Комбо', 'Наборы','Торты', 'Мясо']))].index
    data['price'] = data['price'].where(~data.index.isin(idx), data['price']/100)

    # Оставшиеся сеты
    data['price'] = data['price'].apply(lambda x: x/100 if x>35000 else x)

    data = data.astype({'price': int})
    return data

def clean_words(data, dictionary_threshold, unk_token, keep_order, to_spell_path):
    stemmer = Mystem()

    # Словарь с заменой опечаток
    correct_spelling = np.load(to_spell_path, allow_pickle=True).item()
    
    # Нормализация и подсчет встречаемости слов
    word_to_lemma = dict()
    count_description = dict()
    count_name = dict()
    data['product_description'] = data['product_description'].apply(lambda x: collect(x, word_to_lemma, count_description, stemmer, correct_spelling))
    data['name_dish'] = data['name_dish'].apply(lambda x: collect(x, word_to_lemma, count_name, stemmer, correct_spelling))

    # Замена редких слов на <UNK>
    data['product_description'] = data['product_description'].apply(lambda x: keep_common(x, count_description, dictionary_threshold, unk_token, keep_order))
    data['name_dish'] = data['name_dish'].apply(lambda x: keep_common(x, count_name, dictionary_threshold, unk_token, keep_order))

    # Удаление дубликатов
    data.drop_duplicates(subset = ['category','name_dish','product_description','tags_menu'], inplace = True)
    return data

def clean_df(data, dictionary_threshold = 30, unk_token = '<UNK>', keep_order = True, to_spell_path = 'to_spell.npy'):
    data = data.copy()

    # Замена словаря в поле tags_menu на его значение
    data['tags_menu'] = data['tags_menu'].apply(lambda x: [k for k in x.keys()][0])

    # Удаление одной строки с пустыми name_dish, product_description, price
    data.dropna(inplace = True)
    data.reset_index(drop=True, inplace=True)

    # Очистка price
    data = clean_price(data)
    
    # Очистка name_dish, product_description
    data = clean_words(data, dictionary_threshold, unk_token, keep_order, to_spell_path)

    return data

In [39]:
import pandas as pd
pd.set_option('display.max_rows', 500)

data = pd.read_pickle('menu_df_for_made.pkl')

In [40]:
from preprocess import clean_df
data_c = clean_df(data, dictionary_threshold=5, unk_token='', keep_order = True)

In [None]:
eng_words = {k: None for k,v in w_t_l.items() if any([x in list(map(chr, range(ord('a'), ord('z') + 1))) for x in k])}

In [None]:
%cd /content/drive/MyDrive/NLP Made

/content/drive/MyDrive/NLP Made


In [None]:
!pip install googletrans==3.1.0a0 -q

[K     |████████████████████████████████| 55 kB 2.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 14.7 MB/s 
[K     |████████████████████████████████| 42 kB 1.7 MB/s 
[K     |████████████████████████████████| 53 kB 2.7 MB/s 
[K     |████████████████████████████████| 65 kB 4.4 MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone


In [None]:
!python3 translator_threads.py

# Bert finetuning

In [41]:
data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']

In [46]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

data_c['y'] = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))
data_c = data_c.astype({'y': int})

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_c['X'], data_c['y'], test_size=0.2, stratify = data_c['tags_menu'], random_state=42, shuffle = True)

In [49]:
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify = y_test, random_state=42, shuffle = True)

In [50]:
pd.DataFrame({'X':X_train, 'y': y_train}).to_csv('train.csv')
pd.DataFrame({'X':X_valid, 'y': y_valid}).to_csv('valid.csv')
pd.DataFrame({'X':X_test, 'y': y_test}).to_csv('test.csv')

# Naive bayes

In [None]:
data_c

Unnamed: 0,product_id,category,name_dish,product_description,price,tags_menu
0,300146805,Пицца,милан пицца фирменный,болгарский в вырезка гриб клюквенный мариноват...,620,пицца
1,300146840,Пицца,пицца фермерский,ветчина говяжий гриб грудка индейка корнишон к...,650,пицца
2,300146870,Пицца,пицца сезон четыре,болгарский гриб лук маслина моцарелл оливка пе...,550,пицца
3,300146918,Пицца,деревенский пицца,бекон ветчина гриб индейка лук моцарелл пицца ...,620,пицца
4,300146996,Пицца,маргарита пицца,моцарелл пицца соус сыр томатный,490,пицца
...,...,...,...,...,...,...
2050004,331415633,Драники,драник и картофельный лосось слабосоленый слив...,,530,драники
2050005,331415634,Драники,драник из свекла,,195,драники
2050006,331415635,Драники,драник из капуста,,195,драники
2050013,333962918,Драники,драник картофельный,шт,139,драники


In [None]:
data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']

In [None]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

enc = OrdinalEncoder()

y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))

#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(data_c['X'].values)

N_SPLITS = 5
skf = StratifiedKFold(n_splits = N_SPLITS)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    model = MultinomialNB()
    model.fit(X[train_index, :], y[train_index,:])

    y_pred = model.predict(X[test_index,:])
    y_true = y[test_index,:]

    if i == 0:
        report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
    else:
        report += pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T

report = report / N_SPLITS

In [None]:
#CountVectorizer()@100
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.09838,0.144444,0.113813,18.0
блины и оладьи,0.769784,0.885111,0.823139,1396.2
бургер,0.874415,0.932144,0.902253,6334.0
вегетарианская еда,0.242173,0.246549,0.243001,912.6
вок и лапша,0.8433,0.944623,0.891075,5626.8
гарниры,0.800228,0.751307,0.774906,5393.0
драники,0.394853,0.193103,0.251751,29.0
закуски,0.618536,0.483897,0.542289,4179.4
каша и гранола,0.550259,0.58125,0.563168,64.0
кофе,0.25875,0.65153,0.320943,443.6


In [None]:
#CountVectorizer()@50
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.120606,0.077778,0.087576,18.0
блины и оладьи,0.763271,0.882502,0.818231,1397.4
бургер,0.872067,0.933382,0.90159,6346.6
вегетарианская еда,0.253324,0.244858,0.24778,914.0
вок и лапша,0.842521,0.946795,0.891607,5627.2
гарниры,0.801519,0.754969,0.777465,5411.6
драники,0.288095,0.110345,0.15845,29.0
закуски,0.618458,0.495971,0.549566,4194.2
каша и гранола,0.603414,0.5125,0.55108,64.0
кофе,0.26113,0.625711,0.317608,448.8


In [None]:
#CountVectorizer()@10
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.0,0.0,0.0,18.0
блины и оладьи,0.770012,0.867096,0.815319,1398.0
бургер,0.866042,0.937072,0.900069,6362.8
вегетарианская еда,0.298028,0.220643,0.252348,916.4
вок и лапша,0.836864,0.949603,0.889663,5627.4
гарниры,0.799323,0.76267,0.780516,5438.0
драники,0.2,0.006897,0.013333,29.0
закуски,0.61194,0.516011,0.558889,4209.6
каша и гранола,0.775935,0.355913,0.487654,64.6
кофе,0.2744,0.511314,0.302423,459.6


In [None]:
#CountVectorizer()
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.128571,0.044444,0.061364,18.0
блины и оладьи,0.764817,0.878081,0.817184,1397.6
бургер,0.870929,0.934538,0.90152,6354.8
вегетарианская еда,0.269522,0.244432,0.255089,915.6
вок и лапша,0.84034,0.947864,0.890859,5627.4
гарниры,0.801811,0.757699,0.77906,5421.4
драники,0.192424,0.062069,0.09305,29.0
закуски,0.617268,0.503761,0.553835,4200.8
каша и гранола,0.648593,0.468942,0.542504,64.4
кофе,0.26709,0.597046,0.315865,454.2


In [None]:
# TfidfVectorizer
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.0,0.0,0.0,18.0
блины и оладьи,0.847408,0.736265,0.787297,1397.6
бургер,0.860016,0.928306,0.892781,6354.8
вегетарианская еда,0.579891,0.097638,0.166424,915.6
вок и лапша,0.858697,0.938232,0.896701,5627.4
гарниры,0.791114,0.730437,0.759468,5421.4
драники,0.0,0.0,0.0,29.0
закуски,0.629543,0.412588,0.498001,4200.8
каша и гранола,0.4,0.009327,0.018182,64.4
кофе,0.290234,0.036994,0.062953,454.2


# LogReg tf-idf

In [None]:
data_c

Unnamed: 0,product_id,category,name_dish,product_description,price,tags_menu
0,300146805,Пицца,милан фирменный пицца,болгарский перец гриб помидор свиной вырезка м...,620,пицца
1,300146840,Пицца,фермерский пицца,пицца соус сливочный ветчина индейка говяжий я...,650,пицца
2,300146870,Пицца,пицца четыре сезон,болгарский перец гриб лук маслина оливка помид...,550,пицца
3,300146918,Пицца,деревенский пицца,бекон ветчина индейка гриб лук помидор салями ...,620,пицца
4,300146996,Пицца,пицца маргарита,пицца соус томатный сыр моцарелл,490,пицца
...,...,...,...,...,...,...
2050004,331415633,Драники,картофельный драник со слабосоленый лосось и с...,,530,драники
2050005,331415634,Драники,драник из свекла,,195,драники
2050006,331415635,Драники,драник из капуста,,195,драники
2050013,333962918,Драники,драник картофельный,шт,139,драники


In [None]:
data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']

In [None]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

enc = OrdinalEncoder()

y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))

vectorizer = TfidfVectorizer(ngram_range = (1,1), min_df = 5, max_features=12000, )
X = vectorizer.fit_transform(data_c['X'].values)

N_SPLITS = 5
skf = StratifiedKFold(n_splits = N_SPLITS)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    model = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=-1)
    model.fit(X[train_index, :], y[train_index,:])

    y_pred = model.predict(X[test_index,:])
    y_true = y[test_index,:]

    if i == 0:
        report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
    else:
        report += pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T

    break

In [None]:
# TfidfVectorizer(ngram_range = (1,1), min_df = 5, max_features=12000, )
report

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.166667,0.055556,0.083333,18.0
блины и оладьи,0.93985,0.885897,0.912076,1411.0
бургер,0.953807,0.924493,0.938921,6410.0
вегетарианская еда,0.426087,0.212581,0.283647,922.0
вок и лапша,0.889251,0.914694,0.901793,5662.0
гарниры,0.801679,0.82638,0.813842,5777.0
драники,0.45,0.62069,0.521739,29.0
закуски,0.622103,0.655553,0.63839,4259.0
каша и гранола,0.491525,0.446154,0.467742,65.0
кофе,0.5,0.019231,0.037037,468.0


In [None]:
import eli5
eli5.show_weights(estimator=model, 
                  feature_names= list(vectorizer.get_feature_names()),
                  target_names = enc.categories_[0],
                 top=(20, 20))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,Unnamed: 28_level_4,Unnamed: 29_level_4,Unnamed: 30_level_4,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,Unnamed: 34_level_4,Unnamed: 35_level_4,Unnamed: 36_level_4,Unnamed: 37_level_4,Unnamed: 38_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Unnamed: 25_level_5,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Unnamed: 30_level_5,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Unnamed: 34_level_5,Unnamed: 35_level_5,Unnamed: 36_level_5,Unnamed: 37_level_5,Unnamed: 38_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Unnamed: 37_level_6,Unnamed: 38_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Unnamed: 38_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8,Unnamed: 24_level_8,Unnamed: 25_level_8,Unnamed: 26_level_8,Unnamed: 27_level_8,Unnamed: 28_level_8,Unnamed: 29_level_8,Unnamed: 30_level_8,Unnamed: 31_level_8,Unnamed: 32_level_8,Unnamed: 33_level_8,Unnamed: 34_level_8,Unnamed: 35_level_8,Unnamed: 36_level_8,Unnamed: 37_level_8,Unnamed: 38_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9,Unnamed: 20_level_9,Unnamed: 21_level_9,Unnamed: 22_level_9,Unnamed: 23_level_9,Unnamed: 24_level_9,Unnamed: 25_level_9,Unnamed: 26_level_9,Unnamed: 27_level_9,Unnamed: 28_level_9,Unnamed: 29_level_9,Unnamed: 30_level_9,Unnamed: 31_level_9,Unnamed: 32_level_9,Unnamed: 33_level_9,Unnamed: 34_level_9,Unnamed: 35_level_9,Unnamed: 36_level_9,Unnamed: 37_level_9,Unnamed: 38_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10,Unnamed: 20_level_10,Unnamed: 21_level_10,Unnamed: 22_level_10,Unnamed: 23_level_10,Unnamed: 24_level_10,Unnamed: 25_level_10,Unnamed: 26_level_10,Unnamed: 27_level_10,Unnamed: 28_level_10,Unnamed: 29_level_10,Unnamed: 30_level_10,Unnamed: 31_level_10,Unnamed: 32_level_10,Unnamed: 33_level_10,Unnamed: 34_level_10,Unnamed: 35_level_10,Unnamed: 36_level_10,Unnamed: 37_level_10,Unnamed: 38_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11,Unnamed: 20_level_11,Unnamed: 21_level_11,Unnamed: 22_level_11,Unnamed: 23_level_11,Unnamed: 24_level_11,Unnamed: 25_level_11,Unnamed: 26_level_11,Unnamed: 27_level_11,Unnamed: 28_level_11,Unnamed: 29_level_11,Unnamed: 30_level_11,Unnamed: 31_level_11,Unnamed: 32_level_11,Unnamed: 33_level_11,Unnamed: 34_level_11,Unnamed: 35_level_11,Unnamed: 36_level_11,Unnamed: 37_level_11,Unnamed: 38_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12,Unnamed: 20_level_12,Unnamed: 21_level_12,Unnamed: 22_level_12,Unnamed: 23_level_12,Unnamed: 24_level_12,Unnamed: 25_level_12,Unnamed: 26_level_12,Unnamed: 27_level_12,Unnamed: 28_level_12,Unnamed: 29_level_12,Unnamed: 30_level_12,Unnamed: 31_level_12,Unnamed: 32_level_12,Unnamed: 33_level_12,Unnamed: 34_level_12,Unnamed: 35_level_12,Unnamed: 36_level_12,Unnamed: 37_level_12,Unnamed: 38_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13,Unnamed: 20_level_13,Unnamed: 21_level_13,Unnamed: 22_level_13,Unnamed: 23_level_13,Unnamed: 24_level_13,Unnamed: 25_level_13,Unnamed: 26_level_13,Unnamed: 27_level_13,Unnamed: 28_level_13,Unnamed: 29_level_13,Unnamed: 30_level_13,Unnamed: 31_level_13,Unnamed: 32_level_13,Unnamed: 33_level_13,Unnamed: 34_level_13,Unnamed: 35_level_13,Unnamed: 36_level_13,Unnamed: 37_level_13,Unnamed: 38_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14,Unnamed: 20_level_14,Unnamed: 21_level_14,Unnamed: 22_level_14,Unnamed: 23_level_14,Unnamed: 24_level_14,Unnamed: 25_level_14,Unnamed: 26_level_14,Unnamed: 27_level_14,Unnamed: 28_level_14,Unnamed: 29_level_14,Unnamed: 30_level_14,Unnamed: 31_level_14,Unnamed: 32_level_14,Unnamed: 33_level_14,Unnamed: 34_level_14,Unnamed: 35_level_14,Unnamed: 36_level_14,Unnamed: 37_level_14,Unnamed: 38_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15,Unnamed: 20_level_15,Unnamed: 21_level_15,Unnamed: 22_level_15,Unnamed: 23_level_15,Unnamed: 24_level_15,Unnamed: 25_level_15,Unnamed: 26_level_15,Unnamed: 27_level_15,Unnamed: 28_level_15,Unnamed: 29_level_15,Unnamed: 30_level_15,Unnamed: 31_level_15,Unnamed: 32_level_15,Unnamed: 33_level_15,Unnamed: 34_level_15,Unnamed: 35_level_15,Unnamed: 36_level_15,Unnamed: 37_level_15,Unnamed: 38_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16,Unnamed: 20_level_16,Unnamed: 21_level_16,Unnamed: 22_level_16,Unnamed: 23_level_16,Unnamed: 24_level_16,Unnamed: 25_level_16,Unnamed: 26_level_16,Unnamed: 27_level_16,Unnamed: 28_level_16,Unnamed: 29_level_16,Unnamed: 30_level_16,Unnamed: 31_level_16,Unnamed: 32_level_16,Unnamed: 33_level_16,Unnamed: 34_level_16,Unnamed: 35_level_16,Unnamed: 36_level_16,Unnamed: 37_level_16,Unnamed: 38_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17,Unnamed: 20_level_17,Unnamed: 21_level_17,Unnamed: 22_level_17,Unnamed: 23_level_17,Unnamed: 24_level_17,Unnamed: 25_level_17,Unnamed: 26_level_17,Unnamed: 27_level_17,Unnamed: 28_level_17,Unnamed: 29_level_17,Unnamed: 30_level_17,Unnamed: 31_level_17,Unnamed: 32_level_17,Unnamed: 33_level_17,Unnamed: 34_level_17,Unnamed: 35_level_17,Unnamed: 36_level_17,Unnamed: 37_level_17,Unnamed: 38_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18,Unnamed: 20_level_18,Unnamed: 21_level_18,Unnamed: 22_level_18,Unnamed: 23_level_18,Unnamed: 24_level_18,Unnamed: 25_level_18,Unnamed: 26_level_18,Unnamed: 27_level_18,Unnamed: 28_level_18,Unnamed: 29_level_18,Unnamed: 30_level_18,Unnamed: 31_level_18,Unnamed: 32_level_18,Unnamed: 33_level_18,Unnamed: 34_level_18,Unnamed: 35_level_18,Unnamed: 36_level_18,Unnamed: 37_level_18,Unnamed: 38_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19,Unnamed: 20_level_19,Unnamed: 21_level_19,Unnamed: 22_level_19,Unnamed: 23_level_19,Unnamed: 24_level_19,Unnamed: 25_level_19,Unnamed: 26_level_19,Unnamed: 27_level_19,Unnamed: 28_level_19,Unnamed: 29_level_19,Unnamed: 30_level_19,Unnamed: 31_level_19,Unnamed: 32_level_19,Unnamed: 33_level_19,Unnamed: 34_level_19,Unnamed: 35_level_19,Unnamed: 36_level_19,Unnamed: 37_level_19,Unnamed: 38_level_19
Weight?,Feature,Unnamed: 2_level_20,Unnamed: 3_level_20,Unnamed: 4_level_20,Unnamed: 5_level_20,Unnamed: 6_level_20,Unnamed: 7_level_20,Unnamed: 8_level_20,Unnamed: 9_level_20,Unnamed: 10_level_20,Unnamed: 11_level_20,Unnamed: 12_level_20,Unnamed: 13_level_20,Unnamed: 14_level_20,Unnamed: 15_level_20,Unnamed: 16_level_20,Unnamed: 17_level_20,Unnamed: 18_level_20,Unnamed: 19_level_20,Unnamed: 20_level_20,Unnamed: 21_level_20,Unnamed: 22_level_20,Unnamed: 23_level_20,Unnamed: 24_level_20,Unnamed: 25_level_20,Unnamed: 26_level_20,Unnamed: 27_level_20,Unnamed: 28_level_20,Unnamed: 29_level_20,Unnamed: 30_level_20,Unnamed: 31_level_20,Unnamed: 32_level_20,Unnamed: 33_level_20,Unnamed: 34_level_20,Unnamed: 35_level_20,Unnamed: 36_level_20,Unnamed: 37_level_20,Unnamed: 38_level_20
Weight?,Feature,Unnamed: 2_level_21,Unnamed: 3_level_21,Unnamed: 4_level_21,Unnamed: 5_level_21,Unnamed: 6_level_21,Unnamed: 7_level_21,Unnamed: 8_level_21,Unnamed: 9_level_21,Unnamed: 10_level_21,Unnamed: 11_level_21,Unnamed: 12_level_21,Unnamed: 13_level_21,Unnamed: 14_level_21,Unnamed: 15_level_21,Unnamed: 16_level_21,Unnamed: 17_level_21,Unnamed: 18_level_21,Unnamed: 19_level_21,Unnamed: 20_level_21,Unnamed: 21_level_21,Unnamed: 22_level_21,Unnamed: 23_level_21,Unnamed: 24_level_21,Unnamed: 25_level_21,Unnamed: 26_level_21,Unnamed: 27_level_21,Unnamed: 28_level_21,Unnamed: 29_level_21,Unnamed: 30_level_21,Unnamed: 31_level_21,Unnamed: 32_level_21,Unnamed: 33_level_21,Unnamed: 34_level_21,Unnamed: 35_level_21,Unnamed: 36_level_21,Unnamed: 37_level_21,Unnamed: 38_level_21
Weight?,Feature,Unnamed: 2_level_22,Unnamed: 3_level_22,Unnamed: 4_level_22,Unnamed: 5_level_22,Unnamed: 6_level_22,Unnamed: 7_level_22,Unnamed: 8_level_22,Unnamed: 9_level_22,Unnamed: 10_level_22,Unnamed: 11_level_22,Unnamed: 12_level_22,Unnamed: 13_level_22,Unnamed: 14_level_22,Unnamed: 15_level_22,Unnamed: 16_level_22,Unnamed: 17_level_22,Unnamed: 18_level_22,Unnamed: 19_level_22,Unnamed: 20_level_22,Unnamed: 21_level_22,Unnamed: 22_level_22,Unnamed: 23_level_22,Unnamed: 24_level_22,Unnamed: 25_level_22,Unnamed: 26_level_22,Unnamed: 27_level_22,Unnamed: 28_level_22,Unnamed: 29_level_22,Unnamed: 30_level_22,Unnamed: 31_level_22,Unnamed: 32_level_22,Unnamed: 33_level_22,Unnamed: 34_level_22,Unnamed: 35_level_22,Unnamed: 36_level_22,Unnamed: 37_level_22,Unnamed: 38_level_22
Weight?,Feature,Unnamed: 2_level_23,Unnamed: 3_level_23,Unnamed: 4_level_23,Unnamed: 5_level_23,Unnamed: 6_level_23,Unnamed: 7_level_23,Unnamed: 8_level_23,Unnamed: 9_level_23,Unnamed: 10_level_23,Unnamed: 11_level_23,Unnamed: 12_level_23,Unnamed: 13_level_23,Unnamed: 14_level_23,Unnamed: 15_level_23,Unnamed: 16_level_23,Unnamed: 17_level_23,Unnamed: 18_level_23,Unnamed: 19_level_23,Unnamed: 20_level_23,Unnamed: 21_level_23,Unnamed: 22_level_23,Unnamed: 23_level_23,Unnamed: 24_level_23,Unnamed: 25_level_23,Unnamed: 26_level_23,Unnamed: 27_level_23,Unnamed: 28_level_23,Unnamed: 29_level_23,Unnamed: 30_level_23,Unnamed: 31_level_23,Unnamed: 32_level_23,Unnamed: 33_level_23,Unnamed: 34_level_23,Unnamed: 35_level_23,Unnamed: 36_level_23,Unnamed: 37_level_23,Unnamed: 38_level_23
Weight?,Feature,Unnamed: 2_level_24,Unnamed: 3_level_24,Unnamed: 4_level_24,Unnamed: 5_level_24,Unnamed: 6_level_24,Unnamed: 7_level_24,Unnamed: 8_level_24,Unnamed: 9_level_24,Unnamed: 10_level_24,Unnamed: 11_level_24,Unnamed: 12_level_24,Unnamed: 13_level_24,Unnamed: 14_level_24,Unnamed: 15_level_24,Unnamed: 16_level_24,Unnamed: 17_level_24,Unnamed: 18_level_24,Unnamed: 19_level_24,Unnamed: 20_level_24,Unnamed: 21_level_24,Unnamed: 22_level_24,Unnamed: 23_level_24,Unnamed: 24_level_24,Unnamed: 25_level_24,Unnamed: 26_level_24,Unnamed: 27_level_24,Unnamed: 28_level_24,Unnamed: 29_level_24,Unnamed: 30_level_24,Unnamed: 31_level_24,Unnamed: 32_level_24,Unnamed: 33_level_24,Unnamed: 34_level_24,Unnamed: 35_level_24,Unnamed: 36_level_24,Unnamed: 37_level_24,Unnamed: 38_level_24
Weight?,Feature,Unnamed: 2_level_25,Unnamed: 3_level_25,Unnamed: 4_level_25,Unnamed: 5_level_25,Unnamed: 6_level_25,Unnamed: 7_level_25,Unnamed: 8_level_25,Unnamed: 9_level_25,Unnamed: 10_level_25,Unnamed: 11_level_25,Unnamed: 12_level_25,Unnamed: 13_level_25,Unnamed: 14_level_25,Unnamed: 15_level_25,Unnamed: 16_level_25,Unnamed: 17_level_25,Unnamed: 18_level_25,Unnamed: 19_level_25,Unnamed: 20_level_25,Unnamed: 21_level_25,Unnamed: 22_level_25,Unnamed: 23_level_25,Unnamed: 24_level_25,Unnamed: 25_level_25,Unnamed: 26_level_25,Unnamed: 27_level_25,Unnamed: 28_level_25,Unnamed: 29_level_25,Unnamed: 30_level_25,Unnamed: 31_level_25,Unnamed: 32_level_25,Unnamed: 33_level_25,Unnamed: 34_level_25,Unnamed: 35_level_25,Unnamed: 36_level_25,Unnamed: 37_level_25,Unnamed: 38_level_25
Weight?,Feature,Unnamed: 2_level_26,Unnamed: 3_level_26,Unnamed: 4_level_26,Unnamed: 5_level_26,Unnamed: 6_level_26,Unnamed: 7_level_26,Unnamed: 8_level_26,Unnamed: 9_level_26,Unnamed: 10_level_26,Unnamed: 11_level_26,Unnamed: 12_level_26,Unnamed: 13_level_26,Unnamed: 14_level_26,Unnamed: 15_level_26,Unnamed: 16_level_26,Unnamed: 17_level_26,Unnamed: 18_level_26,Unnamed: 19_level_26,Unnamed: 20_level_26,Unnamed: 21_level_26,Unnamed: 22_level_26,Unnamed: 23_level_26,Unnamed: 24_level_26,Unnamed: 25_level_26,Unnamed: 26_level_26,Unnamed: 27_level_26,Unnamed: 28_level_26,Unnamed: 29_level_26,Unnamed: 30_level_26,Unnamed: 31_level_26,Unnamed: 32_level_26,Unnamed: 33_level_26,Unnamed: 34_level_26,Unnamed: 35_level_26,Unnamed: 36_level_26,Unnamed: 37_level_26,Unnamed: 38_level_26
Weight?,Feature,Unnamed: 2_level_27,Unnamed: 3_level_27,Unnamed: 4_level_27,Unnamed: 5_level_27,Unnamed: 6_level_27,Unnamed: 7_level_27,Unnamed: 8_level_27,Unnamed: 9_level_27,Unnamed: 10_level_27,Unnamed: 11_level_27,Unnamed: 12_level_27,Unnamed: 13_level_27,Unnamed: 14_level_27,Unnamed: 15_level_27,Unnamed: 16_level_27,Unnamed: 17_level_27,Unnamed: 18_level_27,Unnamed: 19_level_27,Unnamed: 20_level_27,Unnamed: 21_level_27,Unnamed: 22_level_27,Unnamed: 23_level_27,Unnamed: 24_level_27,Unnamed: 25_level_27,Unnamed: 26_level_27,Unnamed: 27_level_27,Unnamed: 28_level_27,Unnamed: 29_level_27,Unnamed: 30_level_27,Unnamed: 31_level_27,Unnamed: 32_level_27,Unnamed: 33_level_27,Unnamed: 34_level_27,Unnamed: 35_level_27,Unnamed: 36_level_27,Unnamed: 37_level_27,Unnamed: 38_level_27
Weight?,Feature,Unnamed: 2_level_28,Unnamed: 3_level_28,Unnamed: 4_level_28,Unnamed: 5_level_28,Unnamed: 6_level_28,Unnamed: 7_level_28,Unnamed: 8_level_28,Unnamed: 9_level_28,Unnamed: 10_level_28,Unnamed: 11_level_28,Unnamed: 12_level_28,Unnamed: 13_level_28,Unnamed: 14_level_28,Unnamed: 15_level_28,Unnamed: 16_level_28,Unnamed: 17_level_28,Unnamed: 18_level_28,Unnamed: 19_level_28,Unnamed: 20_level_28,Unnamed: 21_level_28,Unnamed: 22_level_28,Unnamed: 23_level_28,Unnamed: 24_level_28,Unnamed: 25_level_28,Unnamed: 26_level_28,Unnamed: 27_level_28,Unnamed: 28_level_28,Unnamed: 29_level_28,Unnamed: 30_level_28,Unnamed: 31_level_28,Unnamed: 32_level_28,Unnamed: 33_level_28,Unnamed: 34_level_28,Unnamed: 35_level_28,Unnamed: 36_level_28,Unnamed: 37_level_28,Unnamed: 38_level_28
Weight?,Feature,Unnamed: 2_level_29,Unnamed: 3_level_29,Unnamed: 4_level_29,Unnamed: 5_level_29,Unnamed: 6_level_29,Unnamed: 7_level_29,Unnamed: 8_level_29,Unnamed: 9_level_29,Unnamed: 10_level_29,Unnamed: 11_level_29,Unnamed: 12_level_29,Unnamed: 13_level_29,Unnamed: 14_level_29,Unnamed: 15_level_29,Unnamed: 16_level_29,Unnamed: 17_level_29,Unnamed: 18_level_29,Unnamed: 19_level_29,Unnamed: 20_level_29,Unnamed: 21_level_29,Unnamed: 22_level_29,Unnamed: 23_level_29,Unnamed: 24_level_29,Unnamed: 25_level_29,Unnamed: 26_level_29,Unnamed: 27_level_29,Unnamed: 28_level_29,Unnamed: 29_level_29,Unnamed: 30_level_29,Unnamed: 31_level_29,Unnamed: 32_level_29,Unnamed: 33_level_29,Unnamed: 34_level_29,Unnamed: 35_level_29,Unnamed: 36_level_29,Unnamed: 37_level_29,Unnamed: 38_level_29
Weight?,Feature,Unnamed: 2_level_30,Unnamed: 3_level_30,Unnamed: 4_level_30,Unnamed: 5_level_30,Unnamed: 6_level_30,Unnamed: 7_level_30,Unnamed: 8_level_30,Unnamed: 9_level_30,Unnamed: 10_level_30,Unnamed: 11_level_30,Unnamed: 12_level_30,Unnamed: 13_level_30,Unnamed: 14_level_30,Unnamed: 15_level_30,Unnamed: 16_level_30,Unnamed: 17_level_30,Unnamed: 18_level_30,Unnamed: 19_level_30,Unnamed: 20_level_30,Unnamed: 21_level_30,Unnamed: 22_level_30,Unnamed: 23_level_30,Unnamed: 24_level_30,Unnamed: 25_level_30,Unnamed: 26_level_30,Unnamed: 27_level_30,Unnamed: 28_level_30,Unnamed: 29_level_30,Unnamed: 30_level_30,Unnamed: 31_level_30,Unnamed: 32_level_30,Unnamed: 33_level_30,Unnamed: 34_level_30,Unnamed: 35_level_30,Unnamed: 36_level_30,Unnamed: 37_level_30,Unnamed: 38_level_30
Weight?,Feature,Unnamed: 2_level_31,Unnamed: 3_level_31,Unnamed: 4_level_31,Unnamed: 5_level_31,Unnamed: 6_level_31,Unnamed: 7_level_31,Unnamed: 8_level_31,Unnamed: 9_level_31,Unnamed: 10_level_31,Unnamed: 11_level_31,Unnamed: 12_level_31,Unnamed: 13_level_31,Unnamed: 14_level_31,Unnamed: 15_level_31,Unnamed: 16_level_31,Unnamed: 17_level_31,Unnamed: 18_level_31,Unnamed: 19_level_31,Unnamed: 20_level_31,Unnamed: 21_level_31,Unnamed: 22_level_31,Unnamed: 23_level_31,Unnamed: 24_level_31,Unnamed: 25_level_31,Unnamed: 26_level_31,Unnamed: 27_level_31,Unnamed: 28_level_31,Unnamed: 29_level_31,Unnamed: 30_level_31,Unnamed: 31_level_31,Unnamed: 32_level_31,Unnamed: 33_level_31,Unnamed: 34_level_31,Unnamed: 35_level_31,Unnamed: 36_level_31,Unnamed: 37_level_31,Unnamed: 38_level_31
Weight?,Feature,Unnamed: 2_level_32,Unnamed: 3_level_32,Unnamed: 4_level_32,Unnamed: 5_level_32,Unnamed: 6_level_32,Unnamed: 7_level_32,Unnamed: 8_level_32,Unnamed: 9_level_32,Unnamed: 10_level_32,Unnamed: 11_level_32,Unnamed: 12_level_32,Unnamed: 13_level_32,Unnamed: 14_level_32,Unnamed: 15_level_32,Unnamed: 16_level_32,Unnamed: 17_level_32,Unnamed: 18_level_32,Unnamed: 19_level_32,Unnamed: 20_level_32,Unnamed: 21_level_32,Unnamed: 22_level_32,Unnamed: 23_level_32,Unnamed: 24_level_32,Unnamed: 25_level_32,Unnamed: 26_level_32,Unnamed: 27_level_32,Unnamed: 28_level_32,Unnamed: 29_level_32,Unnamed: 30_level_32,Unnamed: 31_level_32,Unnamed: 32_level_32,Unnamed: 33_level_32,Unnamed: 34_level_32,Unnamed: 35_level_32,Unnamed: 36_level_32,Unnamed: 37_level_32,Unnamed: 38_level_32
Weight?,Feature,Unnamed: 2_level_33,Unnamed: 3_level_33,Unnamed: 4_level_33,Unnamed: 5_level_33,Unnamed: 6_level_33,Unnamed: 7_level_33,Unnamed: 8_level_33,Unnamed: 9_level_33,Unnamed: 10_level_33,Unnamed: 11_level_33,Unnamed: 12_level_33,Unnamed: 13_level_33,Unnamed: 14_level_33,Unnamed: 15_level_33,Unnamed: 16_level_33,Unnamed: 17_level_33,Unnamed: 18_level_33,Unnamed: 19_level_33,Unnamed: 20_level_33,Unnamed: 21_level_33,Unnamed: 22_level_33,Unnamed: 23_level_33,Unnamed: 24_level_33,Unnamed: 25_level_33,Unnamed: 26_level_33,Unnamed: 27_level_33,Unnamed: 28_level_33,Unnamed: 29_level_33,Unnamed: 30_level_33,Unnamed: 31_level_33,Unnamed: 32_level_33,Unnamed: 33_level_33,Unnamed: 34_level_33,Unnamed: 35_level_33,Unnamed: 36_level_33,Unnamed: 37_level_33,Unnamed: 38_level_33
Weight?,Feature,Unnamed: 2_level_34,Unnamed: 3_level_34,Unnamed: 4_level_34,Unnamed: 5_level_34,Unnamed: 6_level_34,Unnamed: 7_level_34,Unnamed: 8_level_34,Unnamed: 9_level_34,Unnamed: 10_level_34,Unnamed: 11_level_34,Unnamed: 12_level_34,Unnamed: 13_level_34,Unnamed: 14_level_34,Unnamed: 15_level_34,Unnamed: 16_level_34,Unnamed: 17_level_34,Unnamed: 18_level_34,Unnamed: 19_level_34,Unnamed: 20_level_34,Unnamed: 21_level_34,Unnamed: 22_level_34,Unnamed: 23_level_34,Unnamed: 24_level_34,Unnamed: 25_level_34,Unnamed: 26_level_34,Unnamed: 27_level_34,Unnamed: 28_level_34,Unnamed: 29_level_34,Unnamed: 30_level_34,Unnamed: 31_level_34,Unnamed: 32_level_34,Unnamed: 33_level_34,Unnamed: 34_level_34,Unnamed: 35_level_34,Unnamed: 36_level_34,Unnamed: 37_level_34,Unnamed: 38_level_34
Weight?,Feature,Unnamed: 2_level_35,Unnamed: 3_level_35,Unnamed: 4_level_35,Unnamed: 5_level_35,Unnamed: 6_level_35,Unnamed: 7_level_35,Unnamed: 8_level_35,Unnamed: 9_level_35,Unnamed: 10_level_35,Unnamed: 11_level_35,Unnamed: 12_level_35,Unnamed: 13_level_35,Unnamed: 14_level_35,Unnamed: 15_level_35,Unnamed: 16_level_35,Unnamed: 17_level_35,Unnamed: 18_level_35,Unnamed: 19_level_35,Unnamed: 20_level_35,Unnamed: 21_level_35,Unnamed: 22_level_35,Unnamed: 23_level_35,Unnamed: 24_level_35,Unnamed: 25_level_35,Unnamed: 26_level_35,Unnamed: 27_level_35,Unnamed: 28_level_35,Unnamed: 29_level_35,Unnamed: 30_level_35,Unnamed: 31_level_35,Unnamed: 32_level_35,Unnamed: 33_level_35,Unnamed: 34_level_35,Unnamed: 35_level_35,Unnamed: 36_level_35,Unnamed: 37_level_35,Unnamed: 38_level_35
Weight?,Feature,Unnamed: 2_level_36,Unnamed: 3_level_36,Unnamed: 4_level_36,Unnamed: 5_level_36,Unnamed: 6_level_36,Unnamed: 7_level_36,Unnamed: 8_level_36,Unnamed: 9_level_36,Unnamed: 10_level_36,Unnamed: 11_level_36,Unnamed: 12_level_36,Unnamed: 13_level_36,Unnamed: 14_level_36,Unnamed: 15_level_36,Unnamed: 16_level_36,Unnamed: 17_level_36,Unnamed: 18_level_36,Unnamed: 19_level_36,Unnamed: 20_level_36,Unnamed: 21_level_36,Unnamed: 22_level_36,Unnamed: 23_level_36,Unnamed: 24_level_36,Unnamed: 25_level_36,Unnamed: 26_level_36,Unnamed: 27_level_36,Unnamed: 28_level_36,Unnamed: 29_level_36,Unnamed: 30_level_36,Unnamed: 31_level_36,Unnamed: 32_level_36,Unnamed: 33_level_36,Unnamed: 34_level_36,Unnamed: 35_level_36,Unnamed: 36_level_36,Unnamed: 37_level_36,Unnamed: 38_level_36
Weight?,Feature,Unnamed: 2_level_37,Unnamed: 3_level_37,Unnamed: 4_level_37,Unnamed: 5_level_37,Unnamed: 6_level_37,Unnamed: 7_level_37,Unnamed: 8_level_37,Unnamed: 9_level_37,Unnamed: 10_level_37,Unnamed: 11_level_37,Unnamed: 12_level_37,Unnamed: 13_level_37,Unnamed: 14_level_37,Unnamed: 15_level_37,Unnamed: 16_level_37,Unnamed: 17_level_37,Unnamed: 18_level_37,Unnamed: 19_level_37,Unnamed: 20_level_37,Unnamed: 21_level_37,Unnamed: 22_level_37,Unnamed: 23_level_37,Unnamed: 24_level_37,Unnamed: 25_level_37,Unnamed: 26_level_37,Unnamed: 27_level_37,Unnamed: 28_level_37,Unnamed: 29_level_37,Unnamed: 30_level_37,Unnamed: 31_level_37,Unnamed: 32_level_37,Unnamed: 33_level_37,Unnamed: 34_level_37,Unnamed: 35_level_37,Unnamed: 36_level_37,Unnamed: 37_level_37,Unnamed: 38_level_37
Weight?,Feature,Unnamed: 2_level_38,Unnamed: 3_level_38,Unnamed: 4_level_38,Unnamed: 5_level_38,Unnamed: 6_level_38,Unnamed: 7_level_38,Unnamed: 8_level_38,Unnamed: 9_level_38,Unnamed: 10_level_38,Unnamed: 11_level_38,Unnamed: 12_level_38,Unnamed: 13_level_38,Unnamed: 14_level_38,Unnamed: 15_level_38,Unnamed: 16_level_38,Unnamed: 17_level_38,Unnamed: 18_level_38,Unnamed: 19_level_38,Unnamed: 20_level_38,Unnamed: 21_level_38,Unnamed: 22_level_38,Unnamed: 23_level_38,Unnamed: 24_level_38,Unnamed: 25_level_38,Unnamed: 26_level_38,Unnamed: 27_level_38,Unnamed: 28_level_38,Unnamed: 29_level_38,Unnamed: 30_level_38,Unnamed: 31_level_38,Unnamed: 32_level_38,Unnamed: 33_level_38,Unnamed: 34_level_38,Unnamed: 35_level_38,Unnamed: 36_level_38,Unnamed: 37_level_38,Unnamed: 38_level_38
+7.168,суп,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+4.308,фо,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+3.499,бульон,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+3.063,мисо,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+2.760,лапша,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+2.730,том,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+2.466,рамена,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+2.398,яма,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+1.827,бо,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+1.263,морепродукт,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

Weight?,Feature
+7.168,суп
+4.308,фо
+3.499,бульон
+3.063,мисо
+2.760,лапша
+2.730,том
+2.466,рамена
+2.398,яма
+1.827,бо
+1.263,морепродукт

Weight?,Feature
+20.520,блин
+14.202,блинчик
+8.120,блинный
+5.151,варение
+4.585,нутелла
+3.833,сыр
+3.371,фарш
+3.343,сырник
+3.334,начинка
+3.330,джем

Weight?,Feature
+16.007,бургер
+10.772,burger
+9.563,чикенбургер
+8.483,чизбургер
+8.467,булочка
+8.458,булка
+7.980,гамбургер
+7.762,котлета
+7.663,сэндвич
+7.287,фишбургер

Weight?,Feature
+11.050,салат
+10.150,ролл
+9.093,рис
+7.841,панир
+7.615,постный
+7.217,нори
+6.955,вареник
+6.776,суп
+5.614,индийский
+5.395,гречневый

Weight?,Feature
+13.846,рис
+10.131,тяхан
+9.802,wok
+9.580,лапша
+8.695,вок
+8.279,сао
+7.717,соус
+7.554,топпинг
+7.506,ком
+7.041,харусама

Weight?,Feature
+13.422,рис
+8.694,спагетти
+8.616,каша
+7.728,макароны
+7.595,картофель
+5.611,перловка
+5.560,гречка
+5.520,греча
+5.437,лапша
+5.260,панировка

Weight?,Feature
+18.037,драник
+4.122,сметана
+2.274,со
+1.706,подаваться
+1.282,гриб
+0.967,картофельный
+0.891,репка
+0.663,шкварка
+0.567,паштет
+0.552,творожный

Weight?,Feature
+8.312,блин
+8.194,кутаба
+8.005,пхать
+7.247,гренок
+6.927,брускетт
+6.910,спринг
+6.779,хинкали
+6.671,разносол
+6.594,редис
+6.450,сэндвич

Weight?,Feature
+17.201,каша
+6.972,овсяный
+6.710,молоко
+6.404,крупа
+3.956,рисовый
+3.474,хлопья
+2.318,банан
+2.304,овсянка
+2.214,пшенный
+2.198,гречка

Weight?,Feature
+11.939,эспрессо
+11.833,кофе
+11.571,раф
+10.095,латт
+8.480,капучино
+8.321,напиток
+7.077,мл
+6.865,американо
+6.214,вода
+5.918,молоко

Weight?,Feature
+21.210,мороженое
+13.604,пломбир
+13.184,сорбеты
+11.002,мороженый
+8.340,эскимо
+7.333,натуральный
+7.094,сахар
+6.516,магнат
+6.229,шарик
+5.725,вкус

Weight?,Feature
+8.009,стейк
+6.843,котлета
+6.400,баранина
+6.288,свиной
+6.215,бараний
+6.079,ягненок
+6.029,бефстроганов
+5.884,цыпленок
+5.777,говяжий
+5.624,медальон

Weight?,Feature
+7.474,омлет
+5.091,яичница
+4.526,яйцо
+2.207,ветчина
+1.787,бекон
+1.698,зелень
+1.571,глазунья
+1.354,соль
+1.235,молоко
+1.080,помидор

Weight?,Feature
+14.585,тальятелла
+12.152,равиоли
+11.905,паста
+11.627,спагетти
+11.064,лингвинить
+10.802,ньокк
+10.738,пенне
+10.717,ризотто
+10.406,паппарделла
+10.343,лазание

Weight?,Feature
+14.459,вареник
+14.022,пельмень
+8.663,тесто
+6.081,шт
+5.081,пельмешек
+4.622,специя
+4.035,равиоли
+3.995,порция
+3.968,соль
+3.802,творог

Weight?,Feature
+17.674,пирог
+10.310,песочный
+8.852,см
+6.828,пирожок
+6.383,сыр
+5.829,киш
+5.448,начинка
+5.403,кулебяка
+5.360,тесто
+5.237,сдобный

Weight?,Feature
+13.449,тесто
+13.269,пирожок
+6.740,сдобный
+5.784,яйцо
+4.989,дрожжевой
+4.831,сосиска
+4.662,начинка
+4.544,сахар
+4.175,дрожжи
+3.946,творог

Weight?,Feature
+20.016,пицца
+13.562,моцарелл
+13.381,см
+8.725,кальцона
+6.720,фокачча
+6.701,тесто
+5.375,корж
+5.189,лосось
+5.024,принц
+4.928,римский

Weight?,Feature
+15.878,плов
+4.091,рис
+3.960,морковь
+3.843,баранина
+3.720,узбекский
+3.407,чеснок
+3.112,нут
+3.087,лазер
+2.533,изюм
+2.505,барбарис

Weight?,Feature
+17.663,боул
+13.707,пока
+10.404,салат
+6.651,рис
+6.355,кукуруза
+6.034,черри
+5.552,капуста
+5.284,эдамам
+5.274,редис
+5.201,чук

Weight?,Feature
+21.483,пончик
+17.761,донат
+9.423,глазурь
+8.296,начинка
+4.988,сахарный
+4.867,шоколадный
+4.522,пудра
+3.785,шоколад
+3.630,посыпка
+3.338,высокий

Weight?,Feature
+14.470,смузи
+13.493,компот
+12.817,лимонад
+11.503,напиток
+11.347,морс
+10.778,милкшейк
+10.524,кофе
+10.039,чай
+10.019,безалкогольный
+9.985,квас

Weight?,Feature
+2.096,суп
+0.981,гималайский
+0.594,розовый
+0.568,грибной
+0.567,пюре
+0.554,крем
+0.534,петрушка
+0.441,тыквенный
+0.381,соль
+0.375,кокосовый

Weight?,Feature
+16.980,донер
+14.890,бурритый
+13.153,гирос
+7.050,соус
+6.614,пит
+6.441,лаваш
+6.276,тортилья
+5.683,гиро
+5.418,дзадзики
+4.860,шаурма

Weight?,Feature
+9.701,ккал
+7.242,шт
+6.017,мл
+5.961,мак
+5.636,пельмень
+5.470,панчана
+4.810,крылышко
+4.809,ножка
+4.355,филе
+4.333,крыло

Weight?,Feature
+13.224,рак
+11.196,устрица
+8.692,мидия
+8.383,дорадо
+7.556,треск
+7.220,сибас
+7.043,магаданский
+6.864,стейк
+6.410,форель
+6.276,креветка

Weight?,Feature
+22.342,салат
+9.575,боул
+8.501,хе
+8.498,оливье
+7.241,салатный
+6.877,винегрет
+6.682,сарад
+6.409,теплый
+6.360,гренок
+6.177,майонез

Weight?,Feature
+17.490,сэндвич
+9.702,круассан
+9.478,панини
+8.000,багет
+7.520,чиабатт
+6.942,ролл
+6.415,хлеб
+5.960,пит
+5.923,тост
+5.716,бейгл

Weight?,Feature
+27.625,сет
+16.661,набор
+16.643,комбо
+15.926,ролл
+14.107,пицца
+13.136,напиток
+12.485,см
+9.770,сэт
+9.489,суша
+9.133,гункан

Weight?,Feature
+14.436,соус
+8.031,майонез
+6.827,топпинг
+6.111,ткемали
+5.844,сацебель
+5.709,хрен
+5.704,для
+5.636,варение
+5.291,аджика
+5.235,heinz

Weight?,Feature
+12.370,стейк
+7.459,подаваться
+6.926,филе
+6.816,часть
+6.501,вырезка
+6.077,отруб
+5.901,свиной
+5.404,откорм
+5.331,миньон
+5.282,гарнир

Weight?,Feature
+17.437,ролл
+10.031,нори
+8.686,суша
+8.309,сашимь
+7.864,сет
+7.649,гункан
+7.081,огурец
+6.585,пицца
+6.548,рис
+5.809,нигирь

Weight?,Feature
+15.826,сырник
+7.546,творог
+5.186,мука
+4.353,запеканка
+3.565,рисовый
+3.394,сахар
+3.222,яйцо
+2.677,сметана
+1.411,сливки
+1.381,брокколи

Weight?,Feature
+16.652,торт
+14.798,пирожный
+13.581,бисквит
+10.826,чизкейк
+7.971,песочный
+7.792,корж
+7.451,эклер
+5.977,медовик
+5.297,безе
+5.168,десерт

Weight?,Feature
+10.065,хачапури
+8.599,кубдари
+7.458,пирог
+6.806,грузинский
+6.275,лобиани
+6.088,сыр
+5.092,лаваш
+5.040,лепешка
+4.878,тесто
+4.422,пеновани

Weight?,Feature
+15.475,хинкали
+7.331,сулугуни
+6.810,тесто
+6.057,шт
+4.803,кинза
+4.679,кверь
+4.344,фарш
+3.946,специя
+3.908,сыр
+3.450,начинка

Weight?,Feature
+11.806,булочка
+10.674,батон
+10.611,багет
+9.093,чиабатт
+8.985,хлеб
+7.892,фокачча
+6.674,закваска
+5.808,лаваш
+5.359,хлебный
+5.143,тесто

Weight?,Feature
+16.783,шаурма
+11.791,шаверма
+8.210,лаваш
+7.817,донер
+6.231,огурец
+5.490,шашлык
+5.346,пита
+5.211,гирос
+5.107,пит
+4.957,кебаб

Weight?,Feature
+10.166,шашлык
+7.813,шашлычок
+6.589,дорадо
+6.212,стейк
+5.802,перепелка
+5.731,крыло
+5.679,сибас
+5.564,крылышко
+5.451,ребрышко
+5.437,мангал


# LogReg fasttext

In [None]:
NUM_LABELS = 39

In [None]:
!pip install gensim -U

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 2.8 kB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [None]:
import gensim
import numpy as np
import torch
from torch import nn

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']
data_c['X_len'] = data_c['X'].apply(lambda x: len([w for w in x.split(' ') if w != '']))
data_c = data_c.loc[data_c['X_len'] > 0]

scaler = StandardScaler()
data_c['price'] = scaler.fit_transform(data_c['price'].values.reshape(-1,1))

enc = OrdinalEncoder()
y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))
data_c['y'] = y
data_c = data_c.astype({'y':int})

In [None]:
word_to_idx = dict()
c = 0

for row in data_c['X']:
    for word in row.split():
        if word not in word_to_idx:
            word_to_idx[word] = c
            c += 1

idx_to_word = {v: k for k, v in word_to_idx.items()}

In [None]:
data_c['X_idx'] = data_c['X'].apply(lambda x: [word_to_idx[word] for word in x.split(' ') if word != ''])

In [None]:
ft_model = gensim.models.fasttext.FastTextKeyedVectors.load('Fasttext/model.model')

In [None]:
vectors = [0] * len(idx_to_word)

for idx, word in idx_to_word.items():
    vectors[idx] = ft_model.get_vector(word)

In [None]:
PAD_VEC = np.array([0.0 for _ in vectors[0]])
PAD_IDX = len(vectors)
vectors.append(PAD_VEC)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class CustomTextDataset(Dataset):
    def __init__(self, price, text, seq_len, labels):
        self.labels = labels
        self.text = text
        self.price = price
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        price = self.price[idx]
        seq_len = self.seq_len[idx]
        return price, text, seq_len, label

def collate_fn(data):
    label_list, price_list, text_list, len_list = list(), list(), list(), list()
    for price, text, seq_len, label in data:
        label_list.append(label)
        price_list.append(price)
        text_list.append(torch.LongTensor(text))
        len_list.append(seq_len)

    label = torch.LongTensor(label_list)
    price = torch.FloatTensor(price_list)
    text = pad_sequence(text_list, padding_value=PAD_IDX, batch_first=True)
    seq_len = torch.LongTensor(len_list)
    return price, text, seq_len, label

train_dataset = CustomTextDataset(X_train['price'].values, X_train['X_idx'].values, X_train['X_len'].values, y_train.values)
test_dataset = CustomTextDataset(X_test['price'].values, X_test['X_idx'].values, X_test['X_len'].values, y_test.values)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=5000, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=5000, shuffle=True, collate_fn=collate_fn)

In [None]:
class TextModel(nn.Module):
    def __init__(self):
        super(TextModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze = True)
        self.fc = nn.Linear(300 + 1, NUM_LABELS)
        torch.nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, price, text, seq_len):
        embedded = self.embedding(text) # batch_len, seq_len, emb_shape
        input = torch.sum(embedded, dim = 1) # batch_len, emb_shape
        input = torch.div(input.permute([1, 0]), seq_len).permute([1, 0]) # batch_len, emb_shape
        input_price = torch.cat((input, price.unsqueeze(1)), dim = 1) #batch_len, emb_shape + 1
        input = self.fc(input_price) # batch_len, NUM_LABELS
        return input

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextModel().to(device)
#for price, text, seq_len, label in train_dataloader:
    #model(price, text, seq_len)
    #break

In [None]:
import torch.optim as optim

class_sample_count = np.unique(data_c['y'].values, return_counts=True)[1]
weight = torch.FloatTensor(1. / class_sample_count)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(model, device):
    for epoch in range(20):
        print(f"Epoch {epoch}")
        with torch.no_grad():
            eval_loss = list()
            for price, text, seq_len, label in test_dataloader:
                price = price.to(device)
                text = text.to(device)
                seq_len = seq_len.to(device)

                out = model(price, text, seq_len)
                loss = criterion(out.cpu(), label)
                eval_loss.append(loss.item())
            print(f"Eval loss: {np.mean(eval_loss)}")

        epoch_loss = list()
        for price, text, seq_len, label in train_dataloader:
            price = price.to(device)
            text = text.to(device)
            seq_len = seq_len.to(device)

            optimizer.zero_grad()
            out = model(price, text, seq_len)

            #calculate the loss
            loss = criterion(out.cpu(), label)
            
            #track batch loss
            epoch_loss.append(loss.item())
            
            #backpropagation
            loss.backward()
            
            #update the parameters
            optimizer.step()
        print(f"Train loss: {np.mean(epoch_loss)}")

train(model, device)

Epoch 0
Eval loss: 3.610125968622607
Train loss: 2.2846504478059577
Epoch 1
Eval loss: 1.781737626985062
Train loss: 1.5497588088526528
Epoch 2
Eval loss: 1.362334545268569
Train loss: 1.2358828056493454
Epoch 3
Eval loss: 1.1277903484743694
Train loss: 1.047444889531333
Epoch 4
Eval loss: 0.9772300512291664
Train loss: 0.9210676758952395
Epoch 5
Eval loss: 0.8711702102838561
Train loss: 0.8311938520013933
Epoch 6
Eval loss: 0.7933100087698116
Train loss: 0.7638418275929062
Epoch 7
Eval loss: 0.7348279190617938
Train loss: 0.7123704298713504
Epoch 8
Eval loss: 0.6885601947473925
Train loss: 0.6713577142128577
Epoch 9
Eval loss: 0.6535301125326822
Train loss: 0.6381299199437248
Epoch 10
Eval loss: 0.6219314946684726
Train loss: 0.6108163870297946
Epoch 11
Eval loss: 0.5981995435648186
Train loss: 0.5875728638214472
Epoch 12
Eval loss: 0.576340280299963
Train loss: 0.5679779451274307
Epoch 13
Eval loss: 0.5576027520867282
Train loss: 0.5507770688576106
Epoch 14
Eval loss: 0.5431063785109

In [None]:
with torch.no_grad():
    y_pred = list()
    y_true = list()
    for price, text, seq_len, label in test_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)

        out = model(price, text, seq_len)
        y_pred.extend(torch.argmax(out.cpu(), dim = 1).tolist())
        y_true.extend(label)

In [None]:
from sklearn.metrics import classification_report

report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# report no weights
report.astype({'support': int})

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.0,0.0,0.0,18
блины и оладьи,0.885508,0.784954,0.832205,1409
бургер,0.891417,0.888906,0.89016,6391
вегетарианская еда,0.540541,0.043431,0.080402,921
вок и лапша,0.866203,0.902314,0.88389,5661
гарниры,0.770196,0.778763,0.774456,5754
драники,0.0,0.0,0.0,29
закуски,0.531379,0.363186,0.431471,4243
каша и гранола,0.833333,0.076923,0.140845,65
кофе,0.357143,0.032538,0.059642,461


In [None]:
# report with weights
report.astype({'support': int})

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.045455,0.777778,0.08589,18
блины и оладьи,0.863144,0.904187,0.883189,1409
бургер,0.914561,0.906118,0.91032,6391
вегетарианская еда,0.08847,0.397394,0.144721,921
вок и лапша,0.874764,0.899488,0.886953,5661
гарниры,0.842215,0.703163,0.766433,5754
драники,0.333333,0.896552,0.485981,29
закуски,0.463058,0.381098,0.4181,4243
каша и гранола,0.197719,0.8,0.317073,65
кофе,0.162605,0.882863,0.274629,461


# LSTM

In [None]:
!pip install gensim -U -q

[K     |████████████████████████████████| 24.1 MB 2.8 kB/s 
[?25h

In [None]:
import gensim
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
NUM_LABELS = 39

In [None]:
def preprocess_for_nn(data):
    data_c = data.copy()
    data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']
    data_c['X_len'] = data_c['X'].apply(lambda x: len([w for w in x.split(' ') if w != '']))
    data_c = data_c.loc[data_c['X_len'] > 0]

    scaler = StandardScaler()
    data_c['price'] = scaler.fit_transform(data_c['price'].values.reshape(-1,1))

    enc = OrdinalEncoder()
    y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))
    data_c['y'] = y
    data_c = data_c.astype({'y':int})
    return data_c, enc

def get_vectors(data):
    data_c = data.copy()
    word_to_idx = dict()
    c = 0

    for row in data_c['X']:
        for word in row.split():
            if word not in word_to_idx:
                word_to_idx[word] = c
                c += 1

    idx_to_word = {v: k for k, v in word_to_idx.items()}

    data_c['X_idx'] = data_c['X'].apply(lambda x: [word_to_idx[word] for word in x.split(' ') if word != ''])

    ft_model = gensim.models.fasttext.FastTextKeyedVectors.load('Fasttext/model.model')

    vectors = [0] * len(idx_to_word)

    for idx, word in idx_to_word.items():
        vectors[idx] = ft_model.get_vector(word)

    PAD_VEC = np.array([0.0 for _ in vectors[0]])
    PAD_IDX = len(vectors)
    vectors.append(PAD_VEC)
    return data_c, word_to_idx, idx_to_word, vectors, PAD_IDX

data_c, enc = preprocess_for_nn(data_c)
data_c, word_to_idx, idx_to_word, vectors, PAD_IDX = get_vectors(data_c)
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
data_c = data_c.loc[data_c.X_len<40]
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
class CustomTextDataset(Dataset):
    def __init__(self, price, text, seq_len, labels):
        self.labels = labels
        self.text = text
        self.price = price
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        price = self.price[idx]
        seq_len = self.seq_len[idx]
        return price, text, seq_len, label

def collate_fn(data):
    label_list, price_list, text_list, len_list = list(), list(), list(), list()
    for price, text, seq_len, label in data:
        label_list.append(label)
        price_list.append(price)
        text_list.append(torch.LongTensor(text))
        len_list.append(seq_len)

    label = torch.LongTensor(label_list)
    price = torch.FloatTensor(price_list)
    text = pad_sequence(text_list, padding_value=PAD_IDX, batch_first=True)
    seq_len = torch.LongTensor(len_list)
    return price, text, seq_len, label

train_dataset = CustomTextDataset(X_train['price'].values, X_train['X_idx'].values, X_train['X_len'].values, y_train.values)
test_dataset = CustomTextDataset(X_test['price'].values, X_test['X_idx'].values, X_test['X_len'].values, y_test.values)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)

In [None]:
class TextModel(nn.Module):
    def __init__(self):
        super(TextModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze = True)
        self.rnn = nn.LSTM(input_size=300, hidden_size=300, batch_first = True, bidirectional = True)
        self.fc1 = nn.Linear(600 + 1, 256)
        self.fc2 = nn.Linear(256, NUM_LABELS)

    def forward(self, price, text, seq_len):
        embedded = self.embedding(text) # batch_len, seq_len, emb_shape
        output, (hidden, cell) = self.rnn(embedded)
        output = torch.cat([hidden[0,:, :], hidden[1,:,:]], dim=1) #batch_len, emb_shape*2
        output = torch.cat((output, price.unsqueeze(1)), dim = 1) #batch_len, emb_shape*2+1
        output = nn.functional.relu(self.fc1(output))
        output = self.fc2(output)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextModel().to(device)
#with torch.no_grad():
#    for price, text, seq_len, label in train_dataloader:
#        model(price, text, seq_len)
#        break

In [None]:
import torch.optim as optim

class_sample_count = np.unique(data_c['y'].values, return_counts=True)[1]
weight = torch.FloatTensor(1. / class_sample_count)
weight = np.log(weight*1000000)/100

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [None]:
model.load_state_dict(torch.load('LSTM_10_model_unfreeze_emb'), strict=False)

<All keys matched successfully>

In [None]:
def train(model, device):
    eval_loss_max = 0.16240031869641922
    for epoch in range(10):
        print(f"Epoch {epoch}")
        epoch_loss = list()
        for i, (price, text, seq_len, label) in enumerate(train_dataloader):

            if i % 400 == 0:
                with torch.no_grad():
                    eval_loss = list()
                    for price, text, seq_len, label in test_dataloader:
                        price = price.to(device)
                        text = text.to(device)
                        seq_len = seq_len.to(device)

                        out = model(price, text, seq_len)
                        loss = criterion(out.cpu(), label)
                        eval_loss.append(loss.item())
                    mean_loss = np.mean(eval_loss)
                    if eval_loss_max > mean_loss:
                        eval_loss_max = mean_loss
                        torch.save(model.state_dict(), './LSTM_10_model_unfreeze_emb')
                    print(f"Eval loss: {np.mean(eval_loss)}")
            
            price = price.to(device)
            text = text.to(device)
            seq_len = seq_len.to(device)

            optimizer.zero_grad()
            out = model(price, text, seq_len)

            #calculate the loss
            loss = criterion(out.cpu(), label)
            
            #track batch loss
            epoch_loss.append(loss.item())
            
            #backpropagation
            loss.backward()
            
            #update the parameters
            optimizer.step()
        print(f"Train loss: {np.mean(epoch_loss)}")

train(model, device)

In [None]:
# 4 эпохи + разморозка ембеддингов и уменьшение lr в 2 раза
# 10 words
with torch.no_grad():
    y_pred = list()
    y_true = list()
    for price, text, seq_len, label in test_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)

        out = model(price, text, seq_len)
        y_pred.extend(torch.argmax(out.cpu(), dim = 1).tolist())
        y_true.extend(label)

report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
report.astype({'support': int})

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.631579,0.705882,0.666667,17
блины и оладьи,0.948252,0.969264,0.958643,1399
бургер,0.965189,0.954422,0.959776,6275
вегетарианская еда,0.733154,0.301887,0.427673,901
вок и лапша,0.932732,0.960886,0.9466,5599
гарниры,0.871887,0.887635,0.87969,5758
драники,0.764706,0.448276,0.565217,29
закуски,0.736939,0.75313,0.744947,4233
каша и гранола,0.76,0.876923,0.814286,65
кофе,0.446301,0.40564,0.425,461


In [None]:
# 30 words
with torch.no_grad():
    y_pred = list()
    y_true = list()
    for price, text, seq_len, label in test_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)

        out = model(price, text, seq_len)
        y_pred.extend(torch.argmax(out.cpu(), dim = 1).tolist())
        y_true.extend(label)

report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
report.astype({'support': int})

Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.25,0.125,0.166667,16
блины и оладьи,0.943732,0.95945,0.951526,1381
бургер,0.973717,0.943428,0.958333,5851
вегетарианская еда,0.580897,0.342923,0.431259,869
вок и лапша,0.934794,0.958403,0.946451,5385
гарниры,0.883989,0.875768,0.879859,5699
драники,0.638889,0.793103,0.707692,29
закуски,0.721296,0.740955,0.730993,4146
каша и гранола,0.771429,0.84375,0.80597,64
кофе,0.435252,0.274376,0.336579,441


# GRU

In [None]:
!pip install gensim -U -q

[K     |████████████████████████████████| 24.1 MB 2.8 kB/s 
[?25h

In [None]:
import gensim
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
NUM_LABELS = 39

In [None]:
def preprocess_for_nn(data):
    data_c = data.copy()
    data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']
    data_c['X_len'] = data_c['X'].apply(lambda x: len([w for w in x.split(' ') if w != '']))
    data_c = data_c.loc[data_c['X_len'] > 0]

    scaler = StandardScaler()
    data_c['price'] = scaler.fit_transform(data_c['price'].values.reshape(-1,1))

    enc = OrdinalEncoder()
    y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))
    data_c['y'] = y
    data_c = data_c.astype({'y':int})
    return data_c, enc

def get_vectors(data):
    data_c = data.copy()
    word_to_idx = dict()
    c = 0

    for row in data_c['X']:
        for word in row.split():
            if word not in word_to_idx:
                word_to_idx[word] = c
                c += 1

    idx_to_word = {v: k for k, v in word_to_idx.items()}

    data_c['X_idx'] = data_c['X'].apply(lambda x: [word_to_idx[word] for word in x.split(' ') if word != ''])

    ft_model = gensim.models.fasttext.FastTextKeyedVectors.load('Fasttext/model.model')

    vectors = [0] * len(idx_to_word)

    for idx, word in idx_to_word.items():
        vectors[idx] = ft_model.get_vector(word)

    PAD_VEC = np.array([0.0 for _ in vectors[0]])
    PAD_IDX = len(vectors)
    vectors.append(PAD_VEC)
    return data_c, word_to_idx, idx_to_word, vectors, PAD_IDX

data_c, enc = preprocess_for_nn(data_c)
data_c, word_to_idx, idx_to_word, vectors, PAD_IDX = get_vectors(data_c)
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
data_c = data_c.loc[data_c.X_len<30]
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
class CustomTextDataset(Dataset):
    def __init__(self, price, text, seq_len, labels):
        self.labels = labels
        self.text = text
        self.price = price
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        price = self.price[idx]
        seq_len = self.seq_len[idx]
        return price, text, seq_len, label

def collate_fn(data):
    label_list, price_list, text_list, len_list = list(), list(), list(), list()
    for price, text, seq_len, label in data:
        label_list.append(label)
        price_list.append(price)
        text_list.append(torch.LongTensor(text))
        len_list.append(seq_len)

    label = torch.LongTensor(label_list)
    price = torch.FloatTensor(price_list)
    text = pad_sequence(text_list, padding_value=PAD_IDX, batch_first=True)
    seq_len = torch.LongTensor(len_list)
    return price, text, seq_len, label

train_dataset = CustomTextDataset(X_train['price'].values, X_train['X_idx'].values, X_train['X_len'].values, y_train.values)
test_dataset = CustomTextDataset(X_test['price'].values, X_test['X_idx'].values, X_test['X_len'].values, y_test.values)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)

In [None]:
class TextModel(nn.Module):
    def __init__(self):
        super(TextModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze = True)
        self.rnn = nn.GRU(input_size=300, hidden_size=300, batch_first = True, bidirectional = True)
        self.fc1 = nn.Linear(600 + 1, 256)
        self.fc2 = nn.Linear(256, NUM_LABELS)

    def forward(self, price, text, seq_len):
        embedded = self.embedding(text) # batch_len, seq_len, emb_shape
        output, hidden = self.rnn(embedded)
        output = torch.cat([hidden[0,:, :], hidden[1,:,:]], dim=1) #batch_len, emb_shape*2
        output = torch.cat((output, price.unsqueeze(1)), dim = 1) #batch_len, emb_shape*2+1
        output = nn.functional.relu(self.fc1(output))
        output = self.fc2(output)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextModel().to(device)
with torch.no_grad():
    for price, text, seq_len, label in train_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)
        model(price, text, seq_len)
        break

In [None]:
import torch.optim as optim

class_sample_count = np.unique(data_c['y'].values, return_counts=True)[1]
weight = torch.FloatTensor(1. / class_sample_count)
weight = np.log(weight*1000000)/100

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(model, device):
    eval_loss_max = 0.2
    for epoch in range(10):
        print(f"Epoch {epoch}")
        epoch_loss = list()
        for i, (price, text, seq_len, label) in enumerate(train_dataloader):

            if i % 400 == 0:
                with torch.no_grad():
                    eval_loss = list()
                    for price, text, seq_len, label in test_dataloader:
                        price = price.to(device)
                        text = text.to(device)
                        seq_len = seq_len.to(device)

                        out = model(price, text, seq_len)
                        loss = criterion(out.cpu(), label)
                        eval_loss.append(loss.item())
                    mean_loss = np.mean(eval_loss)
                    if eval_loss_max > mean_loss:
                        eval_loss_max = mean_loss
                        torch.save(model.state_dict(), './GRU_30')
                    print(f"Eval loss: {np.mean(eval_loss)}")
            
            price = price.to(device)
            text = text.to(device)
            seq_len = seq_len.to(device)

            optimizer.zero_grad()
            out = model(price, text, seq_len)

            #calculate the loss
            loss = criterion(out.cpu(), label)
            
            #track batch loss
            epoch_loss.append(loss.item())
            
            #backpropagation
            loss.backward()
            
            #update the parameters
            optimizer.step()
        print(f"Train loss: {np.mean(epoch_loss)}")

train(model, device)

Epoch 0
Eval loss: 3.6792612262343427
Eval loss: 0.29152624429292373
Eval loss: 0.25422492306360694
Eval loss: 0.22982598502361978
Eval loss: 0.21928413889956067
Train loss: 0.3303072519253853
Epoch 1
Eval loss: 0.21797809611395694
Eval loss: 0.2109480094355884
Eval loss: 0.20165191578034375
Eval loss: 0.19781058589520256
Eval loss: 0.1923676671273551
Train loss: 0.20021503043419145
Epoch 2
Eval loss: 0.19644661577450326
Eval loss: 0.19032416851567172
Eval loss: 0.18700966213472314
Eval loss: 0.18843786939432103
Eval loss: 0.18103033888966646
Train loss: 0.17593201958874155
Epoch 3
Eval loss: 0.18234120609926419
Eval loss: 0.18081746627474182
Eval loss: 0.17618767119270082
Eval loss: 0.1775358840458842
Eval loss: 0.1747961846933971
Train loss: 0.1558462228247556
Epoch 4
Eval loss: 0.17387965064687075
Eval loss: 0.17856438893342078
Eval loss: 0.1767528853345034
Eval loss: 0.1750946183914369
Eval loss: 0.17155427551284105
Train loss: 0.13736325078569764
Epoch 5
Eval loss: 0.1722059706021

KeyboardInterrupt: ignored

In [None]:
# 30 words GRU
model.load_state_dict(torch.load('GRU_30'))
with torch.no_grad():
    y_pred = list()
    y_true = list()
    for price, text, seq_len, label in test_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)

        out = model(price, text, seq_len)
        y_pred.extend(torch.argmax(out.cpu(), dim = 1).tolist())
        y_true.extend(label)

report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
report.astype({'support': int})

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.666667,0.125,0.210526,16
блины и оладьи,0.963757,0.943519,0.953531,1381
бургер,0.97412,0.94565,0.959674,5851
вегетарианская еда,0.571429,0.382048,0.457931,869
вок и лапша,0.934299,0.958589,0.946288,5385
гарниры,0.891167,0.869275,0.880085,5699
драники,0.625,0.689655,0.655738,29
закуски,0.689961,0.769175,0.727418,4146
каша и гранола,0.840909,0.578125,0.685185,64
кофе,0.778947,0.1678,0.276119,441


# CNN

In [None]:
!pip install gensim -U -q

[K     |████████████████████████████████| 24.1 MB 2.8 kB/s 
[?25h

In [None]:
import gensim
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
NUM_LABELS = 39

In [None]:
def preprocess_for_nn(data):
    data_c = data.copy()
    data_c['X'] = data_c['name_dish'] + ' ' + data_c['product_description']
    data_c['X_len'] = data_c['X'].apply(lambda x: len([w for w in x.split(' ') if w != '']))
    data_c = data_c.loc[data_c['X_len'] > 0]

    scaler = StandardScaler()
    data_c['price'] = scaler.fit_transform(data_c['price'].values.reshape(-1,1))

    enc = OrdinalEncoder()
    y = enc.fit_transform(data_c.tags_menu.values.reshape(-1, 1))
    data_c['y'] = y
    data_c = data_c.astype({'y':int})
    return data_c, enc

def get_vectors(data):
    data_c = data.copy()
    word_to_idx = dict()
    c = 0

    for row in data_c['X']:
        for word in row.split():
            if word not in word_to_idx:
                word_to_idx[word] = c
                c += 1

    idx_to_word = {v: k for k, v in word_to_idx.items()}

    data_c['X_idx'] = data_c['X'].apply(lambda x: [word_to_idx[word] for word in x.split(' ') if word != ''])

    ft_model = gensim.models.fasttext.FastTextKeyedVectors.load('Fasttext/model.model')

    vectors = [0] * len(idx_to_word)

    for idx, word in idx_to_word.items():
        vectors[idx] = ft_model.get_vector(word)

    PAD_VEC = np.array([0.0 for _ in vectors[0]])
    PAD_IDX = len(vectors)
    vectors.append(PAD_VEC)
    return data_c, word_to_idx, idx_to_word, vectors, PAD_IDX

data_c, enc = preprocess_for_nn(data_c)
data_c, word_to_idx, idx_to_word, vectors, PAD_IDX = get_vectors(data_c)
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
data_c = data_c.loc[data_c.X_len<30]
X_train, X_test, y_train, y_test = train_test_split(data_c.drop('y', axis = 1), data_c['y'], test_size=0.2, random_state=42, stratify = data_c['y'])

In [None]:
class CustomTextDataset(Dataset):
    def __init__(self, price, text, seq_len, labels):
        self.labels = labels
        self.text = text
        self.price = price
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        price = self.price[idx]
        seq_len = self.seq_len[idx]
        return price, text, seq_len, label

def collate_fn(data):
    label_list, price_list, text_list, len_list = list(), list(), list(), list()
    for price, text, seq_len, label in data:
        label_list.append(label)
        price_list.append(price)
        text_list.append(torch.LongTensor(text))
        len_list.append(seq_len)

    label = torch.LongTensor(label_list)
    price = torch.FloatTensor(price_list)
    text = pad_sequence(text_list, padding_value=PAD_IDX, batch_first=True)
    seq_len = torch.LongTensor(len_list)
    return price, text, seq_len, label

train_dataset = CustomTextDataset(X_train['price'].values, X_train['X_idx'].values, X_train['X_len'].values, y_train.values)
test_dataset = CustomTextDataset(X_test['price'].values, X_test['X_idx'].values, X_test['X_len'].values, y_test.values)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=500, shuffle=True, collate_fn=collate_fn)

In [None]:
class TextModel(nn.Module):
    def __init__(self):
        super(TextModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze = True)
        self.cnn1 = nn.Conv1d(300, 200, 1,)
        self.cnn2 = nn.Conv1d(300, 200, 2)
        self.cnn3 = nn.Conv1d(300, 200, 3)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(600 + 1, 256)
        self.fc2 = nn.Linear(256, NUM_LABELS)

    def forward(self, price, text, seq_len):
        embedded = self.embedding(text) # batch_len, seq_len, emb_shape
        embedded = embedded.permute(0, 2, 1) # batch_len, emb_shape, seq_len
        output1 = self.pool(self.cnn1(embedded))
        output2 = self.pool(self.cnn2(embedded))
        output3 = self.pool(self.cnn3(embedded)) # batch_len, 200, 1
        output = torch.cat([output1, output2, output3], dim=1).squeeze(-1) #batch_len, emb_shape*2
        output = torch.cat((output, price.unsqueeze(1)), dim = 1) #batch_len, emb_shape*2+1
        output = nn.functional.relu(self.fc1(output))
        output = self.fc2(output)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextModel().to(device)
with torch.no_grad():
    for price, text, seq_len, label in train_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)
        model(price, text, seq_len)
        break

In [None]:
import torch.optim as optim

class_sample_count = np.unique(data_c['y'].values, return_counts=True)[1]
weight = torch.FloatTensor(1. / class_sample_count)
weight = np.log(weight*1000000)/100

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(model, device):
    eval_loss_max = 0.2
    for epoch in range(10):
        print(f"Epoch {epoch}")
        epoch_loss = list()
        for i, (price, text, seq_len, label) in enumerate(train_dataloader):

            if i % 400 == 0:
                with torch.no_grad():
                    eval_loss = list()
                    for price, text, seq_len, label in test_dataloader:
                        price = price.to(device)
                        text = text.to(device)
                        seq_len = seq_len.to(device)

                        out = model(price, text, seq_len)
                        loss = criterion(out.cpu(), label)
                        eval_loss.append(loss.item())
                    mean_loss = np.mean(eval_loss)
                    if eval_loss_max > mean_loss:
                        eval_loss_max = mean_loss
                        torch.save(model.state_dict(), './CNN_30')
                    print(f"Eval loss: {np.mean(eval_loss)}")
            
            price = price.to(device)
            text = text.to(device)
            seq_len = seq_len.to(device)

            optimizer.zero_grad()
            out = model(price, text, seq_len)

            #calculate the loss
            loss = criterion(out.cpu(), label)
            
            #track batch loss
            epoch_loss.append(loss.item())
            
            #backpropagation
            loss.backward()
            
            #update the parameters
            optimizer.step()
        print(f"Train loss: {np.mean(epoch_loss)}")

train(model, device)

In [None]:
# 30 words CNN
model.load_state_dict(torch.load('CNN_30'))
with torch.no_grad():
    y_pred = list()
    y_true = list()
    for price, text, seq_len, label in test_dataloader:
        price = price.to(device)
        text = text.to(device)
        seq_len = seq_len.to(device)

        out = model(price, text, seq_len)
        y_pred.extend(torch.argmax(out.cpu(), dim = 1).tolist())
        y_true.extend(label)

report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True, target_names = enc.categories_[0])).T
report.astype({'support': int})

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
азиатский суп,0.5,0.125,0.2,16
блины и оладьи,0.950072,0.95076,0.950416,1381
бургер,0.9625,0.94753,0.954957,5851
вегетарианская еда,0.560201,0.385501,0.456714,869
вок и лапша,0.924112,0.956546,0.940049,5385
гарниры,0.863207,0.881383,0.8722,5699
драники,0.655172,0.655172,0.655172,29
закуски,0.791148,0.659672,0.719453,4146
каша и гранола,0.891304,0.640625,0.745455,64
кофе,0.460317,0.263039,0.334776,441


# Results

In [1]:
import pandas as pd

pd.DataFrame(
    {
        'LogReg fasttext mean no weights' : [0.632497, 0.502078, 0.529205],
        'LogReg fasttext mean class weights' : [0.556070, 0.819777, 0.606949],
        'Naive Bayes CountVectorizer 100 words threshold' : [0.625416, 0.665721, 0.632699],
        'LogReg Tf-idf' : [0.715430, 0.663825, 0.673679],
        'biGRU max 30 sent len' : [0.831924, 0.768638, 0.785555],
        'CNN 1,2,3 kernel, max 30 sent len' : [0.812518, 0.770949, 0.785609],
        'biLSTM max 30 sent len': [0.810185, 0.801872, 0.801475],
        'biLSTM max 40 sent len + unfreeze emb' : [0.827857, 0.812538, 0.813990],

    }
    , index = ['precision', 'recall', 'f1-score']
).T

Unnamed: 0,precision,recall,f1-score
LogReg fasttext mean no weights,0.632497,0.502078,0.529205
LogReg fasttext mean class weights,0.55607,0.819777,0.606949
Naive Bayes CountVectorizer 100 words threshold,0.625416,0.665721,0.632699
LogReg Tf-idf,0.71543,0.663825,0.673679
biGRU max 30 sent len,0.831924,0.768638,0.785555
"CNN 1,2,3 kernel, max 30 sent len",0.812518,0.770949,0.785609
biLSTM max 30 sent len,0.810185,0.801872,0.801475
biLSTM max 40 sent len + unfreeze emb,0.827857,0.812538,0.81399


# Speller


In [None]:
%cd /content/drive/MyDrive/NLP Made

/content/drive/MyDrive/NLP Made


In [None]:
!pip install pyaspeller -q

In [None]:
import numpy as np
to_spell = np.load('to_spell.npy').tolist()

In [None]:
len(to_spell)

98719

In [None]:
from pyaspeller import YandexSpeller
speller = YandexSpeller()

In [None]:
speller.spelled('директорчкая')

'директорская'

In [None]:
spelled = speller.spelled(' '.join(to_spell[98709:])).split()
print(len(spelled))

10


In [None]:
to_spell[98709:]

['директорчкая',
 'пастичио',
 'пармариво',
 'фелицио',
 'ротату',
 'филицио',
 'емельяновский',
 'мачанкой',
 'тайбола',
 'бефсторгановым']

In [None]:
final = list()

In [None]:
final.extend(spelled)

In [None]:
for i, j in zip(to_spell[98709:], final):
    if i != j:
        print(i, j)

директорчкая директорская


In [None]:
len(speller.spelled(' '.join(to_spell[:10000])).split())

10000

In [None]:
spelled

In [None]:
import pandas as pd
import html
pd.set_option('display.max_rows', 500)

data = pd.read_pickle('menu_df_for_made.pkl')
data.dropna(inplace = True)
data['name_dish'] = data.name_dish.apply(lambda x: normalize(html.unescape(x)))
data['product_description'] = data.product_description.apply(lambda x: normalize(html.unescape(x)))

In [None]:
len(data.product_description.unique())

811483

In [None]:
to_spell = {i:None for i in to_spell}

In [None]:
np.save('to_spell', to_spell)

In [None]:
from threading import Thread
import numpy as np
from pyaspeller import YandexSpeller

def replace(subset):
    speller = YandexSpeller()

    for word in subset:
        correct = speller.spelled(word)
        to_spell[word] = correct

def split(a, n):
    k, m = divmod(len(a), n)
    return [a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)]


def main(num_threads, num_words):
    # create threads
    work = list()
    subset_len = num_words*num_threads
    subset = [k for k, v in to_spell.items() if v is None][:subset_len]
    splitted = split(subset, num_threads)
    for item in splitted:
        work.append(item)
    threads = [Thread(target=replace, args=([job])) for job in work]

    # start the threads
    for thread in threads:
        thread.start()

    # wait for the threads to complete
    for thread in threads:
        thread.join()

    

if __name__ == "__main__":
    to_spell = np.load('to_spell.npy', allow_pickle=True).item()

    try:
        main(2,10)
    except Exception:
        np.save('to_spell', to_spell)
    
    np.save('to_spell', to_spell)

In [None]:
%cd /content/drive/MyDrive/NLP Made
!pip install pyaspeller -q

/content/drive/MyDrive/NLP Made


In [None]:
!python speller_threads.py

Completed 0 in Thread 0
Completed 0 in Thread 1
Completed 0 in Thread 2
Completed 0 in Thread 3
Completed 0 in Thread 4
Completed 0 in Thread 5
Completed 0 in Thread 6
Completed 0 in Thread 7
Completed 0 in Thread 8
Completed 0 in Thread 9
Completed 0 in Thread 10
Completed 0 in Thread 11
Completed 0 in Thread 12
Completed 0 in Thread 13
Completed 0 in Thread 14
Completed 0 in Thread 15
Completed 0 in Thread 16
Completed 0 in Thread 17
Completed 0 in Thread 18
Completed 0 in Thread 19
Completed 0 in Thread 20
Completed 0 in Thread 21
Completed 0 in Thread 22
Completed 0 in Thread 23
Completed 0 in Thread 24
Completed 0 in Thread 25
Completed 0 in Thread 26
Completed 0 in Thread 27
Completed 0 in Thread 28
Completed 0 in Thread 29
Completed 0 in Thread 30
Completed 0 in Thread 31
Completed 0 in Thread 32
Completed 0 in Thread 33
Completed 0 in Thread 34
Completed 0 in Thread 35
Completed 0 in Thread 36
Completed 0 in Thread 37
Completed 0 in Thread 38
Completed 0 in Thread 39
Finished T

In [None]:
import numpy as np
to_spell = np.load('to_spell.npy', allow_pickle=True).item()

In [None]:
len({k:v for k, v in to_spell.items() if v is not None})

0

In [None]:
list({k:v for k, v in to_spell.items() if v is not None and k!=v}.items())[2500:2700]

[('рванные', 'рваные'),
 ('речпатый', 'репчатый'),
 ('куруруза', 'кукуруза'),
 ('покрошенного', 'покрашенного'),
 ('эдомер', 'эдамер'),
 ('гавядина', 'говядина'),
 ('моцареклла', 'моцарелла'),
 ('игредиент', 'ингредиент'),
 ('охтничьи', 'охотничьи'),
 ('коласки', 'коляски'),
 ('моцареллаx', 'моцарелла'),
 ('пепперониx', 'пепперони'),
 ('марджорио', 'марджори'),
 ('халап', 'халапеньо'),
 ('папмезан', 'пармезан'),
 ('ддрожжи', 'дрожжи'),
 ('сливи', 'сливки'),
 ('слабосоелный', 'слабосоленый'),
 ('мидиии', 'мидии'),
 ('чепотле', 'чипотле'),
 ('гарчичный', 'горчичный'),
 ('яйам', 'яйцам'),
 ('моцарелласыры', 'моцарелла сыры'),
 ('napolitano', 'наполитано'),
 ('томатычерри', 'томаты черри'),
 ('мяся', 'мяса'),
 ('колабски', 'колбаски'),
 ('телледжио', 'таледжио'),
 ('фиета', 'фиеста'),
 ('поидоры', 'помидоры'),
 ('регано', 'орегано'),
 ('оликовое', 'оливковое'),
 ('пармиджанно', 'пармиджанино'),
 ('невидалия', 'не видали'),
 ('пертрушка', 'петрушка'),
 ('макадами', 'макадамии'),
 ('пармезан

#Next

## Уникальных collected_name 45980

## Уникальных collected_description 41252

# После удаления опечаток

## Уникальных collected_name 35909

## Уникальных collected_description 28827

Для name_dish использовать пересечение двух collected_name и collected_description. Так можно избавиться от "оригинальных названий" типа Карлсон или Goods Game. Останется 20к слов. Хз

Для полученных словарей попробовать их отфильтровать получив векторы и прогнав через knn разделив на 2 класса. Но так могут потеряться слова, например поке -> лемма -> пока -> вектор не рядом с едой. Поэтому можно отдельно прогнать для лемм и оригинальных слов.
Хочется использовать фасттекст, потому что много слов порождено опечатками

Но, например, слово калифорния в нашем контексте, относится скорее к роллам, чем к штату, что может не заметить языковая модель. Поэтому популярные нужно оставить, а редкие проверить векторами

Можно прогнать на опечатки через яндекс спеллер

После очистки словарей отсортировать полученные значения в ячейках и оставить только уникальные строки



In [None]:
from collections import Counter
Counter(collected_description).most_common()[-2400:-2000]

In [None]:
model = gensim.models.fasttext.FastTextKeyedVectors.load('Fasttext/model.model')

In [None]:
model.most_similar('ченсок')

[('дженсон', 0.5545827150344849),
 ('лоренсо', 0.5365540385246277),
 ('чень', 0.529872715473175),
 ('ченнай', 0.5085588693618774),
 ('мартинесс', 0.5060179829597473),
 ('ченгд', 0.501118540763855),
 ('тайпей', 0.49762624502182007),
 ('hongqiao', 0.49383875727653503),
 ('ухань', 0.487272709608078),
 ('чентаи', 0.4863824248313904)]

In [None]:
!pip install pyaspeller

Collecting pyaspeller
  Downloading pyaspeller-0.2.0-py2.py3-none-any.whl (12 kB)
Installing collected packages: pyaspeller
Successfully installed pyaspeller-0.2.0


In [None]:
from pyaspeller import YandexSpeller
speller = YandexSpeller()
fixed = speller.spelled('париезан')
fixed

'пармезан'