In [1]:
import pandas as pd 
import numpy as np
import json
import ast
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv
from scipy.spatial.distance import euclidean
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances




load_dotenv()



True

## Reading data

In [2]:
PATH = os.getenv("DATA_PATH")
attributes_df = pd.read_parquet(PATH + "attributes.parquet")
resnet_df = pd.read_parquet(PATH + "resnet.parquet")
text_and_bert_df = pd.read_parquet(PATH + "text_and_bert.parquet")
train_pairs_df = pd.read_parquet(PATH + "train.parquet")
test_pairs_db = pd.read_parquet(PATH + "test.parquet")

In [3]:
train_pairs_df.head()

Unnamed: 0,variantid1,variantid2,target
0,1447875869,1447872068,1
1,1176231201,284733670,1
2,658617865,549848659,0
3,719320625,719370486,1
4,1067645658,949954740,0


In [4]:
attributes_df.head()

Unnamed: 0,variantid,categories,characteristic_attributes_mapping
0,47920382,"{""1"": ""EPG"", ""2"": ""Детские товары"", ""3"": ""Игру...","{""Цвет товара"": [""бежевый"", ""светло-розовый""],..."
1,49801845,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Количество в упаковке, шт"": [""1""], ""Бренд"": ..."
2,49853444,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Бренд"": [""Vervaco""], ""Тип"": [""Набор для выши..."
3,49893028,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Цвет товара"": [""серый""], ""Ширина, см"": [""0.8..."
4,49987483,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Цвет товара"": [""разноцветный""], ""Название цв..."


In [5]:
def split_cat_product(json_data):
    try:
        categories = json.loads(json_data)
        return [categories.get(str(i), None) for i in range(1, 5)]
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {json_data}")
        return [None] * 4

attributes_df[["cat_level_1", "cat_level_2", "cat_level_3", "cat_level_4"]] = attributes_df["categories"].apply(
    lambda x: pd.Series(split_cat_product(x))
)


In [6]:
attributes_df.head()

Unnamed: 0,variantid,categories,characteristic_attributes_mapping,cat_level_1,cat_level_2,cat_level_3,cat_level_4
0,47920382,"{""1"": ""EPG"", ""2"": ""Детские товары"", ""3"": ""Игру...","{""Цвет товара"": [""бежевый"", ""светло-розовый""],...",EPG,Детские товары,Игрушки,Игрушка для ванной
1,49801845,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Количество в упаковке, шт"": [""1""], ""Бренд"": ...",EPG,Хобби и творчество,Материал для рукоделия,"Стеклярус, бусины, стразы"
2,49853444,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Бренд"": [""Vervaco""], ""Тип"": [""Набор для выши...",EPG,Хобби и творчество,"Набор для рукоделия, творчества",Набор для вышивания
3,49893028,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Цвет товара"": [""серый""], ""Ширина, см"": [""0.8...",EPG,Хобби и творчество,"Нитки, пряжа","Тесьма, кружево, лента, резинка"
4,49987483,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","{""Цвет товара"": [""разноцветный""], ""Название цв...",EPG,Хобби и творчество,Настольные и карточные игры,Настольная игра


In [7]:
def join_attrib_by_pare(pairs_df:pd.DataFrame):
    features_df = (
        pairs_df
        .merge(
            attributes_df
            .add_suffix('1'),
            on="variantid1"
        )
        .merge(
            attributes_df
            .add_suffix('2'),
            on="variantid2"
        )
    )
    features_df = (
        features_df
        .merge(
            text_and_bert_df
            .add_suffix('1'),
            on="variantid1"
        )
        .merge(
            text_and_bert_df
            .add_suffix('2'),
            on="variantid2"
        )
    )
    features_df = (
        features_df
        .merge(
            resnet_df
            .add_suffix('1'),
            on="variantid1"
        )
        .merge(
            resnet_df
            .add_suffix('2'),
            on="variantid2"
        )
    )
    return features_df
    
train_features_df = join_attrib_by_pare(train_pairs_df)

train_features_df.shape

(1168516, 25)

In [8]:
train_features_df.head()

Unnamed: 0,variantid1,variantid2,target,categories1,characteristic_attributes_mapping1,cat_level_11,cat_level_21,cat_level_31,cat_level_41,categories2,...,name1,description1,name_bert_641,name2,description2,name_bert_642,main_pic_embeddings_resnet_v11,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v12,pic_embeddings_resnet_v12
0,1447875869,1447872068,1,"{""1"": ""Одежда и обувь"", ""2"": ""Одежда"", ""3"": ""О...","{""Серия в одежде и обуви"": [""Harper's Bazaar (...",Одежда и обувь,Одежда,Одежда,"Свитер, джемпер","{""1"": ""Одежда и обувь"", ""2"": ""Одежда"", ""3"": ""О...",...,Свитер Uniqlo Harper's Bazaar (журнал),,"[-0.3927455246448517, 0.4909455478191376, 0.56...",Свитер Uniqlo Harper's Bazaar (журнал),,"[-0.3927455246448517, 0.4909455478191376, 0.56...","[[0.5318107604980469, 0.35363996028900146, -0....","[[0.48105278611183167, 0.6172202825546265, -0....","[[0.5318107604980469, 0.35363996028900146, -0....","[[0.37079137563705444, 0.21004441380500793, -0..."
1,1176231201,284733670,1,"{""1"": ""EPG"", ""2"": ""Товары для взрослых"", ""3"": ...","{""Тип фиксатора БДСМ"": [""Наручники""], ""Вибраци...",EPG,Товары для взрослых,БДСМ,Наручники и фиксаторы,"{""1"": ""EPG"", ""2"": ""Товары для взрослых"", ""3"": ...",...,Ремень для фиксации предплечий к запястьям Джага,Фиксатор поможет правильно зафиксировать партн...,"[-0.5404430627822876, 0.31526750326156616, 0.3...",Ремень для фиксации предплечий к запястьям Джа...,"БДСМ, для женщин, стандартный, черный <br>мате...","[-0.5957552790641785, 0.3414252698421478, 0.46...","[[0.4308440089225769, 0.7620932459831238, 0.79...","[[0.5668608546257019, 0.9573432803153992, 1.01...","[[0.5668608546257019, 0.9573432803153992, 1.01...","[[0.4432561695575714, 0.7602171301841736, 0.76..."
2,658617865,549848659,0,"{""1"": ""EPG"", ""2"": ""Товары для животных"", ""3"": ...","{""Артикул"": [""83809""], ""Комплектация"": [""Сухой...",EPG,Товары для животных,Корма и лакомства для кошек и собак,Корм сухой,"{""1"": ""EPG"", ""2"": ""Товары для животных"", ""3"": ...",...,Сухой корм Hill's Science Plan для кошек с кур...,Сухой корм Hill&#39;s Science Plan для взрослы...,"[-0.4767049252986908, 0.5492467880249023, 0.41...",HILL'S SP Optimal Care Сухой корм д/кошек с Ку...,\nСухой корм для взрослых кошек HILL'S SCIENCE...,"[-0.26443710923194885, 0.3921107053756714, 0.5...","[[-0.36238163709640503, 0.4316844344139099, -0...","[[-0.5063896179199219, 0.707477331161499, -0.1...","[[-0.25123998522758484, 0.3757574260234833, -0...","[[-0.3037261366844177, -1.0693305730819702, 0...."
3,719320625,719370486,1,"{""1"": ""EPG"", ""2"": ""Строительство и ремонт"", ""3...","{""Комплектация"": [""Бур 310 мм, 1 штука.""], ""Цв...",EPG,Строительство и ремонт,Расходники для инструмента,"Бур, сверло, набор сверл","{""1"": ""EPG"", ""2"": ""Строительство и ремонт"", ""3...",...,Бур SDS-plus 20х310мм ELITECH,Бур Elitech используется в перфораторах с патр...,"[-0.6116019487380981, 0.4067917764186859, 0.54...",Бур SDS-plus 20х310мм ELITECH,Бур Elitech используется в перфораторах с патр...,"[-0.6116019487380981, 0.4067917764186859, 0.54...","[[0.7327960729598999, -0.7488707900047302, 0.5...",,"[[0.7327960729598999, -0.7488707900047302, 0.5...",
4,1067645658,949954740,0,"{""1"": ""EPG"", ""2"": ""Канцелярские товары"", ""3"": ...","{""Комплектация"": [""Товар поставляется в упаков...",EPG,Канцелярские товары,Пенал,Пенал,"{""1"": ""EPG"", ""2"": ""Канцелярские товары"", ""3"": ...",...,"Пенал-тубус мягкий 65x210 мм, ПТ-22 ""Кошачьи л...","Материал: Металл, пластик, текстиль<br/> Габар...","[-0.6164239048957825, 0.6769619584083557, 0.73...","Пенал-тубус мягкий 65 х 210 мм, ПТ-22 Милая панда","Пенал-тубус мягкий 65 х 210 мм, ПТ-22 Милая панда","[-0.5198364853858948, 0.6435920596122742, 0.64...","[[-1.3140270709991455, -0.8071212768554688, 0....","[[-1.3976482152938843, -0.6419062614440918, 0....","[[-0.49589139223098755, -0.5760805606842041, 0...",


In [9]:
train_features_df[["cat_level_1_1", "cat_level_2_1", "cat_level_3_1", "cat_level_4_1"]] = train_features_df["categories1"].apply(lambda x: pd.Series(split_cat_product(x)))
train_features_df[["cat_level_1_2", "cat_level_2_2", "cat_level_3_2", "cat_level_4_2"]] = train_features_df["categories2"].apply(lambda x: pd.Series(split_cat_product(x)))

In [10]:
train_features_df.head()

Unnamed: 0,variantid1,variantid2,target,categories1,characteristic_attributes_mapping1,cat_level_11,cat_level_21,cat_level_31,cat_level_41,categories2,...,main_pic_embeddings_resnet_v12,pic_embeddings_resnet_v12,cat_level_1_1,cat_level_2_1,cat_level_3_1,cat_level_4_1,cat_level_1_2,cat_level_2_2,cat_level_3_2,cat_level_4_2
0,1447875869,1447872068,1,"{""1"": ""Одежда и обувь"", ""2"": ""Одежда"", ""3"": ""О...","{""Серия в одежде и обуви"": [""Harper's Bazaar (...",Одежда и обувь,Одежда,Одежда,"Свитер, джемпер","{""1"": ""Одежда и обувь"", ""2"": ""Одежда"", ""3"": ""О...",...,"[[0.5318107604980469, 0.35363996028900146, -0....","[[0.37079137563705444, 0.21004441380500793, -0...",Одежда и обувь,Одежда,Одежда,"Свитер, джемпер",Одежда и обувь,Одежда,Одежда,"Свитер, джемпер"
1,1176231201,284733670,1,"{""1"": ""EPG"", ""2"": ""Товары для взрослых"", ""3"": ...","{""Тип фиксатора БДСМ"": [""Наручники""], ""Вибраци...",EPG,Товары для взрослых,БДСМ,Наручники и фиксаторы,"{""1"": ""EPG"", ""2"": ""Товары для взрослых"", ""3"": ...",...,"[[0.5668608546257019, 0.9573432803153992, 1.01...","[[0.4432561695575714, 0.7602171301841736, 0.76...",EPG,Товары для взрослых,БДСМ,Наручники и фиксаторы,EPG,Товары для взрослых,БДСМ,Наручники и фиксаторы
2,658617865,549848659,0,"{""1"": ""EPG"", ""2"": ""Товары для животных"", ""3"": ...","{""Артикул"": [""83809""], ""Комплектация"": [""Сухой...",EPG,Товары для животных,Корма и лакомства для кошек и собак,Корм сухой,"{""1"": ""EPG"", ""2"": ""Товары для животных"", ""3"": ...",...,"[[-0.25123998522758484, 0.3757574260234833, -0...","[[-0.3037261366844177, -1.0693305730819702, 0....",EPG,Товары для животных,Корма и лакомства для кошек и собак,Корм сухой,EPG,Товары для животных,Корма и лакомства для кошек и собак,Корм сухой
3,719320625,719370486,1,"{""1"": ""EPG"", ""2"": ""Строительство и ремонт"", ""3...","{""Комплектация"": [""Бур 310 мм, 1 штука.""], ""Цв...",EPG,Строительство и ремонт,Расходники для инструмента,"Бур, сверло, набор сверл","{""1"": ""EPG"", ""2"": ""Строительство и ремонт"", ""3...",...,"[[0.7327960729598999, -0.7488707900047302, 0.5...",,EPG,Строительство и ремонт,Расходники для инструмента,"Бур, сверло, набор сверл",EPG,Строительство и ремонт,Расходники для инструмента,"Бур, сверло, набор сверл"
4,1067645658,949954740,0,"{""1"": ""EPG"", ""2"": ""Канцелярские товары"", ""3"": ...","{""Комплектация"": [""Товар поставляется в упаков...",EPG,Канцелярские товары,Пенал,Пенал,"{""1"": ""EPG"", ""2"": ""Канцелярские товары"", ""3"": ...",...,"[[-0.49589139223098755, -0.5760805606842041, 0...",,EPG,Канцелярские товары,Пенал,Пенал,EPG,Канцелярские товары,Пенал,Пенал


In [11]:
train_features_df["is_equal_cat_1"] = train_features_df.apply(lambda x: 1 if x["cat_level_1_1"] == x["cat_level_1_2"] else 0, axis=1)
train_features_df["is_equal_cat_2"] = train_features_df.apply(lambda x: 1 if x["cat_level_2_1"] == x["cat_level_2_2"] else 0, axis=1)
train_features_df["is_equal_cat_3"] = train_features_df.apply(lambda x: 1 if x["cat_level_3_1"] == x["cat_level_3_2"] else 0, axis=1)
train_features_df["is_equal_cat_4"] = train_features_df.apply(lambda x: 1 if x["cat_level_4_1"] == x["cat_level_4_2"] else 0, axis=1)

Calculate distance between pictures embeddings

In [14]:
train_features_df['pic_embeddings_resnet_v11'].fillna(train_features_df['main_pic_embeddings_resnet_v11'], inplace=True)
train_features_df['pic_embeddings_resnet_v12'].fillna(train_features_df['main_pic_embeddings_resnet_v12'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_features_df['pic_embeddings_resnet_v12'].fillna(train_features_df['main_pic_embeddings_resnet_v12'], inplace=True)


In [None]:
train_features_df['pic_embeddings_resnet_v11'][0]

In [15]:
train_features_df['main_pic_distance'] = train_features_df.apply(
    lambda row: euclidean(row['main_pic_embeddings_resnet_v11'], row['main_pic_embeddings_resnet_v12']),
    axis=1
)

In [16]:
train_features_df['text_distance_cos'] = train_features_df.apply(
    lambda row: cosine_distances([row['name_bert_641']], [row['name_bert_642']]),
    axis=1
)

train_features_df['text_distance_euc'] = train_features_df.apply(
    lambda row: euclidean_distances([row['name_bert_641']], [row['name_bert_642']]),
    axis=1
)

train_features_df['text_distance_man'] = train_features_df.apply(
    lambda row: manhattan_distances([row['name_bert_641']], [row['name_bert_642']]),
    axis=1
)


In [17]:
total = train_features_df.shape[0]
for i in range (1,5):
    not_the_same = train_features_df[train_features_df[f'is_equal_cat_{i}']==0].shape[0]
    the_same = total - not_the_same
    print(not_the_same,the_same)


102 1168414
484 1168032
45325 1123191
121250 1047266


In [18]:

category_data_1 = {}

for category in train_features_df["cat_level_1_1"].unique():
    filtered_df = attributes_df[attributes_df["cat_level_1"] == category]
    characteristics_series = filtered_df["characteristic_attributes_mapping"]
    
    key_frequencies = {}  # Словарь для подсчета частоты ключей
    value_frequencies = {}  # Словарь для подсчета частоты значений по ключам

    for characteristic_text in characteristics_series.dropna():  
        characteristic_data = json.loads(characteristic_text) 
        
        for attribute, values in characteristic_data.items():  
            if attribute not in key_frequencies:
                key_frequencies[attribute] = 0
            key_frequencies[attribute] += 1
            
            if attribute not in value_frequencies:
                value_frequencies[attribute] = {}
            
            for value in values:
                if value not in value_frequencies[attribute]:
                    value_frequencies[attribute][value] = 0
                value_frequencies[attribute][value] += 1

    category_data_1[category] = (key_frequencies, value_frequencies, len(characteristics_series.dropna()))

# Выводим результат (если нужно)
# print(category_data_1)


In [19]:
category_data_1.keys()

dict_keys(['Одежда и обувь', 'EPG', 'Книги и цифровые книги'])

In [15]:

category_data_2 = {}

for category in train_features_df["cat_level_2_1"].unique():
    filtered_df = attributes_df[attributes_df["cat_level_2"] == category]
    characteristics_series = filtered_df["characteristic_attributes_mapping"]
    
    key_frequencies = {}  # Словарь для подсчета частоты ключей
    value_frequencies = {}  # Словарь для подсчета частоты значений по ключам

    for characteristic_text in characteristics_series.dropna():  
        characteristic_data = json.loads(characteristic_text) 
        
        for attribute, values in characteristic_data.items():  
            if attribute not in key_frequencies:
                key_frequencies[attribute] = 0
            key_frequencies[attribute] += 1
            
            if attribute not in value_frequencies:
                value_frequencies[attribute] = {}
            
            for value in values:
                if value not in value_frequencies[attribute]:
                    value_frequencies[attribute][value] = 0
                value_frequencies[attribute][value] += 1

    category_data_2[category] = (key_frequencies, value_frequencies, len(characteristics_series.dropna()))

# Выводим результат (если нужно)
# print(category_data_2)


In [None]:
category_data_2.keys()

In [17]:

category_data_3 = {}

for category in train_features_df["cat_level_3_1"].unique():
    filtered_df = attributes_df[attributes_df["cat_level_3"] == category]
    characteristics_series = filtered_df["characteristic_attributes_mapping"]
    
    key_frequencies = {}  # Словарь для подсчета частоты ключей
    value_frequencies = {}  # Словарь для подсчета частоты значений по ключам

    for characteristic_text in characteristics_series.dropna():  
        characteristic_data = json.loads(characteristic_text) 
        
        for attribute, values in characteristic_data.items():  
            if attribute not in key_frequencies:
                key_frequencies[attribute] = 0
            key_frequencies[attribute] += 1
            
            if attribute not in value_frequencies:
                value_frequencies[attribute] = {}
            
            for value in values:
                if value not in value_frequencies[attribute]:
                    value_frequencies[attribute][value] = 0
                value_frequencies[attribute][value] += 1

    category_data_3[category] = (key_frequencies, value_frequencies, len(characteristics_series.dropna()))

# Выводим результат (если нужно)
# print(category_data_3)


In [None]:
category_data_3.keys()

In [19]:

category_data_4 = {}

for category in train_features_df["cat_level_4_1"].unique():
    filtered_df = attributes_df[attributes_df["cat_level_4"] == category]
    characteristics_series = filtered_df["characteristic_attributes_mapping"]
    
    key_frequencies = {}  # Словарь для подсчета частоты ключей
    value_frequencies = {}  # Словарь для подсчета частоты значений по ключам

    for characteristic_text in characteristics_series.dropna():  
        characteristic_data = json.loads(characteristic_text) 
        
        for attribute, values in characteristic_data.items():  
            if attribute not in key_frequencies:
                key_frequencies[attribute] = 0
            key_frequencies[attribute] += 1
            
            if attribute not in value_frequencies:
                value_frequencies[attribute] = {}
            
            for value in values:
                if value not in value_frequencies[attribute]:
                    value_frequencies[attribute][value] = 0
                value_frequencies[attribute][value] += 1

    category_data_4[category] = (key_frequencies, value_frequencies, len(characteristics_series.dropna()))

# Выводим результат (если нужно)
# print(category_data_4)


In [None]:
category_data_4.keys()

### Let's check how many different categories we have for each level

In [None]:
#attributes_df['categories'] = attributes_df['categories']
unique_categories_by_level = {}
for categories in attributes_df['categories']:
    if isinstance(categories, str):  
        categories = ast.literal_eval(categories)
    for level, category in categories.items():
        if level not in unique_categories_by_level:
            unique_categories_by_level[level] = set()
        unique_categories_by_level[level].add(category)
for level, categories in unique_categories_by_level.items():
    print(f"Level {level}: {len(categories)} unique categories")


In [None]:
train_features_df.shape

In [20]:
X_train_val_df, X_test_df = train_test_split(
    train_features_df, 
    test_size=0.02, 
    stratify=train_features_df[['cat_level_1_1', 'target']]
#     stratify=train_df[['cat_level_2_1', 'target']]
)
X_train_val_df.shape, X_test_df.shape

((1145145, 41), (23371, 41))

In [21]:
X_test_df.head().columns

Index(['variantid1', 'variantid2', 'target', 'categories1',
       'characteristic_attributes_mapping1', 'cat_level_11', 'cat_level_21',
       'cat_level_31', 'cat_level_41', 'categories2',
       'characteristic_attributes_mapping2', 'cat_level_12', 'cat_level_22',
       'cat_level_32', 'cat_level_42', 'name1', 'description1',
       'name_bert_641', 'name2', 'description2', 'name_bert_642',
       'main_pic_embeddings_resnet_v11', 'pic_embeddings_resnet_v11',
       'main_pic_embeddings_resnet_v12', 'pic_embeddings_resnet_v12',
       'cat_level_1_1', 'cat_level_2_1', 'cat_level_3_1', 'cat_level_4_1',
       'cat_level_1_2', 'cat_level_2_2', 'cat_level_3_2', 'cat_level_4_2',
       'is_equal_cat_1', 'is_equal_cat_2', 'is_equal_cat_3', 'is_equal_cat_4',
       'main_pic_distance', 'text_distance_cos', 'text_distance_euc',
       'text_distance_man'],
      dtype='object')

In [22]:
features_columns = [
    'is_equal_cat_1', 'is_equal_cat_2', 'is_equal_cat_3', 'is_equal_cat_4',
    "main_pic_distance", "text_distance_cos", "text_distance_euc", "text_distance_man"
]


In [1]:

def plot_feature_importance(importance, names, model_name="", top_n=-1, skip_columns=None):
    skip_columns = skip_columns or []
    
    fi_df = pd.DataFrame({
        'feature_names': names,
        'feature_importance': importance
    })

    fi_df = fi_df[~fi_df['feature_names'].isin(skip_columns)]
    
    fi_df = fi_df.sort_values(by='feature_importance', ascending=False)
    
    if top_n != -1:
        fi_df = fi_df.head(top_n)
    
    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df, palette="viridis")
    
    title = f"{model_name} FEATURE IMPORTANCE"
    if top_n != -1:
        title += f" (Top: {top_n})"
    plt.title(title)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.tight_layout()
    plt.show()
    
    return fi_df


Model for category 1

In [2]:
def train_test_split_by_cat(full_df, cat_name:str):
    full_df = full_df[full_df['cat_level_1_1'] == cat_name]
    X_train_df, X_val_df = train_test_split(
        full_df, 
        test_size=0.05, 
        stratify=full_df["target"]
    )
    # X_train_df = full_df
    return X_train_df, X_val_df

models_by_cat = {}
for cat_name in train_features_df["cat_level_1_1"].unique():
    print(cat_name)
    model = CatBoostClassifier(iterations=1000,
                    eval_metric =  "AUC", 
                    early_stopping_rounds=500, 
                    ignored_features=['variantid1', 'variantid2'],
                    cat_features=category_data_1,
                    # od_type = "Iter",
                    # od_wait = 200,                               
                    random_state=53,
                    task_type="CPU",
                    learning_rate = 0.1
                    )  
    
    X_train_df, X_val_df = train_test_split_by_cat(full_df=X_train_val_df, cat_name=cat_name)
    target_column = 'target'

    X_train = X_train_df[features_columns]
    y_train = X_train_df[target_column]

    X_val = X_val_df[features_columns]
    y_val = X_val_df[target_column]
    
    model.fit(X_train, y_train,
              eval_set=(X_val, y_val), 
              plot=True, 
              verbose=True 
             )
    
    plot_feature_importance(model.get_feature_importance(), (features_columns ),'CATBOOST ', top_n=10)
    plt.show()
    models_by_cat[cat_name] = model

NameError: name 'train_features_df' is not defined