In [None]:
# libraries
import io
import os
from datetime import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tensorflow.keras.applications import MobileNetV3Small
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn import metrics
import seaborn as sns
# Matplotlib configuration
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 16 }
plt.rc('font', **font)


In [None]:
# set random seed
np.random.seed(seed=42)

In [None]:
PATH = f"/content/drive/My Drive/DCU/Machine Learning/data/data_2025/2025"

In [None]:
import os
parquet_filenames_train = os.listdir(os.path.join(PATH, 'train'))

In [None]:
parquet_filenames_train[:3]

['part-00285-of-00362.parquet.parquet',
 'part-00206-of-00362.parquet.parquet',
 'part-00320-of-00362.parquet.parquet']

In [None]:
f"Number of files in train: {len(parquet_filenames_train):,}"

'Number of files in train: 362'

In [None]:
parquet_filenames_test = os.listdir(os.path.join(PATH, 'test'))

In [None]:
parquet_filenames_test[:3]

['part-00109-of-00362.parquet.parquet',
 'part-00092-of-00362.parquet.parquet',
 'part-00123-of-00362.parquet.parquet']

In [None]:
f"Number of files in test: {len(parquet_filenames_test):,}"

'Number of files in test: 362'

# 1.EDA

## 1.1 Training Dataset

In [None]:
def read_data(folder, filenames):
    """ Read parquet data from mutiple files """

    dataframes = [
        pd.read_parquet(f"{PATH}/{folder}/{filename}")
        for filename in filenames
    ]

    return pd.concat(dataframes)

In [None]:
df_train = read_data("train", parquet_filenames_train)

In [None]:
df_train.shape

(229624, 23)

In [None]:
list(df_train.columns)

['product_id',
 'title',
 'description',
 'tags',
 'type',
 'room',
 'craft_type',
 'recipient',
 'material',
 'occasion',
 'holiday',
 'art_subject',
 'style',
 'shape',
 'pattern',
 'bottom_category_id',
 'bottom_category_text',
 'top_category_id',
 'top_category_text',
 'primary_color_id',
 'primary_color_text',
 'secondary_color_id',
 'secondary_color_text']

In [None]:
df_train.iloc[0]

Unnamed: 0,0
product_id,1500855633
title,Driftwood Solar Lamp Handcrafted Off Grid Sola...
description,Free shipping in Canada and the USA. This one...
tags,"Driftwood Lamp,driftwood solar,nautical,cottag..."
type,physical
room,patio & outdoor
craft_type,
recipient,
material,wood
occasion,


In [None]:
df_train.iloc[0][3]

  df_train.iloc[0][3]


'Driftwood Lamp,driftwood solar,nautical,cottage decor,unique,one of a kind,handcrafted,off grid,garden decor,farmhouse decor,house boat decor,solar,garden store'

In [None]:
df_train['type'].unique()

array(['physical', 'download', ''], dtype=object)

In [None]:
df_train['room'].unique()

array(['patio & outdoor', '', 'living room', 'entryway', 'game room',
       'craft', 'bedroom', 'office', 'kitchen & dining', 'nursery',
       'kids', 'dorm', 'bathroom', 'laundry', 'bar', 'porch', 'garage',
       'man cave', 'yard'], dtype=object)

In [None]:
df_train['craft_type'].unique()

array(['', 'party & gifting', "kids' crafts", 'sculpture',
       'printing & printmaking', 'card making & stationery',
       'scrapbooking', 'bookbinding', 'ceramics & pottery', 'collage',
       'woodworking & carpentry', 'beading', 'jewelry making',
       'paper stamping', 'drawing & drafting', 'hat making & hair crafts',
       'floral arranging', 'crochet', 'knitting', 'watchmaking',
       'gardening', 'nail art', 'metalworking', 'sewing', 'quilting',
       'shoemaking', 'leatherworking', 'upholstery', 'photography',
       'needlepoint', 'embroidery', 'crewel', 'cross stitch',
       'electronics & circuitry', 'doll making', 'model making',
       'tatting & lacemaking', 'robotics', 'dollhouses & miniatures',
       'home improvement', 'painting', 'framing', 'coloring',
       '3d printing', 'calligraphy', 'rug making', 'dairy & kombucha',
       'neon art', 'dyeing & batik', 'carving & whittling', 'tattooing',
       'baking', 'stained glass', 'lampworking & shaping', 'candy

In [None]:
len(df_train['top_category_id'].unique())

15

In [None]:
len(df_train['top_category_text'].unique())

15

In [None]:
len(df_train['bottom_category_id'].unique())

2609

In [None]:
len(df_train['bottom_category_text'].unique())

2609

In [None]:
pd.value_counts(df_train['bottom_category_text'], sort=True, ascending=False)

  pd.value_counts(df_train['bottom_category_text'], sort=True, ascending=False)


Unnamed: 0_level_0,count
bottom_category_text,Unnamed: 1_level_1
art_and_collectibles.drawing_and_illustration.architectural_drawings,98
craft_supplies_and_tools.closures_and_fasteners.buttons,98
accessories.scarves_and_wraps.scarves,98
home_and_living.kitchen_and_dining.dining_and_serving.cake_stands,98
toys_and_games.sports_and_outdoor_games.martial_arts_and_boxing.boxing_gloves,98
...,...
home_and_living.furniture.entryway_furniture.standing_coat_racks,44
clothing.girls_clothing.baby_girls_clothing.socks_and_leg_warmers,44
home_and_living.lighting.light_accessories,43
clothing.womens_clothing.pajamas_and_robes.sleep_masks_and_blindfolds.blindfolds,42


In [None]:
df_train[df_train['bottom_category_text'] == 'home_and_living']

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,shape,pattern,bottom_category_id,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text
178,1387867018,Bi-Color Cherry Tomato Heirloom Non GMO seeds,1-1.5 inch sweet little morsels. Yellow/orange...,"Orange,SmallCherry,Sweeet,Tomatoes",physical,,,,,,...,,,891,home_and_living,8,home_and_living,18,yellow,10,orange
179,1340456157,"Elephant Mug, Elephant Coffee Mug, Elephant Mu...","Elephant Mug, Elephant Mug For Gift Christmas,...","elephant mug,elephant gift mug,coffee mug elep...",physical,,,,metal,,...,,,891,home_and_living,8,home_and_living,17,white,1,black
180,192105352,In Stock Enamel House Number #1-99. 4 7/8&quo...,This custom house number is carefully handcraf...,"french house number,enamel house number,house ...",physical,,,,,housewarming,...,,,891,home_and_living,8,home_and_living,1,black,17,white
181,505442658,Black Powder Coated KISS Cup,Custom Powder Coated and laser engraved 30 oun...,"Yeti Cup,KISS BAND MERCH,Friend Gift,KISS,Blac...",physical,,,,,birthday,...,,,891,home_and_living,8,home_and_living,1,black,16,silver
182,1510422862,Retro Hand Drawn Desk Mat Extra Large Small De...,"Extra large desk pad, light blue hand drawn pr...","gaming desk mat,keyboard mat,desk accessories,...",physical,,,,,birthday,...,,,891,home_and_living,8,home_and_living,2,blue,1,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,1130135243,"February 14th Sign, Valentine Decor, Small Woo...","+ February 14th Sign, Valentine Decor, Small W...","valentines day,valentines decor,valentines sig...",physical,,,,,housewarming,...,,,891,home_and_living,8,home_and_living,11,pink,14,red
260,494195061,Miami Heat Themed Memory Board,16 x 20 memory board (bulletin board that does...,"Miami Heat,basketball,team,sports,memory board...",physical,,,,,,...,,,891,home_and_living,8,home_and_living,1,black,14,red
261,997297099,Tourmaline Black Hand Thrown Stoneware Pottery...,*** ORDER SHIPPING/FULFILLMENT INFORMATION ***...,"Mug,stoneware,pottery,modern,kitchen,coffee,te...",physical,,,,,housewarming,...,,,891,home_and_living,8,home_and_living,1,black,19,other
262,966157237,Personalized Garbage Truck Plate - Godparent G...,NOTE: The silverware set is not included with ...,"Kids Melamine,Childrens Dinnerware,Childrens M...",physical,,,,,,...,,,891,home_and_living,8,home_and_living,2,blue,9,green


In [None]:
# Find rows with max_depth of 7
max_depth_rows = df_train[df_train['bottom_category_text'].apply(
    lambda x: len(x.split('.')) == 7
)]

# Count how many such rows exist
print(f"Number of rows with depth 7: {len(max_depth_rows)}")

# Display the bottom_category_text for these rows
print("\nCategories with depth 7:")
for category in max_depth_rows['bottom_category_text'].unique():
    print(category)

# Optional: show a sample of these rows with relevant columns
print("\nSample rows with depth 7:")
print(max_depth_rows[['bottom_category_id', 'bottom_category_text', 'top_category_text']].head())

Number of rows with depth 7: 93

Categories with depth 7:
home_and_living.kitchen_and_dining.cookware.pots_and_pans.pans.skillets.cast_iron_skillets

Sample rows with depth 7:
     bottom_category_id                               bottom_category_text  \
495                2297  home_and_living.kitchen_and_dining.cookware.po...   
496                2297  home_and_living.kitchen_and_dining.cookware.po...   
497                2297  home_and_living.kitchen_and_dining.cookware.po...   
498                2297  home_and_living.kitchen_and_dining.cookware.po...   
499                2297  home_and_living.kitchen_and_dining.cookware.po...   

    top_category_text  
495   home_and_living  
496   home_and_living  
497   home_and_living  
498   home_and_living  
499   home_and_living  


## 1.2 Test dataset

The test dataset will be used to predict using the model/s you have trained

## Challenge

Etsy has nearly 100 Million active listings on the etsy.com marketplace for sale from more than 5 million active sellers.

The task is to leverage the training dataset to learn patterns from and to predict the following attributes on an unseen test dataset given a products’ information:

- **top category id**
- **bottom category id**

The goal is to **maximize F1** for each of the classes on each attribute to predict (top category, bottom category). Your approaches and models will be benchmarked against a hidden test dataset.

**Bonus points** will be given for submissions that:

- **visualize some learned representations or embeddings** and show that similar items cluster together
- compares the performance of pre-trained embeddings taken from hubs or papers with a fine-tuned model


In [None]:
df_test = read_data("test", parquet_filenames_test)

# 新段落

In [None]:
df_test.shape

(25514, 15)

In [None]:
list(df_test.columns)

['product_id',
 'title',
 'description',
 'tags',
 'type',
 'room',
 'craft_type',
 'recipient',
 'material',
 'occasion',
 'holiday',
 'art_subject',
 'style',
 'shape',
 'pattern']

In [None]:
df_test.iloc[0]

Unnamed: 0,0
product_id,1397234990
title,Antler Skull Deer Skull Hand Painted Beads Dec...
description,Hand painted deer antlers.\nThe skull is natur...
tags,
type,physical
room,
craft_type,
recipient,
material,
occasion,


In [None]:
df_test.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern
0,1397234990,Antler Skull Deer Skull Hand Painted Beads Dec...,Hand painted deer antlers.\nThe skull is natur...,,physical,,,,,,,,,,
1,1167322940,Hemifusus Conchilidium - Collectible Shell Spe...,Measures 2.625 x 1.625 x 1.25 inches.\n\nSeash...,"small conch,hemifusus,orange,orange shell,spir...",physical,,,,,,,,,,
2,1346575470,Excavation set with real bones / hammer chisel...,Discovery fun for young and old!\nThere are RE...,,physical,,,,,,,,,,
3,1607587430,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,"For sale is this glass filled with moss, pine ...",,physical,,,,,,,,,,
4,1633456300,Coyote Tooth and Freshwater Pearl Ornament | S...,Coyote Tooth and Freshwater Pearl Ornament 🌸\n...,"ostara,oddities,pagan art,goblincore,forest wi...",physical,,,,,birthday,,,,,


In [None]:
df_train.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,shape,pattern,bottom_category_id,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text
0,1500855633,Driftwood Solar Lamp Handcrafted Off Grid Sola...,Free shipping in Canada and the USA. This one...,"Driftwood Lamp,driftwood solar,nautical,cottag...",physical,patio & outdoor,,,wood,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,1,black
1,717452434,Coconut bistro patio light hand carved in Bali...,Hand carved Bali coconut bistro light covers p...,"Hand carved coconut,coconut shell,coconut cand...",physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,0,beige
2,868201745,"Garden outdoor lamp for meditation, white pin...",Ready to ship! \n\nOnly one piece!\n\n\nA Lot...,"Meditation lamp,Lotus for meditation,Outdoor l...",physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,11,pink
3,718866859,"Sunflower solar mason jar light, solar outdoor...",Sunflower Jenni Jar\nThis solar rechargeable b...,"solar lights,mason jar lights,Farmhouse lights...",physical,,,,,housewarming,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,18,yellow,5,clear
4,1052996924,NWT Rae Dunn Queen Pool Lounger,"Gorgeous , measurements 49&quot; x 30&quot; , ...",,physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,1,black


In [None]:
df_train['top_category_id'].unique()

array([ 8, 13,  0,  6,  5, 14, 10,  1,  7,  2,  4,  3, 11,  9, 12])

In [None]:
df_train['bottom_category_id'].unique()

array([1117, 1582, 1143, ...,  989,  284,  977])

In [None]:
df_train['bottom_category_id'].min()

1

In [None]:
df_train['bottom_category_id'].max()

12493

In [None]:
#df_train[df_train['bottom_category_id']==1]

In [None]:
# Create a function to count dots in a string
def count_dots(s):
    return s.count('.')

# Count the occurrences of each depth
depth_counts = df_train['bottom_category_text'].apply(lambda x: len(x.split('.'))).value_counts().sort_index()

# Display results
print("마지막 노드의 분포정황:")
for depth, count in depth_counts.items():
    print(f"depth {depth} (dots: {depth-1}): {count}")

# Alternative visualization with dots directly
dot_counts = df_train['bottom_category_text'].apply(count_dots).value_counts().sort_index()

마지막 노드의 분포정황:
depth 1 (dots: 0): 1345
depth 2 (dots: 1): 13292
depth 3 (dots: 2): 87221
depth 4 (dots: 3): 91004
depth 5 (dots: 4): 30493
depth 6 (dots: 5): 6176
depth 7 (dots: 6): 93


In [None]:
df_train['bottom_category_text']

# 2.Feature Engineering

## 2-1 Feature Augmentation
* According to given top_category_text and bottom_category_text, add level1 to level 6
* If top_category_text == bottom_category: continue, fill level1 to level6 with empty
* e.g. if bottom_category_text: electronics.phones.smartphones.android, and top_category_text: electronics, then level1 would be :electronics.phones, level2:electronics.phones.smartphones and so on....

In [None]:
df1 = df_train.copy()

In [None]:
def create_hierarchical_features(df):
    # 결과를 저장할 새로운 DataFrame 생성
    result_df = df.copy()

    # 각 layer_text 컬럼 초기화 (빈 문자열로)
    for i in range(1, 8):  # layer1부터 layer7까지
        result_df[f'layer{i}_category_text'] = ''

    # 모든 bottom_category_text를 분할하는 함수
    def split_categories(text):
        return text.split('.') if isinstance(text, str) else []

    # 카테고리 분할
    category_lists = df['bottom_category_text'].apply(split_categories)

    # 각 레이어에 값 할당 + 경로 누적 생성
    for i in range(1, 8):  # layer1부터 layer7까지
        result_df[f'layer{i}_category_text'] = category_lists.apply(
            lambda cats: '>'.join(cats[:i]) if i <= len(cats) else ''
        )

    return result_df

In [None]:
data = {
    'id': [1, 2, 3, 4, 5],
    'top_category_text': ['electronics', 'clothing', 'books', 'home', 'beauty'],
    'bottom_category_text': [
        'electronics.phones.smartphones.android',
        'clothing.women.dresses',
        'books',
        'home.kitchen.appliances.coffee_makers',
        'beauty.skincare.moisturizers'
    ]
}

In [None]:
df = pd.DataFrame(data)

In [None]:
df.head()

Unnamed: 0,id,top_category_text,bottom_category_text
0,1,electronics,electronics.phones.smartphones.android
1,2,clothing,clothing.women.dresses
2,3,books,books
3,4,home,home.kitchen.appliances.coffee_makers
4,5,beauty,beauty.skincare.moisturizers


In [None]:
df_test_example = create_hierarchical_features(df)

In [None]:
df_test_example

Unnamed: 0,id,top_category_text,bottom_category_text,layer1_category_text,layer2_category_text,layer3_category_text,layer4_category_text,layer5_category_text,layer6_category_text,layer7_category_text
0,1,electronics,electronics.phones.smartphones.android,electronics,electronics>phones,electronics>phones>smartphones,electronics>phones>smartphones>android,,,
1,2,clothing,clothing.women.dresses,clothing,clothing>women,clothing>women>dresses,,,,
2,3,books,books,books,,,,,,
3,4,home,home.kitchen.appliances.coffee_makers,home,home>kitchen,home>kitchen>appliances,home>kitchen>appliances>coffee_makers,,,
4,5,beauty,beauty.skincare.moisturizers,beauty,beauty>skincare,beauty>skincare>moisturizers,,,,


In [None]:
df1 = create_hierarchical_features(df_train)

In [None]:
df1.iloc[10000]

Unnamed: 0,1745
product_id,1612686816
title,"Slytherin snake elastic bookmark, green and si..."
description,Elastic bookmark featuring a beautiful snake i...
tags,"book lover gifts,book lover Christmas,Elastic ..."
type,physical
room,
craft_type,
recipient,
material,
occasion,


In [None]:
# layer1_category_text 열에서 빈 값이 아닌 행 개수 확인
non_empty_count = df1[df1['layer7_category_text'] != ''].shape[0]

print(f"layer7_category_text에서 빈 값이 아닌 행의 개수: {non_empty_count}")

layer7_category_text에서 빈 값이 아닌 행의 개수: 93


In [None]:
df1.shape

(229624, 30)

## 2-2 Text processing

In [None]:
import re

# Text preprocessing function
def preprocess_text(text):
   if pd.isna(text):
       return ""
   # Remove HTML tags
   text = re.sub(r'&[a-zA-Z0-9]+;', ' ', text)
   # Remove URLs with www
   text = re.sub(r'www\.[^\s]+', ' ', text)
   # Replace special characters with spaces
   text = re.sub(r'[^\w\s]', ' ', text)
   # Replace multiple spaces with a single space
   text = re.sub(r'\s+', ' ', text)
   return text.lower().strip()

# Feature combination function with all features
def combine_all_features(row):
    features = []

    columns = ['title', 'description', 'tags','type', 'room', 'craft_type',
               'recipient', 'material', 'occasion', 'holiday',
               'art_subject', 'style', 'shape', 'pattern']

    for col in columns:
        if not pd.isna(row[col]) and row[col] != '':
            features.append(str(row[col]))

    return ' '.join(features)

In [None]:
processed_train_df = df1.copy()

In [None]:
processed_train_df['all_features'] = processed_train_df.apply(combine_all_features, axis=1)
processed_train_df['processed_text_all'] = processed_train_df['all_features'].apply(preprocess_text)

In [None]:
processed_train_df.shape

(229624, 32)

In [None]:
processed_train_df.iloc[0]

Unnamed: 0,0
product_id,1500855633
title,Driftwood Solar Lamp Handcrafted Off Grid Sola...
description,Free shipping in Canada and the USA. This one...
tags,"Driftwood Lamp,driftwood solar,nautical,cottag..."
type,physical
room,patio & outdoor
craft_type,
recipient,
material,wood
occasion,


In [None]:
# # Filter rows where bottom_category_text equals top_category_text
# matching_rows = processed_train_df[processed_train_df['bottom_category_id'] == processed_train_df['top_category_id']]

# # Display the number of matching rows
# print(f"Number of rows where bottom_category_text equals top_category_text: {len(matching_rows)}")

# # Display the first few matching rows
# print(matching_rows[['top_category_id', 'bottom_category_id']].head())

In [None]:
def create_layer_datasets(df):
    """
    각 레이어 컬럼(layer1_category_text ~ layer7_category_text)에서
    빈 값이 아닌 행들만 추출하여 7개의 데이터프레임을 생성합니다.

    Parameters:
    df (pandas.DataFrame): 입력 데이터프레임

    Returns:
    dict: 각 레이어별 데이터프레임을 담은 사전 객체
    """
    # 결과를 저장할 사전 초기화
    layer_datasets = {}

    # 각 레이어에 대해 처리
    for i in range(1, 8):
        # 레이어 컬럼명
        layer_col = f'layer{i}_category_text'

        # 해당 레이어 컬럼이 존재하는지 확인
        if layer_col not in df.columns:
            print(f"경고: {layer_col} 컬럼이 데이터프레임에 존재하지 않습니다.")
            layer_datasets[f'df_layer{i}'] = pd.DataFrame()  # 빈 데이터프레임
            continue

        # 해당 레이어에서 빈 값이 아닌 행만 선택
        layer_df = df[df[layer_col] != '']

        # 결과 저장
        layer_datasets[f'df_layer{i}'] = layer_df

        # 정보 출력
        print(f"df_layer{i}: {layer_df.shape}")

    return layer_datasets

# 사용 예시
# layer_datasets = create_layer_datasets(processed_train_df)
# df_layer1 = layer_datasets['df_layer1']
# df_layer2 = layer_datasets['df_layer2']
# ...
# df_layer7 = layer_datasets['df_layer7']

In [None]:
layer_datasets = create_layer_datasets(processed_train_df)

df_layer1: (229624, 32)
df_layer2: (228279, 32)
df_layer3: (214987, 32)
df_layer4: (127766, 32)
df_layer5: (36762, 32)
df_layer6: (6269, 32)
df_layer7: (93, 32)


In [None]:
df_layer1 = layer_datasets['df_layer1']
df_layer2 = layer_datasets['df_layer2']
df_layer3 = layer_datasets['df_layer3']
df_layer4 = layer_datasets['df_layer4']
df_layer5 = layer_datasets['df_layer5']
df_layer6 = layer_datasets['df_layer6']
df_layer7 = layer_datasets['df_layer7']

In [None]:
df_layer1.iloc[0]

Unnamed: 0,0
product_id,1500855633
title,Driftwood Solar Lamp Handcrafted Off Grid Sola...
description,Free shipping in Canada and the USA. This one...
tags,"Driftwood Lamp,driftwood solar,nautical,cottag..."
type,physical
room,patio & outdoor
craft_type,
recipient,
material,wood
occasion,


In [None]:
df_bottom = processed_train_df.copy()

In [None]:
print(df_layer1.shape)
print(df_layer2.shape)
print(df_layer3.shape)
print(df_layer4.shape)
print(df_layer5.shape)
print(df_layer6.shape)
print(df_layer7.shape)
print(df_bottom.shape)

(229624, 32)
(228279, 32)
(214987, 32)
(127766, 32)
(36762, 32)
(6269, 32)
(93, 32)
(229624, 32)


In [None]:
df_layer6.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,secondary_color_text,layer1_category_text,layer2_category_text,layer3_category_text,layer4_category_text,layer5_category_text,layer6_category_text,layer7_category_text,all_features,processed_text_all
358,1525840934,"Personalised White Champagne Prosecco Flutes, ...",Personalised Champagne Flute\n\nThis product f...,"BIRTHDAY GIFT,NOVELTY GIFT,UNIQUE GIFT,GIFT FO...",physical,,,,plastic,wedding,...,white,home_and_living,home_and_living>kitchen_and_dining,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,,"Personalised White Champagne Prosecco Flutes, ...",personalised white champagne prosecco flutes b...
359,897638623,"Set of two crystal champagne flutes, hand pain...","Duo of two pretty golden flutes, hand painted....",,physical,,,,glass,wedding,...,clear,home_and_living,home_and_living>kitchen_and_dining,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,,"Set of two crystal champagne flutes, hand pain...",set of two crystal champagne flutes hand paint...
360,737273061,Gold bridesmaid champagne flutes- bridesmaid g...,This listing is for one(1) stemless glass cham...,"bride glass,bride flute,bride champagne,weddin...",physical,,,,glass,wedding,...,gold,home_and_living,home_and_living>kitchen_and_dining,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,,Gold bridesmaid champagne flutes- bridesmaid g...,gold bridesmaid champagne flutes bridesmaid gi...
361,1174874769,Champagne Flute Colour of Love painted glass (...,Valentines Colour of Love\n\nHand Painted Cham...,"Champagne,Floral glass,Prosecco,Wedding gift,B...",physical,,,,glass,,...,rainbow,home_and_living,home_and_living>kitchen_and_dining,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,,Champagne Flute Colour of Love painted glass (...,champagne flute colour of love painted glass c...
362,1406616499,2 Vintage Belfor Exquisite Crystal 6oz Short S...,Two Vintage Belfor Exquisite Crystal 6oz Short...,"vintage barware,vintage crystal,vintage Belfor...",physical,,,,glass,bachelorette party,...,black,home_and_living,home_and_living>kitchen_and_dining,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,home_and_living>kitchen_and_dining>drink_and_b...,,2 Vintage Belfor Exquisite Crystal 6oz Short S...,2 vintage belfor exquisite crystal 6oz short s...


In [None]:
#df_layer7

In [None]:
basic_path = '/content/drive/My Drive/DCU/Machine Learning/level/'

In [None]:
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

basic_path = '/content/drive/My Drive/DCU/Machine Learning/level/'
def generate_lev_df(df, layer):
  df[f'{layer}_category'] = '__label__' + df[f'{layer}_category_text'].astype(str)
  df[f'{layer}_category_with_text'] = df[f'{layer}_category'] + " " + df['processed_text_all']

  return df

def save_input_file(df, layer):
  # Split training/validation data
  train, val = train_test_split(
    df,
    test_size=0.2,
    stratify=df[f'{layer}_category_text'],
    random_state=42
)
  print(f"Training data size: {len(train)}")
  print(f"Validation data size: {len(val)}")

  train.to_csv(f"{basic_path}/{layer}.train", columns=[f'{layer}_category_with_text'], index=False, header=False)
  val.to_csv(f"{basic_path}/{layer}.val", columns=[f'{layer}_category_with_text'], index=False, header=False)

In [None]:
df_layer1_input = generate_lev_df(df_layer1, layer='layer1')
df_layer2_input = generate_lev_df(df_layer2, layer='layer2')
df_layer3_input = generate_lev_df(df_layer3, layer='layer3')
df_layer4_input = generate_lev_df(df_layer4, layer='layer4')
df_layer5_input = generate_lev_df(df_layer5, layer='layer5')
df_layer6_input = generate_lev_df(df_layer6, layer='layer6')
df_layer7_input = generate_lev_df(df_layer7, layer='layer7')
df_bottom_input = generate_lev_df(df_bottom, layer='bottom')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{layer}_category'] = '__label__' + df[f'{layer}_category_text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{layer}_category_with_text'] = df[f'{layer}_category'] + " " + df['processed_text_all']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{layer}_category'] = '__la

In [None]:
df_bottom_input.shape

(229624, 34)

In [None]:
save_input_file(df_layer1_input, layer='layer1')
save_input_file(df_layer2_input, layer='layer2')
save_input_file(df_layer3_input, layer='layer3')
save_input_file(df_layer4_input, layer='layer4')
save_input_file(df_layer5_input, layer='layer5')
save_input_file(df_layer6_input, layer='layer6')
save_input_file(df_layer7_input, layer='layer7')
save_input_file(df_bottom_input, layer='bottom')

Training data size: 183699
Validation data size: 45925
Training data size: 182623
Validation data size: 45656
Training data size: 171989
Validation data size: 42998
Training data size: 102212
Validation data size: 25554
Training data size: 29409
Validation data size: 7353
Training data size: 5015
Validation data size: 1254
Training data size: 74
Validation data size: 19
Training data size: 183699
Validation data size: 45925


In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313502 sha256=70478855585630d9b40f32b1333f81fba6cc39000659a7c5b18ec7da6dd909db
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
import fasttext

In [None]:

layer1_model = fasttext.train_supervised(
    input=f'{basic_path}/layer1.train',
    lr = 1.5,
    dim = 70,
    epoch = 6,
    ws = 6,
    wordNgrams = 3,
    loss = 'ova'
    )
result = layer1_model.test(f"{basic_path}/layer1.val")
layer1_model.save_model(f"{basic_path}/layer1.bin")

In [None]:
result

(45925, 0.8996407185628742, 0.8996407185628742)

In [None]:
model_layer2 = fasttext.train_supervised(
    input=f'{basic_path}/layer2.train',
    lr = 1.2,
    dim = 70,
    epoch = 6,
    ws = 6,
    wordNgrams = 3,
    loss = 'ova'
    )
result = model_layer2.test(f"{basic_path}/layer2.val")
print(result)

(45656, 0.823330997021202, 0.823330997021202)


In [None]:
model_layer2.save_model(f"{basic_path}/layer2.bin")

In [None]:
model_layer3 = fasttext.train_supervised(
    input=f'{basic_path}/layer3.train',
    lr = 1.0,
    dim = 100,
    epoch = 10,
    ws = 6,
    wordNgrams = 2,
    loss = 'softmax'
    )
result = model_layer3.test(f"{basic_path}/layer3.val")
print(result)
#76

(42998, 0.7386622633610865, 0.7386622633610865)


In [None]:
model_layer3.save_model(f"{basic_path}/layer3.bin")

In [None]:
model_layer4 = fasttext.train_supervised(
    input=f'{basic_path}/layer4.train',
    lr = 1.0,
    dim = 100,
    epoch = 15,
    ws = 6,
    wordNgrams = 3,
    loss = 'softmax'
    )
result = model_layer4.test(f"{basic_path}/layer4.val")
print(result)

(25554, 0.748219456836503, 0.748219456836503)


In [None]:
model_layer4.save_model(f"{basic_path}/layer4.bin")

In [None]:
model_layer5 = fasttext.train_supervised(
    input=f'{basic_path}/layer5.train',
    lr = 1.0,
    dim = 100,
    epoch = 15,
    ws = 6,
    wordNgrams = 3,
    loss = 'ova'
    )
result = model_layer5.test(f"{basic_path}/layer5.val")
print(result)

(7353, 0.8076975384196926, 0.8076975384196926)


In [None]:
model_layer5.save_model(f"{basic_path}/layer5.bin")

In [None]:
model_layer6 = fasttext.train_supervised(
    input=f'{basic_path}/layer6.train',
    lr = 1.2,
    dim = 100,
    epoch = 20,
    ws = 6,
    wordNgrams = 3,
    loss = 'ova'
    )
result = model_layer6.test(f"{basic_path}/layer6.val")
print(result)

(1254, 0.8532695374800638, 0.8532695374800638)


In [None]:
model_layer6.save_model(f"{basic_path}/layer6.bin")

In [None]:
model_layer7 = fasttext.train_supervised(
    input=f'{basic_path}/layer7.train',
    lr = 1.2,
    dim = 100,
    epoch = 20,
    ws = 6,
    wordNgrams = 3,
    loss = 'ova'
    )
result = model_layer7.test(f"{basic_path}/layer7.val")
print(result)

(19, 1.0, 1.0)


In [None]:
model_layer7.save_model(f"{basic_path}/layer7.bin")

In [None]:
model_bottom = fasttext.train_supervised(
    input=f'{basic_path}/bottom.train',
    lr = 1.2,
    dim = 100,
    epoch = 20,
    ws = 6,
    wordNgrams = 3,
    loss = 'softmax'
    )
result = model_bottom.test(f"{basic_path}/bottom.val")
print(result)

(45925, 0.6183342406096897, 0.6183342406096897)


In [None]:
model_bottom.save_model(f"{basic_path}/bottom.bin")

In [None]:
with open(f'{basic_path}/bottom.val', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Display a sample line
print(lines[100])

__label__art_and_collectibles.mixed_media_and_collage.other_assemblage fire and ice cigar pen in copper hardware handmade cigar style pen turned from hand poured swirled turquoise blue and metallic reddish brown resins which together evoke thoughts of fire and ice or perhaps a lava lamp pattern copper plating on the hardware black ink ballpoint accepts parker style refills n npremium hardware and silky smooth twist mechanism combined with a solid feel make this the ultimate in luxury handmade pens n nthe cigar pen is a classic design with its namesake shape having been used historically by many of the big names in vintage pens as well as current luxury pen manufacturers it adds a touch of elegance to any gentleman 39 s desk lady 39 s purse or family 39 s den n nincludes a presentation box perfect for presenting this hand crafted pen to that special loved one and or keeping it on display n nplease note this item is handmade to order so appearance e g swirl patterns may vary slightly fro

In [None]:
text = 'fire and ice cigar pen in copper hardware handmade cigar style pen turned from hand poured swirled turquoise blue and metallic reddish brown resins which together evoke thoughts of fire and ice or perhaps a lava lamp pattern copper plating on the hardware black ink ballpoint accepts parker style refills n npremium hardware and silky smooth twist mechanism combined with a solid feel make this the ultimate in luxury handmade pens n nthe cigar pen is a classic design with its namesake shape having been used historically by many of the big names in vintage pens as well as current luxury pen manufacturers it adds a touch of elegance to any gentleman 39 s desk lady 39 s purse or family 39 s den n nincludes a presentation box perfect for presenting this hand crafted pen to that special loved one and or keeping it on display n nplease note this item is handmade to order so appearance e g swirl patterns may vary slightly from photographs shown writing pen black ink shiny luxury lathe turning woodworking fine implement ballpoint big ben physical birthday christmas'

In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313485 sha256=f04088ec98c4a3cb34c4a51d66b622414eed56760a29eec2005058ad922e76ed
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
import fasttext

In [None]:
# Load the saved model
layer1_model = fasttext.load_model(f'{basic_path}/layer1.bin')
layer2_model = fasttext.load_model(f'{basic_path}/layer2.bin')
layer3_model = fasttext.load_model(f'{basic_path}/layer3.bin')
layer4_model = fasttext.load_model(f'{basic_path}/layer4.bin')
layer5_model = fasttext.load_model(f'{basic_path}/layer5.bin')
layer6_model = fasttext.load_model(f'{basic_path}/layer6.bin')
layer7_model = fasttext.load_model(f'{basic_path}/layer7.bin')
bottom_model = fasttext.load_model(f'{basic_path}/bottom.bin')

NameError: name 'basic_path' is not defined

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'art_and_collectibles': np.float32(0.945),
 'home_and_living': np.float32(0.028),
 'craft_supplies_and_tools': np.float32(0.015)}

In [None]:
def get_hierarchical_top_categories(above_dict, under_dict, top_k=3):
    """
    계층적 카테고리 예측: 상위 계층(above_dict) 확률을 하위 계층(under_dict)에 반영하여 최종 상위 카테고리 추출

    Args:
        above_dict (dict): 상위 계층의 확률 딕셔너리 (예: {'art': 0.9, 'home': 0.1})
        under_dict (dict): 하위 계층의 확률 딕셔너리 (예: {'art>painting': 0.2, 'home>furniture': 0.5})
        top_k (int): 반환할 상위 카테고리 개수 (기본값: 3)

    Returns:
        dict: 최종 확률 기준 상위 top_k개의 {카테고리: 최종 확률} 딕셔너리
              (예: {'art>painting': 0.18, 'home>furniture': 0.05})
    """
    # 1. 상위 계층에서 top_k개 카테고리 추출 (확률 높은 순)
    top_above = sorted(above_dict.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # 2. 하위 계층 후보들의 최종 확률 계산 (above_prob * under_prob)
    combined_probs = {}
    for above_cat, above_prob in top_above:
        # 상위 카테고리로 시작하는 하위 항목만 필터링
        under_items = {label: prob for label, prob in under_dict.items()
                      if label.startswith(above_cat + ">")}

        for under_label, under_prob in under_items.items():
            combined_probs[under_label] = above_prob * under_prob

    # 3. 최종 확률 기준 정렬 후 상위 top_k개 추출
    top_final = dict(sorted(combined_probs.items(), key=lambda x: x[1], reverse=True)[:top_k])
    return top_final

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'art_and_collectibles>sculpture': np.float32(0.014175),
 'art_and_collectibles>mixed_media_and_collage': np.float32(0.00567),
 'art_and_collectibles>drawing_and_illustration': np.float32(0.0037800001)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'art_and_collectibles>mixed_media_and_collage>other_assemblage': np.float32(0.00126441),
 'art_and_collectibles>sculpture>vessels': np.float32(0.001233225),
 'art_and_collectibles>drawing_and_illustration>pen_and_ink': np.float32(0.00027594)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{}

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=3)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'home_and_living.office.office_and_desk_storage': np.float32(0.305),
 'craft_supplies_and_tools.pens_pencils_and_marking_tools.nibs_and_nib_holders': np.float32(0.23),
 'home_and_living.office.office_and_school_supplies': np.float32(0.148)}

## test 2nd example

In [None]:
print(lines[1000])

__label__craft_supplies_and_tools.tools_and_equipment.tools.levels_and_measuring.needle_gauges knit chek stitch gauge susan bates needle sizer from 0 50 and hook sizer from b to u us 50 25mm ruler and stitch counter window 14099 this newly designed knit chek sizes up to 50mm needles and hooks it is an aluminum metal gauge for sizing both knitting needle and crochet hooks in us and metric sizes it has a handy imperial ruler on one edge and a metric ruler on the other as well as a very convenient 2 inch window for counting stitches and rows when checking your finished gauge susan bates knit chek gauge is perfect for crocheters too n n 10 refund adjustment on the listed price shown is through the direct link to get to this item and to get a 10 refund adjustment after purchasing is need help ask before buying as 10 refunds are not valid at our regular etsy shop the 10 shop has this banner pic https bit ly rrypattern n redrockyarns com is powered by etsy and backed by etsy s buyer protectio

In [None]:
text = 'knit chek stitch gauge susan bates needle sizer from 0 50 and hook sizer from b to u us 50 25mm ruler and stitch counter window 14099 this newly designed knit chek sizes up to 50mm needles and hooks it is an aluminum metal gauge for sizing both knitting needle and crochet hooks in us and metric sizes it has a handy imperial ruler on one edge and a metric ruler on the other as well as a very convenient 2 inch window for counting stitches and rows when checking your finished gauge susan bates knit chek gauge is perfect for crocheters too n n 10 refund adjustment on the listed price shown is through the direct link to get to this item and to get a 10 refund adjustment after purchasing is need help ask before buying as 10 refunds are not valid at our regular etsy shop the 10 shop has this banner pic https bit ly rrypattern n redrockyarns com is powered by etsy and backed by etsy s buyer protection valid when you sign into etsy to make the purchase n nsizes tools n knitting needles from size 0 to 50 2mm to 25mm n crochet hooks from size b to s that 50mm hole is a size u which isn 39 t marked as such for some reason n 2 window for counting stitches and rows of finished work for gauge purposes n measure in inches up to 5 25 n measure in centimeters up to 14cm n nspecifications n size 5 5 x 3 n aluminum brushed silver front with red rulers n nretail 2 55 nupc 077216040991 n n looking for other needle hook sizers lots can appear different on your screen than ours please inquire if there is a color concern most pictures are stock photos which have been enhanced to reflect what the product looks like the item you receive will not be the exact item shown in the pictures yarn dye lots and the manufacturing of yarn will differ between dye lots we try to match dye lots but it may not be possible if matched dye lots is a must please inquire before buying or at least leave us a message when buying so we can cancel if we don 39 t have enough of one dye lot to fulfill your order n nour shop has no pets no smoke no funky smells however yarn products especially can have a chemical smell due to the dying process smells can be intensified by air tight packaging and the environmental conditions including but limited to carrier smoking hot cold changes and humidity within the transport vehicle we also recycle cardboard boxes and bits and pieces of them for safe transit please keep this in mind if you are sensitive to smells or recycled cardboard n nitems noted as destash or clearance are final sales and not eligible for refunds or returns please ask if you have a concern or question about a product before purchasing n npolicies for more information please read our shop 39 s faq if you have a question or concern about an item or our shop policies please ask before purchasing knit chek knit check susan bates needle sizer hook sizer large hole stitch gauge row gauge silver aluminum stitch window large hole sizer susan bate knit chek physical'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=3)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'craft_supplies_and_tools.tools_and_equipment.tools.levels_and_measuring.needle_gauges': np.float32(0.366),
 'craft_supplies_and_tools.fabric_and_notions.notions.pins_and_needles.needles.hand_needles': np.float32(0.143),
 'craft_supplies_and_tools.fabric_and_notions.notions.stitch_holders': np.float32(0.132)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'craft_supplies_and_tools': np.float32(0.997),
 'clothing': np.float32(0.015),
 'accessories': np.float32(0.004)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'craft_supplies_and_tools>tools_and_equipment': np.float32(0.868387),
 'craft_supplies_and_tools>fabric_and_notions': np.float32(0.10967),
 'craft_supplies_and_tools>storage_and_organization': np.float32(0.010966999)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'craft_supplies_and_tools>tools_and_equipment>tools': np.float32(0.8249676),
 'craft_supplies_and_tools>fabric_and_notions>notions': np.float32(0.0037287802),
 'craft_supplies_and_tools>tools_and_equipment>parts': np.float32(0.000868387)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'craft_supplies_and_tools>tools_and_equipment>tools>levels_and_measuring': np.float32(0.675797),
 'craft_supplies_and_tools>tools_and_equipment>tools>spinning_and_winding': np.float32(0.023957059),
 'craft_supplies_and_tools>tools_and_equipment>tools>hooking': np.float32(0.006022264)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{'craft_supplies_and_tools>tools_and_equipment>tools>levels_and_measuring>needle_gauges': np.float32(0.024484126),
 'craft_supplies_and_tools>tools_and_equipment>tools>levels_and_measuring>rulers_and_yardsticks': np.float32(0.008406915),
 'craft_supplies_and_tools>tools_and_equipment>tools>levels_and_measuring>stitch_counters': np.float32(0.00452784)}

In [None]:
layer6_lable, layer6_probs = layer6_model.predict([text], k=len(layer6_model.get_labels()))
layer6_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer6_lable[0], layer6_probs[0])}
layer6_dict=get_hierarchical_top_categories(layer5_dict,layer6_dict)
layer6_dict

{'craft_supplies_and_tools>tools_and_equipment>tools>levels_and_measuring>rulers_and_yardsticks>rulers': np.float32(0.00019327497)}

In [None]:
# third test
print(lines[10000])

__label__craft_supplies_and_tools.tools_and_equipment.parts.tool_parts_and_accessories.sewing_machine_parts singer 12k new family fiddlebase sewing machine thread take up lever bolt spring lot 1 hi in the last few months i 39 ve listed many hundreds of singer sewing machine parts on 39 39 that other site 39 39 so now it 39 s time to start getting all my duplicate items here on etsy n ni 39 ve got literally thousands of singer parts here at any given moment for most of the main stream domestic singer models such as 12k 15k 27k 28k 127k 128k 66k 99k and the legendary 201k n ni started out collecting after i caught the bug following the purchase of my first machine to help me with my main job that of a craftsman and therefore because i only want beautiful machines for myself i only provide items that are in extremely good condition and i will take the time to clean and usually polish restore every item i offer for sale a very time consuming process but it 39 s worth it there 39 s nothing 

In [None]:
text = 'singer 12k new family fiddlebase sewing machine thread take up lever bolt spring lot 1 hi in the last few months i 39 ve listed many hundreds of singer sewing machine parts on 39 39 that other site 39 39 so now it 39 s time to start getting all my duplicate items here on etsy n ni 39 ve got literally thousands of singer parts here at any given moment for most of the main stream domestic singer models such as 12k 15k 27k 28k 127k 128k 66k 99k and the legendary 201k n ni started out collecting after i caught the bug following the purchase of my first machine to help me with my main job that of a craftsman and therefore because i only want beautiful machines for myself i only provide items that are in extremely good condition and i will take the time to clean and usually polish restore every item i offer for sale a very time consuming process but it 39 s worth it there 39 s nothing i hate more than brown gunked up stained and neglected sewing machines i try to get everything back to as near the day it was made many years ago n nso this listing is for a n nsinger 12k new family fiddle base sewing machine thread take up lever bolt spring n nsolid condition perfect for sprucing up your beloved machine n n please see the sewing machine parts section for many more similar items in the next few weeks i 39 ll be listing literally hundreds of singer accessories n nat times i may be able to be a little flexible on price especially on a multi purchase though as i mentioned i spend a lot of time and effort providing top quality items feel free to ask of course i 39 ll either say yes or no and i understand just how crazy postage prices have become to the u s etc feel free to drop me a line with any questions n nregards n nsonni singer sewing machine part foot attachment simanco original vintage restored embroidery craft darning physical'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'craft_supplies_and_tools.tools_and_equipment.parts.tool_parts_and_accessories.sewing_machine_parts': np.float32(0.997)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'craft_supplies_and_tools': np.float32(0.999),
 'clothing': np.float32(0.003),
 'bags_and_purses': np.float32(0.002)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'craft_supplies_and_tools>tools_and_equipment': np.float32(0.999),
 'craft_supplies_and_tools>storage_and_organization': np.float32(0.000999),
 'craft_supplies_and_tools>fabric_and_notions': np.float32(0.000999)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'craft_supplies_and_tools>tools_and_equipment>parts': np.float32(0.996003),
 'craft_supplies_and_tools>tools_and_equipment>equipment_and_machines': np.float32(0.000999),
 'craft_supplies_and_tools>tools_and_equipment>tools': np.float32(0.0)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'craft_supplies_and_tools>tools_and_equipment>parts>tool_parts_and_accessories': np.float32(0.994509),
 'craft_supplies_and_tools>tools_and_equipment>equipment_and_machines>sewing_and_needlework_machines': np.float32(9.0909003e-07),
 'craft_supplies_and_tools>tools_and_equipment>equipment_and_machines>button_makers': np.float32(2.1978e-07)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{'craft_supplies_and_tools>tools_and_equipment>parts>tool_parts_and_accessories>sewing_machine_parts': np.float32(0.9821572),
 'craft_supplies_and_tools>tools_and_equipment>parts>tool_parts_and_accessories>dies': np.float32(9.94509e-06),
 'craft_supplies_and_tools>tools_and_equipment>parts>tool_parts_and_accessories>burs': np.float32(9.94509e-06)}

In [None]:
layer6_lable, layer6_probs = layer6_model.predict([text], k=len(layer6_model.get_labels()))
layer6_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer6_lable[0], layer6_probs[0])}
layer6_dict=get_hierarchical_top_categories(layer5_dict,layer6_dict)
layer6_dict

{}

In [None]:
# fourth_test
print(lines[4646])

__label__bath_and_beauty.spa_and_relaxation.massage.massage_tools eye massage roller rose quartz eye massage roller eye massage eye roller rose quartz massage tool eye treatment eye massage tool amazing rose quartz eye massage roller with rose gold findings ethically source rose quartz no child labour involved no big commercial mining n ndon 39 t get me wrong we love our ethically sourced rose quartz gua sha 39 s and use them daily but they are not the best tool for the eye area no matter how gentle i used the gua sha under the eyes the skin got stretched too much and to be honest the gua sha didn 39 t remove any puffiness or dark circles under my eyes i use the gua sha daily to remove shoulder and neck tension and the gua sha does everything it is supposed to do for the face but sorry in my opinion it 39 s not good for the eye area after a long search i found this rose quartz eye massage roller i didn 39 t expect much and i didn 39 t even put the massage roller into the fridge i was s

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'bath_and_beauty.spa_and_relaxation.massage.massage_tools': np.float32(0.608)}

In [None]:
text = 'eye massage roller rose quartz eye massage roller eye massage eye roller rose quartz massage tool eye treatment eye massage tool amazing rose quartz eye massage roller with rose gold findings ethically source rose quartz no child labour involved no big commercial mining n ndon 39 t get me wrong we love our ethically sourced rose quartz gua sha 39 s and use them daily but they are not the best tool for the eye area no matter how gentle i used the gua sha under the eyes the skin got stretched too much and to be honest the gua sha didn 39 t remove any puffiness or dark circles under my eyes i use the gua sha daily to remove shoulder and neck tension and the gua sha does everything it is supposed to do for the face but sorry in my opinion it 39 s not good for the eye area after a long search i found this rose quartz eye massage roller i didn 39 t expect much and i didn 39 t even put the massage roller into the fridge i was surprised how easy the eye massage roller rolled over my eye area without stretching or pulling the skin it felt so good that i didn 39 t want to stop i have never felt so good under and above my eyes in my whole life i have now idea why i get dark circles and puffiness under my eyes like nobody else seems to get i used to look awful on photos no matter how much concealer i used but this little rose quartz eye massage roller made a difference in a very short time even if it wouldn 39 t make a visible difference i wouldn 39 t want to miss it i feel so refreshed after using it i usually don 39 t write often about personal experience but i thought i make an exception and share my personal experience with you maybe you find it helpful n nhere comes the professional part n nrose quartz face rollers are said to improve blood circulation and overall skin tone remove wrinkles and puffiness reduce dark undereye circles eliminate toxins and promote lymphatic drainage n n nhow to clean and store your pink rose quartz eye massage roller n nuse a soft cloth to gently wipe down the tool after every use occasionally use warm soapy water to hand wash it for a deeper clean after washing let the massage tool air dry store the eye roller in a baggie to avoid contamination especially if you keep it in the fridge n n n ngemstone crystal healing properties n n nrose quartz with its gentle pink essence is a stone of the heart a crystal of unconditional love it is the most important crystal of the heart and the heart chakra teaching the true essence of love and purifying and opening the heart at all levels n nrose quartz stimulates the proper functioning of the heart and circulatory system it aids in relieving tension and stress palpitations or skipped beats and may stabilise irregular heart rhythm rose quartz is a calming and reassuring crystal excellent for use in trauma or crisis including the emotional upheaval of mid life crisis it strengthens empathy sensitivity and aids in the acceptance of necessary change it is also an excellent stone for comforting grief n n nplease note that while the descriptions of the properties qualities and meanings of the crystals refer to healing benefits they are not intended to replace diagnosis of illness or ailments and do not imply a guarantee of effect healing means bringing mind body and spirit back into balance it does not imply a cure always consult your doctor or other health professional in the case of illness all associations mentioned are meant as a general information source only n n n nas crystals are natural each and every crystal carving has its own totally unique colouring inclusions and markings n n ni can combine postage of multiple items providing they remain in the same price bracket by dimensions and weight as provided by the postage provider for multiple item orders message me for shipping prices the cost is almost always less than appears in your basket at checkout more shipping options at check out finally i can offer you the premium quality storage bags so many of you have been waiting for i chose 100 linen because linen is much more breathable and hygienic for storing your gua sha and or eye massage tool in than polycotton is the linen pouches are of excellent quality thick and dense with excellent stitching they are not just for storing your gua sha and or rose quartz eye massage roller in of course you can store your tumble stones in them fill one with lavender for a goodnight sleep and or store one filled with lavender in your wardrobe or drawers moths hate the smell and won t go near your clothes i recently filled one of the bags with freshly roasted coffee beans and gave it as a present rose quartz eye massage roller eye massage tool massage roller gemstone crystal pink rose gold spa beauty beauty tool eye massage physical'

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'bath_and_beauty': np.float32(0.983),
 'home_and_living': np.float32(0.02),
 'weddings': np.float32(0.011)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'bath_and_beauty>spa_and_relaxation': np.float32(0.82080495),
 'bath_and_beauty>skin_care': np.float32(0.036371),
 'bath_and_beauty>bath_accessories': np.float32(0.017693998)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'bath_and_beauty>spa_and_relaxation>massage': np.float32(0.6976842),
 'bath_and_beauty>spa_and_relaxation>aromatherapy': np.float32(0.0041040247),
 'bath_and_beauty>spa_and_relaxation>spa_kits_and_gifts': np.float32(0.00082080497)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'bath_and_beauty>spa_and_relaxation>massage>massage_tools': np.float32(0.50064427),
 'bath_and_beauty>spa_and_relaxation>massage>massage_oils': np.float32(0.010144329)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 5) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{}

In [None]:
# fourth_test
print(lines[9962])

__label__clothing.gender_neutral_kids_clothing.jackets_and_coats butterfly friends size 5 or custom order omg this adorable jean jacket is perfect for butterfly lovers and we all know that butterflies have lots of flying friends everywhere so it just make sense n neach of our jackets are a one of a kind creation n nwe started by buying a brand new levi 39 s trucker jacket we love ribbon accents so we added just enough in our sewing studio we machine embroidered a large butterfly on the back and a bumble bee and dragon fly next came the butterfly patches that are hand stitched on we then made the adorable bee heart arm patches also handstitched on followed up pink butterflies stitched all around the collar finished with a bit of hand embroidered accents n nso fun to make and stunning jean jacket denim kids girls boys butterfly embroider embellish physical



In [None]:
# fifth_test
text = 'butterfly friends size 5 or custom order omg this adorable jean jacket is perfect for butterfly lovers and we all know that butterflies have lots of flying friends everywhere so it just make sense n neach of our jackets are a one of a kind creation n nwe started by buying a brand new levi 39 s trucker jacket we love ribbon accents so we added just enough in our sewing studio we machine embroidered a large butterfly on the back and a bumble bee and dragon fly next came the butterfly patches that are hand stitched on we then made the adorable bee heart arm patches also handstitched on followed up pink butterflies stitched all around the collar finished with a bit of hand embroidered accents n nso fun to make and stunning jean jacket denim kids girls boys butterfly embroider embellish physical'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'clothing.girls_clothing.jackets_and_coats': np.float32(0.635)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'clothing': np.float32(0.856),
 'accessories': np.float32(0.06),
 'toys_and_games': np.float32(0.013)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'clothing>girls_clothing': np.float32(0.362088),
 'clothing>gender_neutral_kids_clothing': np.float32(0.06848),
 'clothing>boys_clothing': np.float32(0.002568)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'clothing>girls_clothing>jackets_and_coats': np.float32(0.12636872),
 'clothing>gender_neutral_kids_clothing>jackets_and_coats': np.float32(0.03677376),
 'clothing>girls_clothing>sweaters': np.float32(0.002896704)}

In [None]:
with open(f'{basic_path}/bottom.val', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Display a sample line
print(lines[-1])

__label__home_and_living.lighting.light_accessories.light_switch_and_outlet_covers black and white home decor abstract art decor modern decor metal light switch cover switch plates and outlet covers free shipping luna gallery switch plates nprinted designs on metal switch plate covers nmade to last a lifetime n nstainless steel metal switch plate covers neco friendly printed designs nthe surface is flat nsemi gloss varnish finish nfree mounting screws are provided neasily cleaned with any non abrasive cleaner nproudly made in the usa n important information and sizing chart below n nsizes of switch plates nsingles 2 3 4 wide x 4 1 2 tall ndoubles 4 1 2 wide x 4 1 2 tall ntriples 6 1 2 wide x 4 1 2 tall nquadruples 8 wide x 4 1 2 tall nfor more plate configurations please see nhttps gallery switch plates nhigh quality metal switch plate covers with eco friendly printed designs neach plate is made at the time of order using a large format flatbed printer nthe design covers the top surfac

In [None]:
text = 'black and white home decor abstract art decor modern decor metal light switch cover switch plates and outlet covers free shipping luna gallery switch plates nprinted designs on metal switch plate covers nmade to last a lifetime n nstainless steel metal switch plate covers neco friendly printed designs nthe surface is flat nsemi gloss varnish finish nfree mounting screws are provided neasily cleaned with any non abrasive cleaner nproudly made in the usa n important information and sizing chart below n nsizes of switch plates nsingles 2 3 4 wide x 4 1 2 tall ndoubles 4 1 2 wide x 4 1 2 tall ntriples 6 1 2 wide x 4 1 2 tall nquadruples 8 wide x 4 1 2 tall nfor more plate configurations please see nhttps gallery switch plates nhigh quality metal switch plate covers with eco friendly printed designs neach plate is made at the time of order using a large format flatbed printer nthe design covers the top surface and the sides of the switch plates there is na very thin white border at the base of each switch plate where it would meet the wall n nthis cannot be avoided as the printer cannot reach that small section of the plate nas most of us know by now all monitors are different nand display slightly different colors i do my very best to represent nthe final product if you a planning a large room make over nand a color match is very important i strongly suggest npurchasing one to make sure you love it first n n n switch plate light switch plate switch plates triple toggle quadruple toggle double toggle switch plate cover outlet covers switch plate covers triple rocker plates switchplates metal switch plate unique switch plate physical living room metal'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'home_and_living.lighting.light_accessories.light_switch_and_outlet_covers': np.float32(1.0)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'home_and_living': np.float32(0.993),
 'electronics_and_accessories': np.float32(0.007),
 'art_and_collectibles': np.float32(0.0)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'home_and_living>lighting': np.float32(0.99002093),
 'home_and_living>home_decor': np.float32(0.001986),
 'home_and_living>home_appliances': np.float32(0.000993)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'home_and_living>lighting>light_accessories': np.float32(0.9890309),
 'home_and_living>lighting>ceiling_fans': np.float32(0.0),
 'home_and_living>lighting>lamps_shades_and_bases': np.float32(0.0)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'home_and_living>lighting>light_accessories>light_switch_and_outlet_covers': np.float32(0.9890309),
 'home_and_living>lighting>light_accessories>ceiling_medallions': np.float32(0.0),
 'home_and_living>lighting>light_accessories>light_pulls': np.float32(0.0)}

In [None]:
bottom_model.test(f"{basic_path}/bottom.val")

(45925, 0.6183342406096897, 0.6183342406096897)

In [None]:
bottom_model.test(f"{basic_path}/bottom.val")
test_samples = open(f"{basic_path}/bottom.val").readlines()

# 存储错误样本
wrong_samples = []

for line in test_samples:
    true_label = line.split()[0]  # FastText标签格式如 "__label__class1"
    text = " ".join(line.split()[1:])
    pred_label = bottom_model.predict([text])[0][0]  # 获取预测标签

    if pred_label != true_label:
        wrong_samples.append({
            "text": text,
            "true_label": true_label,
            "pred_label": pred_label
        })

# 转换为DataFrame分析
import pandas as pd
df_errors = pd.DataFrame(wrong_samples)

In [None]:
text = df_errors.iloc[2][0]

  text = df_errors.iloc[2][0]


In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'bags_and_purses.handbags.hobo_bags': np.float32(0.714)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'bags_and_purses': np.float32(0.969),
 'home_and_living': np.float32(0.037),
 'toys_and_games': np.float32(0.009)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'bags_and_purses>handbags': np.float32(0.878883),
 'bags_and_purses>fanny_packs': np.float32(0.21027298),
 'bags_and_purses>food_and_insulated_bags': np.float32(0.016473)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'bags_and_purses>handbags>shoulder_bags': np.float32(0.33661216),
 'bags_and_purses>handbags>crossbody_bags': np.float32(0.25487608),
 'bags_and_purses>handbags>hobo_bags': np.float32(0.19599092)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{}

In [None]:
df_errors.iloc[2][1]

  df_errors.iloc[2][1]


'__label__bags_and_purses.handbags.shoulder_bags'

In [None]:
type(text)

str

In [None]:
text = df_errors.iloc[15][0]

  text = df_errors.iloc[15][0]


In [None]:
df_errors.iloc[15][1]

  df_errors.iloc[15][1]


'__label__home_and_living.outdoor_and_garden.plants.bushes_and_trees'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'home_and_living.outdoor_and_garden.plants.bushes_and_trees': np.float32(0.838)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'home_and_living': np.float32(0.991),
 'craft_supplies_and_tools': np.float32(0.009),
 'accessories': np.float32(0.002)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'home_and_living>outdoor_and_garden': np.float32(0.89388204),
 'home_and_living>home_improvement': np.float32(0.0019820002),
 'home_and_living>spirituality_and_religion': np.float32(0.0019820002)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'home_and_living>outdoor_and_garden>plants': np.float32(0.89298815),
 'home_and_living>outdoor_and_garden>plant_accessories': np.float32(0.00089388207),
 'home_and_living>outdoor_and_garden>seeds_and_seed_bombs': np.float32(0.0)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'home_and_living>outdoor_and_garden>plants>bushes_and_trees': np.float32(0.81797713),
 'home_and_living>outdoor_and_garden>plants>bonsai': np.float32(0.016073786),
 'home_and_living>outdoor_and_garden>plants>fruit_and_vegetables': np.float32(0.0116088465)}

In [None]:
text = df_errors.iloc[1991][0]

  text = df_errors.iloc[1991][0]


In [None]:
df_errors.iloc[1991][1]

  df_errors.iloc[1991][1]


'__label__accessories.sunglasses_and_eyewear.glasses_cases'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'accessories.sunglasses_and_eyewear.glasses_cases': np.float32(0.834)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'accessories': np.float32(0.998),
 'books_movies_and_music': np.float32(0.015),
 'bags_and_purses': np.float32(0.009)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'accessories>sunglasses_and_eyewear': np.float32(0.95808),
 'accessories>keychains_and_lanyards': np.float32(0.008982),
 'accessories>gloves_and_mittens': np.float32(0.0059880004)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'accessories>sunglasses_and_eyewear>glasses_cases': np.float32(0.707063),
 'accessories>sunglasses_and_eyewear>glasses_chains': np.float32(0.03736512),
 'accessories>sunglasses_and_eyewear>eyeglass_stands': np.float32(0.030658562)}

In [None]:
text = df_errors.iloc[1993][0]

  text = df_errors.iloc[1993][0]


In [None]:
df_errors.iloc[1993][1]

  df_errors.iloc[1993][1]


'__label__electronics_and_accessories.docking_and_stands.stands'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'electronics_and_accessories.computers_and_peripherals.computers': np.float32(0.364)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'electronics_and_accessories': np.float32(0.991),
 'home_and_living': np.float32(0.009),
 'toys_and_games': np.float32(0.003)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'electronics_and_accessories>docking_and_stands': np.float32(0.479644),
 'electronics_and_accessories>audio': np.float32(0.127839),
 'electronics_and_accessories>video_games': np.float32(0.027748)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'electronics_and_accessories>docking_and_stands>stands': np.float32(0.2652431),
 'electronics_and_accessories>docking_and_stands>docking_stations': np.float32(0.0033575082),
 'electronics_and_accessories>audio>headphones_and_stands': np.float32(0.002812458)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'electronics_and_accessories>audio>headphones_and_stands>headphone_stands': np.float32(2.2499666e-05),
 'electronics_and_accessories>audio>headphones_and_stands>headphones': np.float32(2.8124582e-06)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{}

In [None]:
df_errors.iloc[2002]

Unnamed: 0,2002
text,hair band rose headband of roses roses on the ...
true_label,__label__accessories.hair_accessories.hair_jew...
pred_label,[__label__accessories.hair_accessories.wreaths...


In [None]:
text = df_errors.iloc[2002][0]

  text = df_errors.iloc[2002][0]


In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'accessories.hair_accessories.wreaths_and_tiaras.wreaths': np.float32(0.966)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'accessories': np.float32(0.999),
 'weddings': np.float32(0.126),
 'craft_supplies_and_tools': np.float32(0.002)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'accessories>hair_accessories': np.float32(0.999),
 'weddings>accessories': np.float32(0.00012600001),
 'accessories>hats_and_caps': np.float32(0.0)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'accessories>hair_accessories>hair_jewelry': np.float32(0.95904),
 'accessories>hair_accessories>wreaths_and_tiaras': np.float32(0.005994),
 'accessories>hair_accessories>headbands': np.float32(0.000999)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'accessories>hair_accessories>wreaths_and_tiaras>wreaths': np.float32(0.0053706244),
 'accessories>hair_accessories>headbands>headbands': np.float32(4.2957003e-05),
 'accessories>hair_accessories>headbands>baby_headbands': np.float32(9.990001e-07)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{}

In [None]:
df_errors.iloc[2002][1]

  df_errors.iloc[2002][1]


'__label__accessories.hair_accessories.hair_jewelry.hair_rings_and_charms'

In [None]:
df_errors.iloc[2111]

Unnamed: 0,2111
text,natural loofa sponge home grown organic ready ...
true_label,__label__craft_supplies_and_tools.beauty_suppl...
pred_label,[__label__bath_and_beauty.bath_accessories.loo...


In [None]:
text = df_errors.iloc[2111][0]

  text = df_errors.iloc[2111][0]


In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'bath_and_beauty.bath_accessories.loofahs': np.float32(0.612)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'craft_supplies_and_tools': np.float32(0.808),
 'bath_and_beauty': np.float32(0.16),
 'pet_supplies': np.float32(0.007)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'craft_supplies_and_tools>beauty_supplies': np.float32(0.033936),
 'bath_and_beauty>bath_accessories': np.float32(0.01664),
 'craft_supplies_and_tools>floral_arranging_supplies': np.float32(0.00808)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'craft_supplies_and_tools>beauty_supplies>soap_supplies': np.float32(0.020395538),
 'bath_and_beauty>bath_accessories>loofahs': np.float32(0.00184704),
 'bath_and_beauty>bath_accessories>sponges_and_body_brushes': np.float32(0.0002496)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'craft_supplies_and_tools>beauty_supplies>soap_supplies>soap_embeds': np.float32(0.0020395538)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{}

In [None]:
df_errors.iloc[2111][1]

  df_errors.iloc[2111][1]


'__label__craft_supplies_and_tools.beauty_supplies.soap_supplies.soap_embeds'

In [None]:
df_errors.iloc[5111]

Unnamed: 0,5111
text,hand printed upcycled youth small dragonfly lo...
true_label,__label__clothing.boys_clothing.sweaters.pullo...
pred_label,[__label__clothing.gender_neutral_kids_clothin...


In [None]:
text = df_errors.iloc[5111][0]

  text = df_errors.iloc[5111][0]


In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'clothing.gender_neutral_kids_clothing.sweaters': np.float32(0.596)}

In [None]:
layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
layer1_dict

{'clothing': np.float32(1.0),
 'pet_supplies': np.float32(0.0),
 'books_movies_and_music': np.float32(0.0)}

In [None]:
layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
layer2_dict

{'clothing>boys_clothing': np.float32(0.92),
 'clothing>gender_neutral_kids_clothing': np.float32(0.257),
 'clothing>girls_clothing': np.float32(0.018)}

In [None]:
layer3_lable, layer3_probs = layer3_model.predict([text], k=len(layer3_model.get_labels()))
layer3_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer3_lable[0], layer3_probs[0])}
layer3_dict=get_hierarchical_top_categories(layer2_dict,layer3_dict)
layer3_dict

{'clothing>boys_clothing>sweaters': np.float32(0.86572),
 'clothing>gender_neutral_kids_clothing>sweaters': np.float32(0.0043690004),
 'clothing>girls_clothing>sweaters': np.float32(0.000666)}

In [None]:
layer4_lable, layer4_probs = layer4_model.predict([text], k=len(layer4_model.get_labels()))
layer4_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer4_lable[0], layer4_probs[0])}
layer4_dict=get_hierarchical_top_categories(layer3_dict,layer4_dict)
layer4_dict

{'clothing>boys_clothing>sweaters>pullover_sweaters': np.float32(0.67353016),
 'clothing>boys_clothing>sweaters>cardigans': np.float32(0.00086572004),
 'clothing>girls_clothing>sweaters>pullover_sweaters': np.float32(1.7982e-05)}

In [None]:
layer5_lable, layer5_probs = layer5_model.predict([text], k=len(layer5_model.get_labels()))
layer5_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer5_lable[0], layer5_probs[0])}
layer5_dict=get_hierarchical_top_categories(layer4_dict,layer5_dict)
layer5_dict

{}

In [None]:
df_errors.iloc[5111][1]

  df_errors.iloc[5111][1]


'__label__clothing.boys_clothing.sweaters.pullover_sweaters'

In [None]:
bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
bottom_dict

{'clothing.gender_neutral_kids_clothing.sweaters': np.float32(0.596)}

In [None]:
key = list(bottom_dict.keys())[0]
print(type(key))

<class 'str'>


In [None]:
def count_levels(bottom_dict):
    # 딕셔너리에서 첫 번째 키를 가져옵니다
    first_key = next(iter(bottom_dict.keys()))

    # 키에서 마침표(.)의 개수를 세고 1을 더합니다
    # 마침표 개수 + 1 = 계층 수
    levels = first_key.count('.') + 1

    return levels

# 예시
depth = count_levels(bottom_dict)
print(depth)  # 출력: 3

3


In [None]:
def run_layer_model(text):
    """
    주어진 depth에 따라 계층 모델을 동적으로 실행

    Args:
        depth (int): 목표 계층 깊이 (예: 3 → layer1 ~ layer3 실행)
        text (str): 분류할 텍스트

    Returns:
        dict: 최종 계층의 예측 결과 (예: layer3_dict)
    """
    bottom_lable, bottom_probs = bottom_model.predict([text], k=1)
    bottom_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(bottom_lable[0], bottom_probs[0])}
    depth = count_levels(bottom_dict)

    # Layer1 처리
    layer_labels, layer_probs = layer1_model.predict([text], k=3)
    current_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(layer_labels[0], layer_probs[0])
    }

    # Layer2 ~ Layer(depth) 처리
    for layer_num in range(2, depth + 1):
        model = globals()[f'layer{layer_num}_model']  # 동적 모델 호출
        labels, probs = model.predict([text], k=3)
        next_dict = {
            label.replace('__label__', ''): round(prob, 3)
            for label, prob in zip(labels[0], probs[0])
        }

        # 상위 계층과의 일관성 필터링
        current_dict = get_hierarchical_top_categories(current_dict, next_dict)
    if list(current_dict.keys())[0] == list(bottom_dict.keys())[0]:
      pred_bottom_category = list(current_dict.keys())[0]
    else:
      while(next_dict):
        layer_num += 1
        model = globals()[f'layer{layer_num}_model']  # 동적 모델 호출
        labels, probs = model.predict([text], k=3)
        next_dict = {
            label.replace('__label__', ''): round(prob, 3)
            for label, prob in zip(labels[0], probs[0])
        }




    return pred_bottom_category

SyntaxError: expected ':' (<ipython-input-418-08cb87c8cb41>, line 35)

In [None]:
def run_layer_model(text):
    """
    Bottom 모델과 계층 모델 예측을 비교해 최종 카테고리 결정

    Args:
        text (str): 분류할 텍스트

    Returns:
        str: 최종 카테고리 경로
    """
    # 1. Bottom 모델 예측
    bottom_labels, bottom_probs = bottom_model.predict([text], k=1)
    bottom_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(bottom_labels[0], bottom_probs[0])
    }
    bottom_path = list(bottom_dict.keys())[0]
    depth = count_levels(bottom_dict)

    # 2. Layer1 처리 (고정)
    layer_labels, layer_probs = layer1_model.predict([text], k=3)
    current_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(layer_labels[0], layer_probs[0])
    }

    # 3. 최초 비교 (수정하지 말아야 할 부분)
    if list(current_dict.keys())[0] == bottom_path:
        return bottom_path

    # 4. 불일치 시 추가 계층 탐색 (While 루프)
    layer_num = 1  # Layer1부터 시작
    while True:
        layer_num += 1  # 다음 계층 이동 (Layer2 → Layer3 → ...)

        try:
            # 동적 모델 호출
            model = globals()[f'layer{layer_num}_model']
            labels, probs = model.predict([text], k=3)
            next_dict = {
                label.replace('__label__', ''): round(prob, 3)
                for label, prob in zip(labels[0], probs[0])
            }

            # 상위 계층과 일관성 필터링
            next_dict = get_hierarchical_top_categories(current_dict, next_dict)

            # 빈 딕셔너리 처리
            if not next_dict:
                return max(current_dict.items(), key=lambda x: x[1])[0]

            # 현재 계층 업데이트
            current_dict = next_dict

            # Bottom 경로와 다시 비교
            if list(current_dict.keys())[0] == bottom_path:
                return bottom_path

        except (KeyError, IndexError):  # 모델이 없거나 예측 실패
            return max(current_dict.items(), key=lambda x: x[1])[0]

In [None]:
run_layer_model(text)

'craft_supplies_and_tools>beauty_supplies>soap_supplies>soap_embeds'

In [None]:
text = df_errors.iloc[2111][0]

  text = df_errors.iloc[2111][0]


In [None]:
run_layer_model(text)

'craft_supplies_and_tools>beauty_supplies>soap_supplies>soap_embeds'

In [None]:
text = df_errors.iloc[2002][0]

  text = df_errors.iloc[2002][0]


In [None]:
run_layer_model(text)

'weddings>accessories>hair_accessories>wreaths_and_tiaras>wreaths'

In [None]:
df_errors.iloc[2002][1]

  df_errors.iloc[2002][1]


'__label__accessories.hair_accessories.hair_jewelry.hair_rings_and_charms'

In [None]:
text = df_errors.iloc[1][0]

  text = df_errors.iloc[1][0]


In [None]:
run_layer_model(text)

'home_and_living>home_appliances>irons_and_steamers>irons'

In [None]:
df_errors.iloc[1][1]

  df_errors.iloc[1][1]


'__label__home_and_living.home_appliances.irons_and_steamers.irons'

In [None]:
def count_levels(bottom_dict):
    """
    카테고리 경로의 계층 수를 계산

    Args:
        bottom_dict (dict): 카테고리 경로가 키인 딕셔너리

    Returns:
        int: 계층 수 (마침표 개수 + 1)
    """
    first_key = next(iter(bottom_dict.keys()))
    levels = first_key.count('.') + 1
    return levels


def get_hierarchical_top_categories(above_dict, under_dict, top_k=3):
    """
    계층적 카테고리 예측: 상위 계층(above_dict) 확률을 하위 계층(under_dict)에 반영하여 최종 상위 카테고리 추출

    Args:
        above_dict (dict): 상위 계층의 확률 딕셔너리 (예: {'art': 0.9, 'home': 0.1})
        under_dict (dict): 하위 계층의 확률 딕셔너리 (예: {'art>painting': 0.2, 'home>furniture': 0.5})
        top_k (int): 반환할 상위 카테고리 개수 (기본값: 3)

    Returns:
        dict: 최종 확률 기준 상위 top_k개의 {카테고리: 최종 확률} 딕셔너리
              (예: {'art>painting': 0.18, 'home>furniture': 0.05})
    """
    # 1. 상위 계층에서 top_k개 카테고리 추출 (확률 높은 순)
    top_above = sorted(above_dict.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # 2. 하위 계층 후보들의 최종 확률 계산 (above_prob * under_prob)
    combined_probs = {}
    for above_cat, above_prob in top_above:
        # 상위 카테고리로 시작하는 하위 항목만 필터링
        under_items = {label: prob for label, prob in under_dict.items()
                      if label.startswith(above_cat + ">")}

        for under_label, under_prob in under_items.items():
            combined_probs[under_label] = above_prob * under_prob

    # 3. 최종 확률 기준 정렬 후 상위 top_k개 추출
    top_final = dict(sorted(combined_probs.items(), key=lambda x: x[1], reverse=True)[:top_k])
    return top_final


def run_layer_model(text):
    """
    Bottom 모델과 계층 모델 예측을 비교해 최종 카테고리 결정

    Args:
        text (str): 분류할 텍스트

    Returns:
        str: 최종 카테고리 경로
    """
    # 1. Bottom 모델 예측
    bottom_labels, bottom_probs = bottom_model.predict([text], k=1)
    bottom_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(bottom_labels[0], bottom_probs[0])
    }
    bottom_path = list(bottom_dict.keys())[0]

    # 2. Layer1 처리 (고정)
    layer_labels, layer_probs = layer1_model.predict([text], k=3)
    current_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(layer_labels[0], layer_probs[0])
    }

    # 3. 최초 비교
    if list(current_dict.keys())[0] == bottom_path:
        return bottom_path

    # 4. 불일치 시 추가 계층 탐색
    layer_num = 1  # Layer1부터 시작
    max_layer = 6  # 최대 레이어 수

    while layer_num < max_layer:
        layer_num += 1  # 다음 계층 이동 (Layer2 → Layer3 → ...)

        try:
            # 동적 모델 호출
            model = globals()[f'layer{layer_num}_model']
            labels, probs = model.predict([text], k=len(model.get_labels()))
            next_dict = {
                label.replace('__label__', ''): round(prob, 3)
                for label, prob in zip(labels[0], probs[0])
            }

            # 상위 계층과 일관성 필터링
            next_dict = get_hierarchical_top_categories(current_dict, next_dict)

            # 빈 딕셔너리 처리
            if not next_dict:
                return max(current_dict.items(), key=lambda x: x[1])[0]

            # 현재 계층 업데이트
            current_dict = next_dict

            # Bottom 경로와 비교 - 일치하면 즉시 반환
            if list(current_dict.keys())[0] == bottom_path:
                return bottom_path

        except (KeyError, IndexError):  # 모델이 없거나 예측 실패
            return max(current_dict.items(), key=lambda x: x[1])[0]

    # 5. 최대 레이어까지 도달했을 때의 최종 결과
    return max(current_dict.items(), key=lambda x: x[1])[0]

In [None]:
run_layer_model(text)

'home_and_living>home_appliances>irons_and_steamers>irons'

In [None]:
text = df_errors.iloc[2002][0]

  text = df_errors.iloc[2002][0]


In [None]:
run_layer_model(text)

'accessories>hair_accessories>wreaths_and_tiaras>wreaths'

In [None]:
df_errors.iloc[2002][1]

  df_errors.iloc[2002][1]


'__label__accessories.hair_accessories.hair_jewelry.hair_rings_and_charms'

In [None]:
text = df_errors.iloc[22][0]

  text = df_errors.iloc[22][0]


In [None]:
run_layer_model(text)

'electronics_and_accessories>cameras_and_equipment>lenses_and_filters>lenses'

In [None]:
df_errors.iloc[22][1]

  df_errors.iloc[22][1]


'__label__electronics_and_accessories.cameras_and_equipment.lenses_and_filters.lenses'

In [None]:
text = df_errors.iloc[198][0]

  text = df_errors.iloc[198][0]


In [None]:
run_layer_model(text)

'clothing>girls_clothing>pajamas_and_robes>pajamas>sets'

In [None]:
df_errors.iloc[198][1]

  df_errors.iloc[198][1]


'__label__clothing.girls_clothing.pajamas_and_robes.pajamas'

In [None]:
text = df_errors.iloc[5111][0]

  text = df_errors.iloc[5111][0]


In [None]:
run_layer_model(text)

'clothing>boys_clothing>sweaters>pullover_sweaters'

In [None]:
df_errors.iloc[5111][1]

  df_errors.iloc[5111][1]


'__label__clothing.boys_clothing.sweaters.pullover_sweaters'

In [None]:
# third test
print(lines[10000])

__label__craft_supplies_and_tools.tools_and_equipment.parts.tool_parts_and_accessories.sewing_machine_parts singer 12k new family fiddlebase sewing machine thread take up lever bolt spring lot 1 hi in the last few months i 39 ve listed many hundreds of singer sewing machine parts on 39 39 that other site 39 39 so now it 39 s time to start getting all my duplicate items here on etsy n ni 39 ve got literally thousands of singer parts here at any given moment for most of the main stream domestic singer models such as 12k 15k 27k 28k 127k 128k 66k 99k and the legendary 201k n ni started out collecting after i caught the bug following the purchase of my first machine to help me with my main job that of a craftsman and therefore because i only want beautiful machines for myself i only provide items that are in extremely good condition and i will take the time to clean and usually polish restore every item i offer for sale a very time consuming process but it 39 s worth it there 39 s nothing 

In [None]:
text = ' singer 12k new family fiddlebase sewing machine thread take up lever bolt spring lot 1 hi in the last few months i 39 ve listed many hundreds of singer sewing machine parts on 39 39 that other site 39 39 so now it 39 s time to start getting all my duplicate items here on etsy n ni 39 ve got literally thousands of singer parts here at any given moment for most of the main stream domestic singer models such as 12k 15k 27k 28k 127k 128k 66k 99k and the legendary 201k n ni started out collecting after i caught the bug following the purchase of my first machine to help me with my main job that of a craftsman and therefore because i only want beautiful machines for myself i only provide items that are in extremely good condition and i will take the time to clean and usually polish restore every item i offer for sale a very time consuming process but it 39 s worth it there 39 s nothing i hate more than brown gunked up stained and neglected sewing machines i try to get everything back to as near the day it was made many years ago n nso this listing is for a n nsinger 12k new family fiddle base sewing machine thread take up lever bolt spring n nsolid condition perfect for sprucing up your beloved machine n n please see the sewing machine parts section for many more similar items in the next few weeks i 39 ll be listing literally hundreds of singer accessories n nat times i may be able to be a little flexible on price especially on a multi purchase though as i mentioned i spend a lot of time and effort providing top quality items feel free to ask of course i 39 ll either say yes or no and i understand just how crazy postage prices have become to the u s etc feel free to drop me a line with any questions n nregards n nsonni singer sewing machine part foot attachment simanco original vintage restored embroidery craft darning physical'

In [None]:
run_layer_model(text)

'craft_supplies_and_tools>tools_and_equipment>parts>tool_parts_and_accessories>sewing_machine_parts'

In [None]:
with open(f'{basic_path}/bottom.val', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Display a sample line
print(lines[-1])

__label__home_and_living.lighting.light_accessories.light_switch_and_outlet_covers black and white home decor abstract art decor modern decor metal light switch cover switch plates and outlet covers free shipping luna gallery switch plates nprinted designs on metal switch plate covers nmade to last a lifetime n nstainless steel metal switch plate covers neco friendly printed designs nthe surface is flat nsemi gloss varnish finish nfree mounting screws are provided neasily cleaned with any non abrasive cleaner nproudly made in the usa n important information and sizing chart below n nsizes of switch plates nsingles 2 3 4 wide x 4 1 2 tall ndoubles 4 1 2 wide x 4 1 2 tall ntriples 6 1 2 wide x 4 1 2 tall nquadruples 8 wide x 4 1 2 tall nfor more plate configurations please see nhttps gallery switch plates nhigh quality metal switch plate covers with eco friendly printed designs neach plate is made at the time of order using a large format flatbed printer nthe design covers the top surfac

In [None]:
pip install fasttext



# 3.Predicting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
basic_path = '/content/drive/My Drive/DCU/Machine Learning/level/'

In [None]:
import fasttext
# Load the saved model
layer1_model = fasttext.load_model(f'{basic_path}/layer1.bin')
layer2_model = fasttext.load_model(f'{basic_path}/layer2.bin')
layer3_model = fasttext.load_model(f'{basic_path}/layer3.bin')
layer4_model = fasttext.load_model(f'{basic_path}/layer4.bin')
layer5_model = fasttext.load_model(f'{basic_path}/layer5.bin')
layer6_model = fasttext.load_model(f'{basic_path}/layer6.bin')
layer7_model = fasttext.load_model(f'{basic_path}/layer7.bin')
bottom_model = fasttext.load_model(f'{basic_path}/bottom.bin')

In [None]:
def count_levels(bottom_dict):
    """
    카테고리 경로의 계층 수를 계산

    Args:
        bottom_dict (dict): 카테고리 경로가 키인 딕셔너리

    Returns:
        int: 계층 수 (마침표 개수 + 1)
    """
    first_key = next(iter(bottom_dict.keys()))
    levels = first_key.count('.') + 1
    return levels


def get_hierarchical_top_categories(above_dict, under_dict, top_k=3):
    """
    계층적 카테고리 예측: 상위 계층(above_dict) 확률을 하위 계층(under_dict)에 반영하여 최종 상위 카테고리 추출

    Args:
        above_dict (dict): 상위 계층의 확률 딕셔너리 (예: {'art': 0.9, 'home': 0.1})
        under_dict (dict): 하위 계층의 확률 딕셔너리 (예: {'art>painting': 0.2, 'home>furniture': 0.5})
        top_k (int): 반환할 상위 카테고리 개수 (기본값: 3)

    Returns:
        dict: 최종 확률 기준 상위 top_k개의 {카테고리: 최종 확률} 딕셔너리
              (예: {'art>painting': 0.18, 'home>furniture': 0.05})
    """
    # 1. 상위 계층에서 top_k개 카테고리 추출 (확률 높은 순)
    top_above = sorted(above_dict.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # 2. 하위 계층 후보들의 최종 확률 계산 (above_prob * under_prob)
    combined_probs = {}
    for above_cat, above_prob in top_above:
        # 상위 카테고리로 시작하는 하위 항목만 필터링
        under_items = {label: prob for label, prob in under_dict.items()
                      if label.startswith(above_cat + ">")}

        for under_label, under_prob in under_items.items():
            combined_probs[under_label] = above_prob * under_prob

    # 3. 최종 확률 기준 정렬 후 상위 top_k개 추출
    top_final = dict(sorted(combined_probs.items(), key=lambda x: x[1], reverse=True)[:top_k])
    return top_final



# 모든 모델을 미리 딕셔너리에 로드
loaded_models = {}

# 모델 로딩 함수
def load_all_models():
    global loaded_models
    # Bottom 모델 로드
    loaded_models['bottom_model'] = bottom_model

    # Layer 모델들 로드 (Layer1부터 Layer6까지)
    for i in range(1, 7):
        model_name = f'layer{i}_model'
        try:
            model = globals()[model_name]
            loaded_models[model_name] = model
            print(f"{model_name} 로드 완료")
        except (KeyError, NameError):
            print(f"{model_name}은 존재하지 않습니다.")

# 데이터 처리 전에 모델 미리 로드
load_all_models()

# run_layer_model 함수 수정
def run_layer_model(text):
    """
    Bottom 모델과 계층 모델 예측을 비교해 최종 카테고리 결정

    Args:
        text (str): 분류할 텍스트

    Returns:
        str: 최종 카테고리 경로
    """
    # 1. Bottom 모델 예측
    bottom_model = loaded_models['bottom_model']
    bottom_labels, bottom_probs = bottom_model.predict([text], k=1)
    bottom_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(bottom_labels[0], bottom_probs[0])
    }
    bottom_path = list(bottom_dict.keys())[0]
    bottom_path = bottom_path.replace('.', '>')

    # 2. Layer1 처리 (고정)
    layer1_model = loaded_models['layer1_model']
    layer_labels, layer_probs = layer1_model.predict([text], k=3)
    current_dict = {
        label.replace('__label__', ''): round(prob, 3)
        for label, prob in zip(layer_labels[0], layer_probs[0])
    }

    # 3. 최초 비교
    if list(current_dict.keys())[0] == bottom_path:
        return bottom_path

    # 4. 불일치 시 추가 계층 탐색
    layer_num = 1
    max_layer = 6

    while layer_num < max_layer:
        layer_num += 1

        try:
            # 미리 로드된 모델 사용
            model_name = f'layer{layer_num}_model'
            if model_name not in loaded_models:
                # 모델이 없으면 현재 결과 반환
                return max(current_dict.items(), key=lambda x: x[1])[0]

            model = loaded_models[model_name]
            labels, probs = model.predict([text], k=len(model.get_labels()))
            next_dict = {
                label.replace('__label__', ''): round(prob, 3)
                for label, prob in zip(labels[0], probs[0])
            }

            # 상위 계층과 일관성 필터링
            next_dict = get_hierarchical_top_categories(current_dict, next_dict)

            if list(next_dict.keys())[0] == bottom_path:
                return bottom_path

            # 빈 딕셔너리 처리
            if not next_dict:
                return max(current_dict.items(), key=lambda x: x[1])[0]

            # 현재 계층 업데이트
            current_dict = next_dict

        except (KeyError, IndexError):  # 모델이 없거나 예측 실패
            return max(current_dict.items(), key=lambda x: x[1])[0]




# def run_layer_model(text):
#     """
#     Bottom 모델과 계층 모델 예측을 비교해 최종 카테고리 결정

#     Args:
#         text (str): 분류할 텍스트

#     Returns:
#         str: 최종 카테고리 경로
#     """
#     # 1. Bottom 모델 예측
#     bottom_labels, bottom_probs = bottom_model.predict([text], k=1)
#     bottom_dict = {
#         label.replace('__label__', ''): round(prob, 3)
#         for label, prob in zip(bottom_labels[0], bottom_probs[0])
#     }
#     bottom_path = list(bottom_dict.keys())[0]
#     bottom_path = bottom_path.replace('.', '>')

#     # 2. Layer1 처리 (고정)
#     layer_labels, layer_probs = layer1_model.predict([text], k=3)
#     current_dict = {
#         label.replace('__label__', ''): round(prob, 3)
#         for label, prob in zip(layer_labels[0], layer_probs[0])
#     }

#     # 3. 최초 비교
#     if list(current_dict.keys())[0] == bottom_path:
#       return bottom_path

#     # 4. 불일치 시 추가 계층 탐색
#     layer_num = 1  # Layer1부터 시작
#     max_layer = 6  # 최대 레이어 수

#     while layer_num < max_layer:
#         layer_num += 1  # 다음 계층 이동 (Layer2 → Layer3 → ...)

#         try:
#             # 동적 모델 호출
#             model = f'layer{layer_num}_model'
#             labels, probs = model.predict([text], k=len(model.get_labels()))
#             next_dict = {
#                 label.replace('__label__', ''): round(prob, 3)
#                 for label, prob in zip(labels[0], probs[0])
#             }


#             # 상위 계층과 일관성 필터링
#             next_dict = get_hierarchical_top_categories(current_dict, next_dict)


#             if list(next_dict.keys())[0] == bottom_path:
#               return bottom_path

#             # 빈 딕셔너리 처리
#             if not next_dict:
#                 return max(current_dict.items(), key=lambda x: x[1])[0]

#             # 현재 계층 업데이트
#             current_dict = next_dict


#         except (KeyError, IndexError):  # 모델이 없거나 예측 실패
#             return max(current_dict.items(), key=lambda x: x[1])[0]

    # 5. 최대 레이어까지 도달했을 때의 최종 결과
    # return max(current_dict.items(), key=lambda x: x[1])[0]

# layer1_lable, layer1_probs = layer1_model.predict([text], k=3)
# layer1_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer1_lable[0], layer1_probs[0])}
# layer1_dict
# layer2_lable, layer2_probs = layer2_model.predict([text], k=len(layer2_model.get_labels()))
# layer2_dict = {label.replace('__label__', ''): round(prob, 3) for label, prob in zip(layer2_lable[0], layer2_probs[0])}
# layer2_dict=get_hierarchical_top_categories(layer1_dict,layer2_dict)
# layer2_dict

layer1_model 로드 완료
layer2_model 로드 완료
layer3_model 로드 완료
layer4_model 로드 완료
layer5_model 로드 완료
layer6_model 로드 완료


In [None]:
df_train.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,shape,pattern,bottom_category_id,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text
0,1500855633,Driftwood Solar Lamp Handcrafted Off Grid Sola...,Free shipping in Canada and the USA. This one...,"Driftwood Lamp,driftwood solar,nautical,cottag...",physical,patio & outdoor,,,wood,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,1,black
1,717452434,Coconut bistro patio light hand carved in Bali...,Hand carved Bali coconut bistro light covers p...,"Hand carved coconut,coconut shell,coconut cand...",physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,0,beige
2,868201745,"Garden outdoor lamp for meditation, white pin...",Ready to ship! \n\nOnly one piece!\n\n\nA Lot...,"Meditation lamp,Lotus for meditation,Outdoor l...",physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,11,pink
3,718866859,"Sunflower solar mason jar light, solar outdoor...",Sunflower Jenni Jar\nThis solar rechargeable b...,"solar lights,mason jar lights,Farmhouse lights...",physical,,,,,housewarming,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,18,yellow,5,clear
4,1052996924,NWT Rae Dunn Queen Pool Lounger,"Gorgeous , measurements 49&quot; x 30&quot; , ...",,physical,,,,,,...,,,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,1,black


In [None]:
import re

# Text preprocessing function
def preprocess_text(text):
   if pd.isna(text):
       return ""
   # Remove HTML tags
   text = re.sub(r'&[a-zA-Z0-9]+;', ' ', text)
   # Remove URLs with www
   text = re.sub(r'www\.[^\s]+', ' ', text)
   # Replace special characters with spaces
   text = re.sub(r'[^\w\s]', ' ', text)
   # Replace multiple spaces with a single space
   text = re.sub(r'\s+', ' ', text)
   return text.lower().strip()

# Feature combination function with all features
def combine_all_features(row):
    features = []

    columns = ['title', 'description', 'tags','type', 'room', 'craft_type',
               'recipient', 'material', 'occasion', 'holiday',
               'art_subject', 'style', 'shape', 'pattern']

    for col in columns:
        if not pd.isna(row[col]) and row[col] != '':
            features.append(str(row[col]))

    return ' '.join(features)

In [None]:
input_train_df = df_train.copy()
input_train_df['all_features'] = input_train_df.apply(combine_all_features, axis=1)
input_train_df['processed_text_all'] = input_train_df['all_features'].apply(preprocess_text)

In [None]:
text = input_train_df['processed_text_all'].iloc[130000]

In [None]:
run_layer_model(text)

'clothing>girls_clothing>baby_girls_clothing>socks_and_leg_warmers'

In [None]:
bottom_model.predict([text], k=1)

([['__label__clothing.girls_clothing.baby_girls_clothing.socks_and_leg_warmers']],
 [array([0.9080636], dtype=float32)])

In [None]:
input_train_df.iloc[130000][-9]

  input_train_df.iloc[130000][-9]


'clothing.girls_clothing.baby_girls_clothing.socks_and_leg_warmers'

In [None]:
bottom_model.test(f"{basic_path}/bottom.val")
test_samples = open(f"{basic_path}/bottom.val").readlines()

# 存储错误样本
wrong_samples = []

for line in test_samples:
    true_label = line.split()[0]  # FastText标签格式如 "__label__class1"
    text = " ".join(line.split()[1:])
    pred_label = bottom_model.predict([text])[0][0]  # 获取预测标签

    if pred_label != true_label:
        wrong_samples.append({
            "text": text,
            "true_label": true_label,
            "pred_label": pred_label
        })

# 转换为DataFrame分析
import pandas as pd
df_errors = pd.DataFrame(wrong_samples)

In [None]:
text = df_errors.iloc[10010][0]

  text = df_errors.iloc[10010][0]


In [None]:
run_layer_model(text)
#craft_supplies_and_tools>pens_pencils_and_marking_tools>pencils>graphite_pencils

'craft_supplies_and_tools>beads_gems_and_cabochons>charms_and_pendants>pendants'

In [None]:
bottom_model.predict([text], k=1)
#([['__label__craft_supplies_and_tools.pens_pencils_and_marking_tools.nibs_and_nib_holders']], [array([0.4306312], dtype=float32)])

([['__label__craft_supplies_and_tools.findings.connectors']],
 [array([0.57717425], dtype=float32)])

In [None]:
df_errors.iloc[10010][1]

  df_errors.iloc[10010][1]


'__label__craft_supplies_and_tools.beads_gems_and_cabochons.charms_and_pendants.pendants'

In [None]:
input_train_df.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,bottom_category_id,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,all_features,processed_text_all
0,1500855633,Driftwood Solar Lamp Handcrafted Off Grid Sola...,Free shipping in Canada and the USA. This one...,"Driftwood Lamp,driftwood solar,nautical,cottag...",physical,patio & outdoor,,,wood,,...,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,1,black,Driftwood Solar Lamp Handcrafted Off Grid Sola...,driftwood solar lamp handcrafted off grid sola...
1,717452434,Coconut bistro patio light hand carved in Bali...,Hand carved Bali coconut bistro light covers p...,"Hand carved coconut,coconut shell,coconut cand...",physical,,,,,,...,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,0,beige,Coconut bistro patio light hand carved in Bali...,coconut bistro patio light hand carved in bali...
2,868201745,"Garden outdoor lamp for meditation, white pin...",Ready to ship! \n\nOnly one piece!\n\n\nA Lot...,"Meditation lamp,Lotus for meditation,Outdoor l...",physical,,,,,,...,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,11,pink,"Garden outdoor lamp for meditation, white pin...",garden outdoor lamp for meditation white pink ...
3,718866859,"Sunflower solar mason jar light, solar outdoor...",Sunflower Jenni Jar\nThis solar rechargeable b...,"solar lights,mason jar lights,Farmhouse lights...",physical,,,,,housewarming,...,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,18,yellow,5,clear,"Sunflower solar mason jar light, solar outdoor...",sunflower solar mason jar light solar outdoor ...
4,1052996924,NWT Rae Dunn Queen Pool Lounger,"Gorgeous , measurements 49&quot; x 30&quot; , ...",,physical,,,,,,...,1117,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,1,black,"NWT Rae Dunn Queen Pool Lounger Gorgeous , mea...",nwt rae dunn queen pool lounger gorgeous measu...


In [None]:
predicted_df = input_train_df.copy()

In [None]:
import multiprocessing
print(f"사용 가능한 CPU 코어 수: {multiprocessing.cpu_count()}")

사용 가능한 CPU 코어 수: 8


In [None]:
from tqdm import tqdm

# pandas 데이터프레임에서 numpy 배열로 변환
texts = predicted_df['processed_text_all'].values  # numpy 배열로 변환

# numpy 배열에 함수 적용 (tqdm으로 진행 상황 모니터링)
results = np.array([run_layer_model(text) for text in tqdm(texts, desc="처리 중", total=len(texts))])

# 결과를 다시 pandas 데이터프레임에 할당
predicted_df['pred_bottom_text'] = results

처리 중: 100%|██████████| 229624/229624 [1:10:25<00:00, 54.34it/s]


In [None]:
predicted_df.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,all_features,processed_text_all,pred_bottom_text
0,1500855633,Driftwood Solar Lamp Handcrafted Off Grid Sola...,Free shipping in Canada and the USA. This one...,"Driftwood Lamp,driftwood solar,nautical,cottag...",physical,patio & outdoor,,,wood,,...,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,1,black,Driftwood Solar Lamp Handcrafted Off Grid Sola...,driftwood solar lamp handcrafted off grid sola...,home_and_living>lighting>light_fixtures>garden...
1,717452434,Coconut bistro patio light hand carved in Bali...,Hand carved Bali coconut bistro light covers p...,"Hand carved coconut,coconut shell,coconut cand...",physical,,,,,,...,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,4,brown,0,beige,Coconut bistro patio light hand carved in Bali...,coconut bistro patio light hand carved in bali...,home_and_living>lighting>light_fixtures>garden...
2,868201745,"Garden outdoor lamp for meditation, white pin...",Ready to ship! \n\nOnly one piece!\n\n\nA Lot...,"Meditation lamp,Lotus for meditation,Outdoor l...",physical,,,,,,...,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,11,pink,"Garden outdoor lamp for meditation, white pin...",garden outdoor lamp for meditation white pink ...,home_and_living>lighting>light_fixtures>garden...
3,718866859,"Sunflower solar mason jar light, solar outdoor...",Sunflower Jenni Jar\nThis solar rechargeable b...,"solar lights,mason jar lights,Farmhouse lights...",physical,,,,,housewarming,...,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,18,yellow,5,clear,"Sunflower solar mason jar light, solar outdoor...",sunflower solar mason jar light solar outdoor ...,home_and_living>lighting>light_fixtures>garden...
4,1052996924,NWT Rae Dunn Queen Pool Lounger,"Gorgeous , measurements 49&quot; x 30&quot; , ...",,physical,,,,,,...,home_and_living.lighting.light_fixtures.garden...,8,home_and_living,17,white,1,black,"NWT Rae Dunn Queen Pool Lounger Gorgeous , mea...",nwt rae dunn queen pool lounger gorgeous measu...,toys_and_games>games_and_puzzles>game_room>poo...


In [None]:
# 将 pred_bottom_text 列中的所有 ">" 替换为 "."
predicted_df['pred_bottom_text'] = predicted_df['pred_bottom_text'].str.replace('>', '.')

In [None]:
predicted_df.iloc[2022]

Unnamed: 0,1031
product_id,966454353
title,Indian kantha quilt twin kantha bedcover bedsp...
description,Beautiful bedspread in a stunning Cotton Kanth...
tags,"handblock quilt,cotton quilt,indian blanket,ka..."
type,physical
room,
craft_type,
recipient,kids
material,cotton
occasion,anniversary


In [None]:
# 1:1 매핑 관계를 이용해 pred_bottom_id 생성하기
# 먼저 bottom_category_text와 bottom_category_id 사이의 매핑 딕셔너리 생성
mapping_dict = dict(zip(predicted_df['bottom_category_text'], predicted_df['bottom_category_id'].astype(str)))

# 이 매핑 딕셔너리를 이용해 pred_bottom_text에 해당하는 pred_bottom_id 생성
predicted_df['pred_bottom_id'] = predicted_df['pred_bottom_text'].map(mapping_dict)

In [None]:
predicted_df.iloc[7000]

Unnamed: 0,22
product_id,250916116
title,"Map Camera Strap. World map: Australia, North ..."
description,--- Product description ---\n\nThis adorable c...
tags,"camera strap,world map,SLR camera strap,Phtogr..."
type,physical
room,
craft_type,
recipient,
material,
occasion,


In [None]:
predicted_df.to_csv(f'{basic_path}/train_pred.csv')

In [None]:
y_true = predicted_df['bottom_category_id']  # Actual category IDs
y_pred = predicted_df['pred_bottom_id']

In [None]:
type(y_true[0]), type(y_pred[0])

(pandas.core.series.Series, pandas.core.series.Series)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assuming both bottom_category_id and pred_bottom_id are strings
y_true = predicted_df['bottom_category_id'].astype(str)  # Actual category IDs
y_pred = predicted_df['pred_bottom_id'].astype(str)      # Predicted category IDs

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Calculate precision, recall, and F1 score
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.8679


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.8888
Recall: 0.8679
F1 Score: 0.8726


In [None]:
# 필요한 라이브러리 임포트
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# 실제 값과 예측 값을 문자열로 변환하여 준비
y_true = predicted_df['bottom_category_id'].astype(str)
y_pred = predicted_df['pred_bottom_id'].astype(str)

# 주요 평가 지표 계산
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# 결과를 데이터프레임으로 만들기
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [f'{accuracy:.4f}', f'{precision:.4f}', f'{recall:.4f}', f'{f1:.4f}']
})

# 표를 이미지로 저장하기 위한 함수
def save_df_as_image(df, filename, figsize=(6, 3)):
    plt.figure(figsize=figsize)
    plt.axis('off')

    # 표 만들기
    table = plt.table(
        cellText=df.values,
        colLabels=df.columns,
        loc='center',
        cellLoc='center',
        colColours=['#f2f2f2']*len(df.columns),
        cellColours=[['#f9f9f9']*len(df.columns)]*len(df)
    )

    # 표 스타일 조정
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.2, 1.5)

    # 제목 추가
    plt.title('Bottom Category Prediction Performance Metrics', fontsize=14, pad=20)

    # 이미지 저장
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close()
    print(f"이미지가 {filename}에 저장되었습니다.")

# 지표를 이미지로 저장
save_df_as_image(metrics_df, f'{basic_path}/bottom_overall_metrics.png')



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


이미지가 /content/drive/My Drive/DCU/Machine Learning/level//bottom_overall_metrics.png에 저장되었습니다.


In [None]:
df_test.shape

(25514, 15)

In [None]:
df_test.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern
0,1397234990,Antler Skull Deer Skull Hand Painted Beads Dec...,Hand painted deer antlers.\nThe skull is natur...,,physical,,,,,,,,,,
1,1167322940,Hemifusus Conchilidium - Collectible Shell Spe...,Measures 2.625 x 1.625 x 1.25 inches.\n\nSeash...,"small conch,hemifusus,orange,orange shell,spir...",physical,,,,,,,,,,
2,1346575470,Excavation set with real bones / hammer chisel...,Discovery fun for young and old!\nThere are RE...,,physical,,,,,,,,,,
3,1607587430,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,"For sale is this glass filled with moss, pine ...",,physical,,,,,,,,,,
4,1633456300,Coyote Tooth and Freshwater Pearl Ornament | S...,Coyote Tooth and Freshwater Pearl Ornament 🌸\n...,"ostara,oddities,pagan art,goblincore,forest wi...",physical,,,,,birthday,,,,,


In [None]:
input_test_df = df_test.copy()
input_test_df['all_features'] = input_test_df.apply(combine_all_features, axis=1)
input_test_df['processed_text_all'] = input_test_df['all_features'].apply(preprocess_text)

In [None]:
input_test_df.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern,all_features,processed_text_all
0,1397234990,Antler Skull Deer Skull Hand Painted Beads Dec...,Hand painted deer antlers.\nThe skull is natur...,,physical,,,,,,,,,,,Antler Skull Deer Skull Hand Painted Beads Dec...,antler skull deer skull hand painted beads dec...
1,1167322940,Hemifusus Conchilidium - Collectible Shell Spe...,Measures 2.625 x 1.625 x 1.25 inches.\n\nSeash...,"small conch,hemifusus,orange,orange shell,spir...",physical,,,,,,,,,,,Hemifusus Conchilidium - Collectible Shell Spe...,hemifusus conchilidium collectible shell speci...
2,1346575470,Excavation set with real bones / hammer chisel...,Discovery fun for young and old!\nThere are RE...,,physical,,,,,,,,,,,Excavation set with real bones / hammer chisel...,excavation set with real bones hammer chisel b...
3,1607587430,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,"For sale is this glass filled with moss, pine ...",,physical,,,,,,,,,,,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,rabbit 39 s paw rabbit foot paw claw glass dec...
4,1633456300,Coyote Tooth and Freshwater Pearl Ornament | S...,Coyote Tooth and Freshwater Pearl Ornament 🌸\n...,"ostara,oddities,pagan art,goblincore,forest wi...",physical,,,,,birthday,,,,,,Coyote Tooth and Freshwater Pearl Ornament | S...,coyote tooth and freshwater pearl ornament spr...


In [None]:
predicted_test_df = input_test_df.copy()

In [None]:
# pandas 데이터프레임에서 numpy 배열로 변환
texts = predicted_test_df['processed_text_all'].values  # numpy 배열로 변환

# numpy 배열에 함수 적용 (tqdm으로 진행 상황 모니터링)
results = np.array([run_layer_model(text) for text in tqdm(texts, desc="처리 중", total=len(texts))])

# 결과를 다시 pandas 데이터프레임에 할당
predicted_test_df['pred_bottom_text'] = results

처리 중: 100%|██████████| 25514/25514 [08:09<00:00, 52.08it/s]


In [None]:
predicted_test_df['pred_bottom_text'] = predicted_test_df['pred_bottom_text'].str.replace('>', '.')

In [None]:
predicted_test_df.iloc[100]

Unnamed: 0,48
product_id,1104542150
title,"Funny Circus gift tags, circus collage sheets,..."
description,"Perfect gift tags for a circus, carnival, or ..."
tags,"collage sheets,gift tags,tags,tag collage shee..."
type,download
room,
craft_type,
recipient,
material,
occasion,birthday


In [None]:
predicted_test_df['pred_bottom_id'] = predicted_test_df['pred_bottom_text'].map(mapping_dict)

In [None]:
predicted_test_df[predicted_test_df['pred_bottom_id'] == '2169'].iloc[0][-2]

  predicted_test_df[predicted_test_df['pred_bottom_id'] == '2169'].iloc[0][-2]


'accessories.hats_and_caps.earmuffs_and_ear_warmers.earmuffs'

In [None]:
# 역방향 매핑 딕셔너리 만들기 (bottom_category_id -> bottom_category_text)
reverse_mapping = {v: k for k, v in mapping_dict.items()}

# '2169' 키에 해당하는 값 찾기
if '2169' in reverse_mapping:
    print(f"ID '2169'에 대응되는 값: {reverse_mapping['2169']}")
else:
    print("ID '2169'는 매핑 딕셔너리에 존재하지 않습니다.")

ID '2169'에 대응되는 값: accessories.hats_and_caps.earmuffs_and_ear_warmers.earmuffs


In [None]:
final_features = ['product_id', 'pred_bottom_id']

In [None]:
df_predictions_bottom = predicted_test_df[final_features]

In [None]:
df_predictions_bottom.head()

Unnamed: 0,product_id,pred_bottom_id
0,1397234990,1958
1,1167322940,1958
2,1346575470,6087
3,1607587430,1958
4,1633456300,1958


In [None]:
df_top_pred = pd.read_csv('/content/drive/My Drive/DCU/Machine Learning/final/final_top_label_result.test')

In [None]:
df_top_pred.shape

(25514, 19)

In [None]:
df_top_pred.head()

Unnamed: 0.1,Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern,all_features,processed_text_all,top_category_id
0,0,1397234990,Antler Skull Deer Skull Hand Painted Beads Dec...,Hand painted deer antlers.\nThe skull is natur...,,physical,,,,,,,,,,,Antler Skull Deer Skull Hand Painted Beads Dec...,antler skull deer skull hand painted beads dec...,8
1,1,1167322940,Hemifusus Conchilidium - Collectible Shell Spe...,Measures 2.625 x 1.625 x 1.25 inches.\n\nSeash...,"small conch,hemifusus,orange,orange shell,spir...",physical,,,,,,,,,,,Hemifusus Conchilidium - Collectible Shell Spe...,hemifusus conchilidium collectible shell speci...,8
2,2,1346575470,Excavation set with real bones / hammer chisel...,Discovery fun for young and old!\nThere are RE...,,physical,,,,,,,,,,,Excavation set with real bones / hammer chisel...,excavation set with real bones hammer chisel b...,13
3,3,1607587430,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,"For sale is this glass filled with moss, pine ...",,physical,,,,,,,,,,,Rabbit&#39;s Paw Rabbit Foot Paw Claw Glass De...,rabbit 39 s paw rabbit foot paw claw glass dec...,8
4,4,1633456300,Coyote Tooth and Freshwater Pearl Ornament | S...,Coyote Tooth and Freshwater Pearl Ornament 🌸\n...,"ostara,oddities,pagan art,goblincore,forest wi...",physical,,,,,birthday,,,,,,Coyote Tooth and Freshwater Pearl Ornament | S...,coyote tooth and freshwater pearl ornament spr...,8


In [None]:
final_top_features = ['product_id', 'top_category_id']

In [None]:
df_predictions_top = df_top_pred[final_top_features]

In [None]:
df_predictions_top

Unnamed: 0,product_id,top_category_id
0,1397234990,8
1,1167322940,8
2,1346575470,13
3,1607587430,8
4,1633456300,8
...,...,...
25509,933574050,0
25510,934170700,0
25511,1351371360,5
25512,886303530,0


In [None]:
# product_id를 기준으로 두 데이터프레임 합치기
merged_df = pd.merge(
    df_predictions_top,
    df_predictions_bottom,
    on='product_id',
    how='inner'  # 양쪽 데이터프레임에 모두 존재하는 product_id만 유지
)

# pred_bottom_id 열 이름을 bottom_category_id로 변경
merged_df = merged_df.rename(columns={'pred_bottom_id': 'bottom_category_id'})

# 결과 확인
print(merged_df.head())

   product_id  top_category_id bottom_category_id
0  1397234990                8               1958
1  1167322940                8               1958
2  1346575470               13               6087
3  1607587430                8               1958
4  1633456300                8               1958


In [None]:
df_predictions = merged_df.copy()

In [None]:
df_predictions.shape

(25514, 3)

In [None]:
student_id = "11383"
df_predictions[
    ["product_id", "top_category_id", "bottom_category_id"]
].to_parquet(f"{basic_path}/predictions_{student_id}.parquet")

In [None]:
loaded_df = pd.read_parquet(f"{basic_path}/predictions_{student_id}.parquet")

In [None]:
loaded_df

Unnamed: 0,product_id,top_category_id,bottom_category_id
0,1397234990,8,1958
1,1167322940,8,1958
2,1346575470,13,6087
3,1607587430,8,1958
4,1633456300,8,1958
...,...,...,...
25509,933574050,0,1764
25510,934170700,0,17
25511,1351371360,5,2169
25512,886303530,0,16
