In [43]:
import pandas as pd
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from typing import Optional
print("Importing...")


Importing...


In [51]:
import os

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK data packages requested for the text normalization code in this cell.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

file_name = "alpha2_dataset.csv"

print("Downloading dataset...")

# The CSV is read from the directory one level above the current working directory.
current_directory = os.getcwd()
file_path = os.path.normpath(os.path.join(current_directory, '..', file_name))

df = pd.read_csv(file_path).fillna(pd.NA)

print("EDA dataset...")
print(df.head())
df.info()

# Shared NLTK helpers: English stopword set and WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Map a POS tag string to a one-letter WordNet part-of-speech code.

    The tag is matched by its first letter: J -> 'a' (adjective),
    V -> 'v' (verb), N -> 'n' (noun), R -> 'r' (adverb). Any other tag,
    including an empty string, falls back to 'n'.
    """
    prefix_to_pos = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return 'n'


def remove_extra_new_lines(text):
    """Collapse a multi-line value into one line, dropping blank lines.

    Null values (as judged by ``pd.isnull``) become an empty string. Anything
    else is converted with ``str()``, split into lines, and the lines that
    contain non-whitespace characters are joined with single spaces. The kept
    lines themselves are not stripped.
    """
    if pd.isnull(text):
        return ''

    kept_lines = (line for line in str(text).splitlines() if line.strip())
    return ' '.join(kept_lines)


def remove_extra_whitespace(text: str) -> str:
    """Replace every run of whitespace characters in ``text`` with one space.

    Leading and trailing runs are reduced to a single space, not removed.
    """
    return re.sub(r'\s+', ' ', text)


def remove_special_chars(text: str, remove_digits: Optional[bool] = False) -> str:
    """Delete every character that is not an ASCII letter or whitespace.

    Digits are kept unless ``remove_digits`` is truthy. Characters are removed
    outright (not replaced by a space), so "4-Pack" becomes "4Pack".
    """
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def normalize_text(text):
    """Clean one text value and return its lemmatized tokens as a string.

    Steps, in order: collapse line breaks, collapse whitespace runs, strip
    special characters (digits kept), tokenize, keep only purely alphabetic
    tokens that are not English stopwords (lower-cased), POS-tag the remaining
    tokens, and lemmatize each one using its mapped WordNet POS. The lemmas
    are joined with single spaces.
    """
    cleaned = remove_special_chars(
        remove_extra_whitespace(remove_extra_new_lines(text)),
        remove_digits=False,
    )

    words = []
    for raw_token in word_tokenize(cleaned):
        lowered = raw_token.lower()
        # Tokens containing digits are discarded here even though the
        # character filter above kept the digits.
        if raw_token.isalpha() and lowered not in stop_words:
            words.append(lowered)

    return ' '.join(
        lemmatizer.lemmatize(word, get_wordnet_pos(pos_label))
        for word, pos_label in pos_tag(words)
    )


# Add a cleaned "<column>_normalized" companion for each free-text column.
normalization = ['name', 'description']
for column in normalization:
    df[f'{column}_normalized'] = df[column].apply(normalize_text)

# The target is 'parent_category'; every other column stays in the feature frame.
y = df['parent_category']
X = df.drop(columns=['parent_category'])

# Encode the category names as integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
print(f'X_train shape: {X_train.shape}')


[nltk_data] Downloading package punkt to /home/rcastillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rcastillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/rcastillo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rcastillo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading dataset...
EDA dataset...
                                              name      type  price  \
0                Duracell - AAA Batteries (4-Pack)  HardGood   5.49   
1  Duracell - AA 1.5V CopperTop Batteries (4-Pack)  HardGood   5.49   
2                 Duracell - AA Batteries (8-Pack)  HardGood   7.49   
3            Energizer - MAX Batteries AA (4-Pack)  HardGood   4.99   
4                  Duracell - C Batteries (4-Pack)  HardGood   8.99   

                                         description manufacturer  \
0  Compatible with select electronic devices; AAA...     Duracell   
1  Long-lasting energy; DURALOCK Power Preserve t...     Duracell   
2  Compatible with select electronic devices; AA ...     Duracell   
3  4-pack AA alkaline batteries; battery tester i...    Energizer   
4  Compatible with select electronic devices; C s...     Duracell   

                                           url              parent_category  \
0                duracell aaa batteries 4

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups, by how each group is preprocessed.
categorical_columns = ['type', 'manufacturer']
numerical_columns = ['price']
text_columns = ['name_normalized', 'description_normalized']

# Missing categorical values become an explicit 'missing' category in both splits.
for split in (X_train, X_test):
    split[categorical_columns] = split[categorical_columns].fillna('missing')

# One-hot encode the categorical features. The encoder is fitted on the
# training split only; handle_unknown='ignore' keeps transform() from failing
# on categories that appear only in the test split.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

# Report shape, container type and element dtype of both encoded matrices.
for label, matrix in (('X_train_encoded', X_train_encoded), ('X_test_encoded', X_test_encoded)):
    print(f"Shape of {label}:", matrix.shape)
    print(f"Data type of {label}:", type(matrix))
    print(f"Data type of elements in {label}:", matrix.dtype)

# Standardize the numerical features, again fitting on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Data type of elements in X_test_scaled:", X_test_scaled.dtype)

In [None]:
from joblib import dump, load

# Shapes before the save/load round trip, for comparison with the results below.
print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Shape of X_test_encoded:", X_test_encoded.shape)

# Persist the fitted transformers themselves. The file names describe an
# encoder and a scaler, and inference code needs those objects (not the
# already-transformed test matrices) to preprocess new records.
print("saving")
dump(encoder, 'onehot_encoder_model_v1.joblib')
dump(scaler, 'scaler_model_v1.joblib')

# NOTE: joblib files are pickle-based; only load files from a trusted source.
print("loading")
encoder_loaded = load('onehot_encoder_model_v1.joblib')
scaler_loaded = load('scaler_model_v1.joblib')

# Re-transform the test split with the reloaded objects to confirm that the
# round trip reproduces the same feature matrices.
X_test_encoded2 = encoder_loaded.transform(X_test[categorical_columns])
X_test_scaled2 = scaler_loaded.transform(X_test[numerical_columns])

assert X_test_encoded2.shape == X_test_encoded.shape
assert X_test_scaled2.shape == X_test_scaled.shape

print("Shape of X_test_scaled2:", X_test_scaled2.shape)
print("Data type of elements in X_test_scaled2:", X_test_scaled2.dtype)

print("Shape of X_test_encoded2:", X_test_encoded2.shape)
print("Data type of elements in X_test_encoded2:", X_test_encoded2.dtype)


In [45]:
import pandas as pd

# Example request body: an id plus a nested "payload" holding the product fields.
json_data = {
    "id": "6bd0fb8f-d850-4fb9-b8a7-5d8d3ad21be7",
    "payload": {
        "name": "name",
        "description": "descripton",
        "price": "100",
        "product_type": "Software",
        "manufacturer": "EnerPlex"
    }
}

# Convert the JSON into a one-row DataFrame. The frame gets its own name so
# it does not overwrite `df`, the dataset frame that later cells still read
# (for example df['parent_category']).
payload_df = pd.json_normalize(json_data)
# regex=False: 'payload.' is a literal prefix; interpreted as a regular
# expression the '.' would match any character.
payload_df.columns = payload_df.columns.str.replace('payload.', '', regex=False)
# Print the flattened DataFrame.
print(payload_df)


                                     id  name description price product_type  \
0  6bd0fb8f-d850-4fb9-b8a7-5d8d3ad21be7  name  descripton   100     Software   

  manufacturer  
0     EnerPlex  


In [54]:
# Imports for inference: Keras model loading plus the Hugging Face BERT
# tokenizer and TensorFlow BERT encoder.
# NOTE(review): `AutoTokenizer` and `tf` are not used in the visible cells —
# confirm whether they are still needed.
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertModel
# Reference product record, kept as a bare string literal: it is never
# assigned or parsed, it only documents where the sample below comes from.
"""
"sku": 1003136,
  "name": "1970s Rock TrackPak - Mac",
  "type": "Software",
  "price": 29.99,
  "upc": "884088157449",
  "category": [
    {
      "id": "abcat0207000",
      "name": "Musical Instruments"
    },
    {
      "id": "pcmcat152100050020",
      "name": "Recording Equipment"
    },
    {
      "id": "pcmcat152100050026",
      "name": "Sound Recording Software"
    }
  ],
  "shipping": 5.49,
  "description": "HAL LEONARD 1970s Rock TrackPak: Features 12 classic rock songs; compatible with GarageBand; includes loops for each instrument",
  "manufacturer": "Hal Leonard",
  "model": "631386",

"""
# Single sample used for inference: the name, description, price, type and
# manufacturer fields of the record above (the description is shortened).
user_input = {
    "name": "1970s Rock TrackPak - Mac",
    "description": "HAL LEONARD 1970s Rock TrackPak: Features 12 classic rock songs; compatible with GarageBand; includes loops ",
    "price": 29.99,
    "type": "Software",
    "manufacturer": "Hal Leonard",
}


# Classifier loaded from an HDF5 file in the working directory.
model_1 = load_model('model_1_preberttune.h5')

# Pretrained 'bert-base-uncased' tokenizer and encoder; both are used as
# globals by tokenize_and_get_embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the chec

In [57]:
from scipy.sparse import hstack

def extract_last_hidden_state(embeddings):
    """Return, for each item on the first axis, the slice at index -1 of the second axis.

    Expects a 3-D array; the result drops the second axis.

    NOTE(review): presumably the input is (batch, tokens, hidden), in which
    case this picks the vector of the final token position, which may be a
    padding position for batched inputs of different lengths — confirm this
    matches how the classifier's training features were built.
    """
    final_position = (slice(None), -1, slice(None))
    return embeddings[final_position]

def tokenize_and_get_embeddings(column):
    """Tokenize text with the global ``tokenizer`` and run it through the global ``model``.

    Args:
        column: a single string, or a list whose items are all strings.

    Returns:
        ``last_hidden_state`` of the model output, converted to a NumPy array.

    Raises:
        ValueError: if ``column`` is neither a string nor a list of strings.
    """
    is_single_text = isinstance(column, str)
    is_text_batch = isinstance(column, list) and all(isinstance(item, str) for item in column)
    if not (is_single_text or is_text_batch):
        raise ValueError("Invalid input format.")

    # Both accepted input kinds are tokenized with the same settings.
    encoded = tokenizer(column, padding=True, truncation=True, return_tensors='tf')
    return model(encoded).last_hidden_state.numpy()

def prepare_input(user_input, scaler, encoder, categorical_columns):
    """Turn one raw product record into the feature row fed to the classifier.

    The caller's ``user_input`` dict is only read, never modified, so the
    function can be called repeatedly with the same record and always starts
    from the raw text.

    Args:
        user_input: mapping with 'name', 'description', 'price' and one entry
            per name in ``categorical_columns``. 'price' may be a number or a
            numeric string.
        scaler: fitted transformer applied to the price.
        encoder: fitted transformer applied to the categorical values.
        categorical_columns: keys of ``user_input`` to encode, in the order
            the encoder was fitted with.

    Returns:
        Tuple ``(name_embeddings, description_embeddings, final_input_array)``.
        The first two are the full outputs of ``tokenize_and_get_embeddings``.
        ``final_input_array`` is one float row laid out as
        [encoded categoricals | scaled price | name vector | description vector].
    """
    # Normalize into locals instead of writing back into the caller's dict.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])

    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)

    # float() lets a numeric string such as "100" (typical of JSON payloads) be scaled too.
    scaled_price = scaler.transform([[float(user_input['price'])]])[0, 0]

    # Single row holding the categorical values in encoder column order.
    categorical_row = np.array([[user_input[column] for column in categorical_columns]])
    encoded_categorical_features = encoder.transform(categorical_row)
    print(encoded_categorical_features.shape)

    # Append the scaled price as one extra column after the encoded categoricals.
    combined_features = hstack([encoded_categorical_features, np.array([[scaled_price]])])
    print(combined_features.shape)
    num_cat_input_array = combined_features.toarray().astype(np.float32)
    final_input_array = np.concatenate(
        (num_cat_input_array, extracted_name_hidden, extracted_description_hidden),
        axis=1,
    )

    return name_embeddings, description_embeddings, final_input_array

In [58]:
# Build the model input for the sample record. `scaler`, `encoder` and
# `categorical_columns` come from the preprocessing cell, so that cell must
# have been run first. The two shapes printed below are emitted by prepare_input.
name_embeddings, description_embeddings, final_input_array = prepare_input(user_input, scaler, encoder, categorical_columns)


(1, 2195)
(1, 2196)




In [60]:
# For every category level, number the distinct values of that column of `df`
# (category name -> index), in the order `unique()` returns them.
# NOTE(review): `unique()` follows order of first appearance, whereas the
# LabelEncoder used for `y_encoded` numbers classes in sorted order. Confirm
# which numbering model_1 was trained with before trusting names decoded
# through this table.
categories_dict = {
    level: {category: index for index, category in enumerate(df[level].unique())}
    for level in (
        'parent_category',
        'sub_category_1',
        'sub_category_2',
        'sub_category_3',
        'sub_category_4',
    )
}

# Raw sample record used for the prediction in this cell.
user_input = {
    "name": "1970s Rock TrackPak - Mac",
    "description": "HAL LEONARD 1970s Rock TrackPak: Features 12 classic rock songs; compatible with GarageBand; includes loops ",
    "price": 29.99,
    "type": "Software",
    "manufacturer": "Hal Leonard",
}


def predict_model_1(final_input_array, model_1):
    """Run ``model_1`` on the input rows and return the top class index per row.

    Prints the raw prediction matrix and then the selected indices.

    Args:
        final_input_array: feature rows passed straight to ``model_1.predict``.
        model_1: object exposing ``predict``; here the model loaded from the .h5 file.

    Returns:
        Array with the argmax along axis 1 of the predictions.
    """
    # Original author's note (translated): "I think this is the matching
    # model // this is where I load the h5 vector".
    class_scores = model_1.predict(final_input_array)
    top_indices = np.argmax(class_scores, axis=1)
    print(class_scores)
    print(top_indices)
    return top_indices

def compare_predictions(subcategory_pred_labels, categories_dict, category_type):
    """Translate predicted class indices into category names.

    Args:
        subcategory_pred_labels: iterable of integer class indices.
        categories_dict: mapping of category level -> {category name: index}.
        category_type: which level of ``categories_dict`` to decode with.

    Returns:
        List with the category name for each index, in input order.

    Raises:
        KeyError: if ``category_type`` or one of the indices is not in the table.
    """
    name_to_index = categories_dict[category_type]
    # Invert the table so it can be looked up by index.
    index_to_name = dict(zip(name_to_index.values(), name_to_index.keys()))
    return list(map(index_to_name.__getitem__, subcategory_pred_labels))

# Despite its name, `predicted_labels` holds integer class indices (the argmax
# returned by predict_model_1); the category names end up in `predicted_labels_2`.
predicted_labels = predict_model_1(final_input_array, model_1)
# Decode the indices with the 'parent_category' table of categories_dict.
predicted_labels_2 = compare_predictions(predicted_labels, categories_dict, 'parent_category')

print(predicted_labels)
print(predicted_labels_2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[[5.9872679e-07 4.8258988e-07 2.9880678e-07 1.6442855e-06 1.5122211e-07
  1.8521788e-04 1.9810813e-07 3.2663820e-07 8.2353949e-08 3.8066194e-06
  4.4644270e-07 2.1809466e-05 1.0719603e-05 4.8484844e-07 7.7668361e-08
  1.5808581e-04 1.5995936e-02 1.4492360e-06 2.0094559e-07 4.1292154e-07
  4.3375762e-06 7.1777839e-07 2.0001869e-07 1.1779769e-06 3.9620249e-07
  1.1999542e-06 5.8523278e-07 1.7006349e-07 7.8968235e-08 1.8490706e-06
  1.3293675e-06 4.5076538e-07 6.5526649e-07 3.5007156e-07 6.3509577e-07
  5.8573228e-06 3.4087401e-07 3.8726887e-07 9.5410940e-07 1.4988204e-06
  3.2991964e-06 1.5742604e-05 1.7435389e-04 9.8315400e-01 1.7742989e-04
  8.2149569e-07 6.5203687e-07 1.0753408e-06 3.0232496e-07 7.6895084e-08
  5.1515372e-07 1.7579384e-07 1.4305623e-06 3.7233913e-05 2.7063729e-06
  1.3869836e-06 2.6501255e-07 2.0228770e-05 1.2311910e-06 2.0472783e-07
  6.6331438e-07 5.5620706e-07]]
[43]
[43]
['Analog Audio Cables'