In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input, Concatenate

In [2]:
# Generate realistic dummy data
data = {
    'name': [
        'Apple iPhone 13',
        'Samsung Galaxy S21',
        'Sony WH-1000XM4 Headphones',
        'Dell XPS 13 Laptop',
        'Amazon Echo Dot'
    ],
    'description': [
        'The latest model of the Apple iPhone with A15 Bionic chip and improved camera.',
        'Samsung\'s flagship phone with a stunning display and powerful performance.',
        'Noise-cancelling over-ear headphones with exceptional sound quality.',
        'Lightweight and powerful laptop with an Intel Core i7 processor.',
        'Smart speaker with Alexa voice assistant and improved sound quality.'
    ],
    'price': [999.99, 799.99, 349.99, 1299.99, 49.99],
    'category': ['Electronics', 'Electronics', 'Audio', 'Computers', 'Smart Home']
}

In [4]:
df = pd.DataFrame(data)

In [5]:
print(df)

                         name  \
0             Apple iPhone 13   
1          Samsung Galaxy S21   
2  Sony WH-1000XM4 Headphones   
3          Dell XPS 13 Laptop   
4             Amazon Echo Dot   

                                         description    price     category  
0  The latest model of the Apple iPhone with A15 ...   999.99  Electronics  
1  Samsung's flagship phone with a stunning displ...   799.99  Electronics  
2  Noise-cancelling over-ear headphones with exce...   349.99        Audio  
3  Lightweight and powerful laptop with an Intel ...  1299.99    Computers  
4  Smart speaker with Alexa voice assistant and i...    49.99   Smart Home  


In [6]:
# Encode the categorical labels
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

In [7]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['description'])
sequences = tokenizer.texts_to_sequences(df['description'])
padded_sequences = pad_sequences(sequences, maxlen=10)

In [8]:
# Print tokenized sequences and shapes for debugging
print("Padded Sequences:\n", padded_sequences)
print("Padded Sequences Shape:", padded_sequences.shape)


Padded Sequences:
 [[ 3 11 12  1 13 14 15  2  4 16]
 [17 18 19  1 20 21 22  2  5 23]
 [ 0 24 25 26 27 28  1 29  6  7]
 [30  2  5 31  1 32 33 34 35 36]
 [37 38  1 39 40 41  2  4  6  7]]
Padded Sequences Shape: (5, 10)


In [9]:
# Prepare the final dataset
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size including the padding index
print("Vocabulary Size:", vocab_size)
X_text = padded_sequences
X_numeric = df[['price']].values.reshape(-1, 1)
y = df['category_encoded']

Vocabulary Size: 42


In [10]:
# Print X and y shapes for debugging
print("X_text Shape:", X_text.shape)
print("X_numeric Shape:", X_numeric.shape)
print("y Shape:", y.shape)

X_text Shape: (5, 10)
X_numeric Shape: (5, 1)
y Shape: (5,)


In [11]:
# Split the data into training and testing sets
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42
)


In [12]:
# Define the model
text_input = Input(shape=(10,), name='text_input')
numeric_input = Input(shape=(1,), name='numeric_input')


In [13]:
# Text processing branch
embedding = Embedding(input_dim=vocab_size, output_dim=10, input_length=10)(text_input)
flattened_text = Flatten()(embedding)

In [14]:
# Concatenate text and numeric branches
concatenated = Concatenate()([flattened_text, numeric_input])


In [15]:
# Dense layers
dense1 = Dense(10, activation='relu')(concatenated)
output = Dense(len(df['category'].unique()), activation='softmax')(dense1)

In [16]:
# Build the model
model = Model(inputs=[text_input, numeric_input], outputs=output)

In [17]:
# Print model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_input (InputLayer)     [(None, 10)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 10, 10)               420       ['text_input[0][0]']          
                                                                                                  
 flatten (Flatten)           (None, 100)                  0         ['embedding[0][0]']           
                                                                                                  
 numeric_input (InputLayer)  [(None, 1)]                  0         []                            
                                                                                              

In [18]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [19]:
# Train the model
history = model.fit(
    {'text_input': X_text_train, 'numeric_input': X_numeric_train},
    y_train,
    epochs=10,
    validation_data=({'text_input': X_text_test, 'numeric_input': X_numeric_test}, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Print training history
for epoch in range(10):
    print(f"Epoch {epoch+1}/{10}")
    print(f"Loss: {history.history['loss'][epoch]:.4f}")
    print(f"Accuracy: {history.history['accuracy'][epoch]:.4f}")
    print(f"Validation Loss: {history.history['val_loss'][epoch]:.4f}")
    print(f"Validation Accuracy: {history.history['val_accuracy'][epoch]:.4f}")

Epoch 1/10
Loss: 149.6736
Accuracy: 0.2500
Validation Loss: 9.4535
Validation Accuracy: 0.0000
Epoch 2/10
Loss: 147.1979
Accuracy: 0.2500
Validation Loss: 6.6807
Validation Accuracy: 0.0000
Epoch 3/10
Loss: 144.7406
Accuracy: 0.2500
Validation Loss: 3.9400
Validation Accuracy: 0.0000
Epoch 4/10
Loss: 142.3179
Accuracy: 0.2500
Validation Loss: 1.4502
Validation Accuracy: 0.0000
Epoch 5/10
Loss: 140.0381
Accuracy: 0.2500
Validation Loss: 0.2148
Validation Accuracy: 1.0000
Epoch 6/10
Loss: 138.8043
Accuracy: 0.2500
Validation Loss: 0.0411
Validation Accuracy: 1.0000
Epoch 7/10
Loss: 138.1672
Accuracy: 0.2500
Validation Loss: 0.0152
Validation Accuracy: 1.0000
Epoch 8/10
Loss: 137.3505
Accuracy: 0.2500
Validation Loss: 0.0097
Validation Accuracy: 1.0000
Epoch 9/10
Loss: 136.3650
Accuracy: 0.2500
Validation Loss: 0.0095
Validation Accuracy: 1.0000
Epoch 10/10
Loss: 135.2519
Accuracy: 0.2500
Validation Loss: 0.0131
Validation Accuracy: 1.0000


In [21]:
# Evaluate the model
loss, accuracy = model.evaluate({'text_input': X_text_test, 'numeric_input': X_numeric_test}, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.2f}")

Test Loss: 0.0131
Test Accuracy: 1.00
