In [2]:
import kagglehub

# Fetch the latest version of the dataset via kagglehub; the call returns
# the local directory that holds the downloaded files.
dataset_handle = "satrapankti/amazon-beauty-product-recommendation"
path = kagglehub.dataset_download(dataset_handle)

# Show where the dataset files ended up
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Abdul Haseeb\.cache\kagglehub\datasets\satrapankti\amazon-beauty-product-recommendation\versions\1


In [7]:
# Read the ratings CSV into a DataFrame.
from pathlib import Path

import pandas as pd

# Prefer the copy inside the kagglehub download directory (`path`, set by the
# download cell) so the notebook does not depend on one machine's folder
# layout; fall back to the original local copy when that file is not there.
# NOTE(review): assumes the downloaded dataset ships the CSV under this file
# name at the top level of `path` - confirm against the download directory.
csv_name = 'Amazon_Beauty_Recommendation.csv'
local_csv = Path(r'C:\Users\Abdul Haseeb\Desktop\Recommender system\Recommender-System\Amazon_Beauty_Recommendation.csv')
downloaded_csv = Path(path) / csv_name
csv_path = downloaded_csv if downloaded_csv.is_file() else local_csv

data = pd.read_csv(csv_path)
print(data.head())

           UserId   ProductId       ProductType  Rating   Timestamp  \
0  A3NHUQ33CFH3VM  B00LLPT4HI  Eyeliner & Kajal       5  1405814400   
1  A1TIRNQ7O4REOH  B00LLPT4HI  Eyeliner & Kajal       4  1405987200   
2  A2Y36BR4YSY9F7  B00LLPT4HI  Eyeliner & Kajal       5  1405728000   
3  A23H6FAOLEMAKC  B00LLPT4HI  Eyeliner & Kajal       5  1405814400   
4  A3CHYZGF3OO6WD  B00LLPT4HI  Eyeliner & Kajal       5  1405641600   

                                                 URL  
0  https://www.amazon.in/Maybelline-Colossal-Kaja...  
1  https://www.amazon.in/Maybelline-Colossal-Kaja...  
2  https://www.amazon.in/Maybelline-Colossal-Kaja...  
3  https://www.amazon.in/Maybelline-Colossal-Kaja...  
4  https://www.amazon.in/Maybelline-Colossal-Kaja...  


In [8]:
#data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Display basic info about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
print("Dataset Info:")
data.info()

# Handle missing values (if any)
data = data.dropna()

# Remove duplicates (if any)
data = data.drop_duplicates()

# Encode UserId and ProductId into numeric format.
# NOTE: `data` is overwritten with the encoded ids, so re-running this cell
# without re-loading the CSV re-fits the encoders on the already-encoded values.
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

data['UserId'] = user_encoder.fit_transform(data['UserId'])
data['ProductId'] = product_encoder.fit_transform(data['ProductId'])

# Feature and Target ('Rating' stays in `features` because the split cell
# reads the target column from it)
features = data[['UserId', 'ProductId', 'Rating', 'Timestamp']]
target = data['Rating']

# Min-max normalize Timestamp into [0, 1] on a copy so `data` is not modified.
features = features.copy()
ts_min = features['Timestamp'].min()
ts_range = features['Timestamp'].max() - ts_min
if ts_range > 0:
    features['Timestamp'] = (features['Timestamp'] - ts_min) / ts_range
else:
    # All timestamps identical (or no rows): dividing by a zero range would
    # fill the column with NaN, so use a constant 0.0 instead.
    features['Timestamp'] = 0.0

prepared_data_path = 'prepared_data.csv'
features.to_csv(prepared_data_path, index=False)
print(f"Prepared data saved to {prepared_data_path}")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348246 entries, 0 to 1348245
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   UserId       1348246 non-null  object
 1   ProductId    1348246 non-null  object
 2   ProductType  1348246 non-null  object
 3   Rating       1348246 non-null  int64 
 4   Timestamp    1348246 non-null  int64 
 5   URL          1348246 non-null  object
dtypes: int64(2), object(4)
memory usage: 61.7+ MB
None
Prepared data saved to prepared_data.csv


In [9]:
# Separate the model inputs from the rating column that is to be predicted
y = features['Rating']
X = features[['UserId', 'ProductId', 'Timestamp']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each piece
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Target Shape: {y_test.shape}")

# Persist every split to its own CSV file (written in the listed order)
for _csv_name, _split in {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}.items():
    _split.to_csv(_csv_name, index=False)

print("Training and testing splits saved as CSV files.")

Training Features Shape: (1078596, 3)
Testing Features Shape: (269650, 3)
Training Target Shape: (1078596,)
Testing Target Shape: (269650,)
Training and testing splits saved as CSV files.


In [10]:
# Implementing the model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout

# Size of each embedding table.
# The ids were label-encoded over the whole dataset before the train/test
# split, so they range from 0 to len(classes_) - 1. Counting only the distinct
# ids present in X_train gives a smaller number, which leaves some training
# and validation ids outside the table (an Embedding's input_dim must be the
# largest index + 1). Take the sizes from the fitted encoders instead.
num_users = len(user_encoder.classes_)
num_products = len(product_encoder.classes_)

# Embedding size - hyperparameter
embedding_size = 50

# Input layers: one integer id per sample for the user and for the product
user_input = Input(shape=(1,), name='User_Input')
product_input = Input(shape=(1,), name='Product_Input')

# Embedding layers: look up a dense vector of length `embedding_size` per id
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='User_Embedding')(user_input)
product_embedding = Embedding(input_dim=num_products, output_dim=embedding_size, name='Product_Embedding')(product_input)

# Flatten the embeddings
user_vec = Flatten(name='Flatten_User')(user_embedding)
product_vec = Flatten(name='Flatten_Product')(product_embedding)

# Concatenate user and product vectors
concat = Concatenate(name='Concatenate')([user_vec, product_vec])

# Add dense layers (fully connected layers)
dense_1 = Dense(128, activation='relu', name='Dense_Layer_1')(concat)
dropout_1 = Dropout(0.3, name='Dropout_1')(dense_1)  # Dropout to prevent overfitting
dense_2 = Dense(64, activation='relu', name='Dense_Layer_2')(dropout_1)

# Output layer (rating prediction): a single unbounded value per sample
output = Dense(1, activation='linear', name='Output')(dense_2)

# Define the model
model = Model(inputs=[user_input, product_input], outputs=output)

# Compile the model: optimize mean squared error, also report mean absolute error
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Model summary
model.summary()


In [16]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define input data for training (the model takes user ids and product ids
# as two separate inputs; Timestamp is not fed to the model)
train_user_input = X_train['UserId'].values
train_product_input = X_train['ProductId'].values
train_ratings = y_train.values

# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping and checkpoint selection are tuned on the same data that
# would be used for final evaluation.
val_user_input = X_test['UserId'].values
val_product_input = X_test['ProductId'].values
val_ratings = y_test.values

# Single source of truth for the checkpoint location so the completion
# message below cannot drift from the file that is actually written.
best_model_path = 'best_model.keras'

# Define early stopping and model checkpoint (both track validation loss)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(
    best_model_path, save_best_only=True, monitor='val_loss', mode='min'
)

# Train the model
history = model.fit(
    [train_user_input, train_product_input],
    train_ratings,
    validation_data=([val_user_input, val_product_input], val_ratings),
    epochs=10,  # Adjust based on need
    batch_size=256,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1
)

# Save training history for visualization
import pickle
with open('training_history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

print(f"Model training completed and the best model saved as '{best_model_path}'.")


Epoch 1/10
[1m4214/4214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1276s[0m 303ms/step - loss: 1.8993 - mean_absolute_error: 1.0778 - val_loss: 1.5337 - val_mean_absolute_error: 0.9647
Epoch 2/10
[1m4214/4214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1133s[0m 269ms/step - loss: 1.0426 - mean_absolute_error: 0.7358 - val_loss: 1.7584 - val_mean_absolute_error: 0.9974
Epoch 3/10
[1m4214/4214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1130s[0m 268ms/step - loss: 0.7408 - mean_absolute_error: 0.5624 - val_loss: 1.9321 - val_mean_absolute_error: 1.0595
Epoch 4/10
[1m4214/4214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1133s[0m 269ms/step - loss: 0.5409 - mean_absolute_error: 0.4515 - val_loss: 1.9819 - val_mean_absolute_error: 1.0481
Model training completed and the best model saved as 'best_model.h5'.
