In [1]:
# Standard library imports.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports.
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

2024-03-25 18:21:03.409663: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class ProductFeaturesExtractor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf_vectorizer_title = TfidfVectorizer()
        self.tfidf_vectorizer_description = TfidfVectorizer()
        self.scaler = StandardScaler()
        self.image_model = VGG16(weights='imagenet', include_top=False)

    def preprocess_text(self, text):
        text = text.lower()
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in self.stop_words]
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in filtered_words]
        return ' '.join(lemmatized_words)

    def extract_text_features(self, titles, descriptions):
        titles_processed = [self.preprocess_text(title) for title in titles]
        descriptions_processed = [self.preprocess_text(description) for description in descriptions]
        title_features = self.tfidf_vectorizer_title.fit_transform(titles_processed)
        description_features = self.tfidf_vectorizer_description.fit_transform(descriptions_processed)
        return title_features, description_features

    def extract_image_features(self, img_paths):
        image_features = []
        for img_path in img_paths:
            img = image.load_img(img_path, target_size=(224, 224))
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = preprocess_input(img_array)
            features = self.image_model.predict(img_array)
            image_features.append(features.flatten())
        return np.array(image_features)

    def scale_features(self, features):
        return self.scaler.fit_transform(features)

In [3]:
# Feature extractor instance creation
feature_extractor = ProductFeaturesExtractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Text Preprocessing
data['title'] = data['title'].apply(feature_extractor.preprocess_text)
data['description'] = data['description'].apply(feature_extractor.preprocess_text)

# Text Feature Extraction
title_features, description_features = feature_extractor.extract_text_features(data['title'], data['description'])

# Image Feature Extraction
image_features = feature_extractor.extract_image_features(data['thumbnail'])

# Price and Availability Feature Scaling
price_availability_features = feature_extractor.scale_features(data[['price', 'availability']].values)