In [19]:
# Standard library imports.
import re
import requests
from io import BytesIO

# Third party imports.
import numpy as np
import pandas as pd
from PIL import Image
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tqdm import tqdm

In [29]:

class FeatureExtraction:
    """
    This class handles feature extraction tasks for the dataset.
    """

    def __init__(self):
        """
        Initializes the FeatureExtraction object.
        """
        self.items_df = None
        self.item_pictures_df = None
        self.dataset_path = "../data/processed"

        self.nlp = spacy.load('en_core_web_sm')

    def load_dataframes(self, items_path, item_pictures_path):
        """
        Load the datasets into the class.

        Parameters:
        items_path (str): File path of the items dataset.
        item_pictures_path (str): File path of the item pictures dataset.
        """
        self.items_df = pd.read_csv(f"{self.dataset_path}/{items_path}.csv")
        self.item_pictures_df = pd.read_csv(f"{self.dataset_path}/{item_pictures_path}.csv")

    def concatenate_text_variables(self):
        """
        Concatenate selected text variables into a single column.
        """
        text_columns = ['title', 'condition', 'listing_type_id', 'buying_mode', 'domain_id']
        self.items_df['text_features'] = self.items_df[text_columns].fillna('').apply(lambda x: ' '.join(x), axis=1)

    def make_text_machine_friendly(self, text):
        """
        Apply text preprocessing techniques to make text machine-friendly.

        Parameters:
        text (str): Input text.

        Returns:
        str: Preprocessed text.
        """
        # Convert words into lowercase
        text = text.lower()

        # Remove leading and trailing whitespaces
        text = text.strip()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]
        text = ' '.join(filtered_text)

        # Expand contractions (if needed)

        # Remove special characters (numbers, emojis, etc.)
        text = re.sub(r'\d+', '', text)  # Remove numbers

        return text

    def apply_tokenization_and_lemmatization(self, text):
        """
        Apply tokenization and lemmatization to the text.

        Parameters:
        text (str): Input text.

        Returns:
        list: List of lemmatized tokens.
        """
        doc = self.nlp(text)
        lemmatized_tokens = [token.lemma_ for token in doc]
        return lemmatized_tokens

    def get_text_features(self):
        """
        Obtain the feature vector for text.

        Returns:
        numpy.ndarray: Feature vector for text.
        """
        text_features = self.items_df['text_features'].apply(self.make_text_machine_friendly)
        text_features = text_features.apply(self.apply_tokenization_and_lemmatization)
        text_features = text_features.apply(lambda x: ' '.join(x))
        return text_features

    def get_image_features(self, model_name='VGG16'):
        """
        Obtain the feature vector for images using pre-trained CNN models.

        Parameters:
        model_name (str): Name of the pre-trained CNN model.

        Returns:
        numpy.ndarray: Feature vector for images.
        """
        if model_name == 'VGG16':
            model = VGG16(weights='imagenet', include_top=False, pooling='avg')
            feature_list = []

            for _, row in self.items_df.iterrows():
                thumbnail_url = row['thumbnail']

                try:
                    response_thumbnail = requests.get(thumbnail_url)
                    img_thumbnail = Image.open(BytesIO(response_thumbnail.content))
                    img_thumbnail = img_thumbnail.resize((224, 224))
                    img_array_thumbnail = image.img_to_array(img_thumbnail)
                    img_array_thumbnail = np.expand_dims(img_array_thumbnail, axis=0)
                    img_array_thumbnail = preprocess_input(img_array_thumbnail)

                    features_thumbnail = model.predict(img_array_thumbnail)
                    feature_list.append(features_thumbnail.flatten())
                except Exception as e:
                    print(f"Error processing thumbnail image {thumbnail_url}: {e}")

            image_features = np.array(feature_list)
            return image_features
        else:
            raise ValueError("Invalid model name. Please choose from 'VGG16'.")


    def save_features(self, text_features, image_features, text_output_path, image_output_path):
        """
        Save the extracted features to files.

        Parameters:
        text_features (numpy.ndarray): Feature vector for text.
        image_features (numpy.ndarray): Feature vector for images.
        text_output_path (str): File path to save text features.
        image_output_path (str): File path to save image features.
        """
        # np.save(f"{self.dataset_path}/{text_output_path}", text_features)
        np.save(f"{self.dataset_path}/{image_output_path}", image_features)


In [30]:
fe = FeatureExtraction()

# Load dataframes
fe.load_dataframes("items", "item_pictures")

# Concatenate text variables
fe.concatenate_text_variables()

# Obtain text features
# text_features = fe.get_text_features()

# Obtain image features
image_features = fe.get_image_features()

# Save features
fe.save_features("text_features", image_features, "text_features.npy", "image_features.npy")



In [10]:
print("text_features:", type(text_features))
print("image_features:", type(image_features))

text_features: <class 'pandas.core.series.Series'>
image_features: <class 'numpy.ndarray'>


In [11]:
image_features.shape

(0,)