In [1]:
# Standard library imports.
import re
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports.
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

2024-03-27 20:28:17.551389: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
import pandas as pd

class DataPreprocessing:
    """
    This class handles preprocessing tasks for the dataset.
    """

    def __init__(self):
        """
        Initializes the DataPreprocessing object.
        """
        self.items_df = None
        self.item_pictures_df = None
        self.dataset_path = "../data/raw"

    def load_dataframes(self, items_path, item_pictures_path):
        """
        Load the datasets into the class.

        Parameters:
        items_path (str): File path of the items dataset.
        item_pictures_path (str): File path of the item pictures dataset.
        """
        self.items_df = pd.read_csv(f"{self.dataset_path}/{items_path}.csv")
        self.item_pictures_df = pd.read_csv(f"{self.dataset_path}/{item_pictures_path}.csv")

    def remove_columns(self, dataframe, columns_to_remove):
        """
        Remove specified columns from the dataframe.

        Parameters:
        dataframe (pandas.DataFrame): The dataframe from which columns need to be removed.
        columns_to_remove (list): List of column names to be removed.
        """
        dataframe.drop(columns=columns_to_remove, inplace=True)

    def handle_missing_values(self, dataframe):
        """
        Replace missing values in the dataframe with an empty string.

        Parameters:
        dataframe (pandas.DataFrame): The dataframe in which missing values need to be handled.
        """
        dataframe.fillna('', inplace=True)

In [9]:
preprocessor = DataPreprocessing()

# Load dataframes
preprocessor.load_dataframes("items_MCO", "item_pictures_MCO")

# Remove specified columns
columns_to_remove = ['thumbnail_id', 'catalog_product_id',
       'permalink', 'site_id', 'category_id', 'currency_id',
       'order_backend', 'price', 'original_price', 'sale_price',
       'available_quantity', 'official_store_id', 'use_thumbnail_id',
       'accepts_mercadopago', 'stop_time', 'winner_item_id', 'catalog_listing',
       'discounts', 'promotions', 'inventory_id', 'store_pick_up',
       'free_shipping', 'logistic_type', 'mode', 'tags', 'benefits',
       'promise', 'quantity', 'amount', 'rate', 'seller_id','initial_quantity',
       'warranty', 'differential_pricing', 'variation_filters', 'variations_data',
       'official_store_name', 'location', 'seller_contact']
preprocessor.remove_columns(preprocessor.items_df, columns_to_remove)

columns_to_remove = ['site_id', 'category_id', 'secure_url', 'size',
       'max_size', 'quality']
preprocessor.remove_columns(preprocessor.item_pictures_df, columns_to_remove)

# Handle missing values
preprocessor.handle_missing_values(preprocessor.items_df)
preprocessor.handle_missing_values(preprocessor.item_pictures_df)


In [10]:
dataset_path = "../data/processed"
def convert_and_save_dataframe(arr, df_name):
    dataframe = pd.DataFrame(arr)
    dataframe.to_csv(f"{dataset_path}/{df_name}.csv", index=False)

In [11]:
convert_and_save_dataframe(preprocessor.items_df, "items")
convert_and_save_dataframe(preprocessor.item_pictures_df, "item_pictures")

In [14]:
class DataPreprocessor:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.scaler = StandardScaler()

    def preprocess_text(self, text):
        """
        Preprocesses text data.

        Args:
        text (str): Input text to preprocess.

        Returns:
        str: Preprocessed text.
        """
        # Convert text to lowercase
        text = text.lower()
        # Remove special characters, numbers, and punctuation
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text

    def preprocess_features(self, items_df, pictures_df):
        """
        Preprocesses features including text and images.

        Args:
        items_df (DataFrame): DataFrame containing item data.
        pictures_df (DataFrame): DataFrame containing picture data.

        Returns:
        DataFrame: Preprocessed feature DataFrame.
        """
        # Merge items and pictures dataframes on item_id
        merged_df = pd.merge(items_df, pictures_df.groupby('item_id').first(), on='item_id', how='left')

        # Preprocess text features
        merged_df['title'] = merged_df['title'].apply(self.preprocess_text)
        merged_df['official_store_name'] = merged_df['official_store_name'].apply(self.preprocess_text)

        # Normalize numerical features
        numerical_features = ['price', 'warranty']
        merged_df[numerical_features] = self.scaler.fit_transform(merged_df[numerical_features])

        # Select relevant features
        selected_features = ['title', 'condition_new', 'condition_not_specified', 'condition_used',
                             'listing_type_id_bronze', 'listing_type_id_free', 'listing_type_id_gold',
                             'listing_type_id_gold_premium', 'listing_type_id_gold_pro', 'listing_type_id_gold_special',
                             'listing_type_id_silver', 'buying_mode_buy_it_now', 'buying_mode_classified',
                             'thumbnail', 'price', 'seller_id', 'warranty', 'official_store_name', 'url']

        preprocessed_df = merged_df[selected_features]
        
        return preprocessed_df

In [15]:
dataset_path = "../data/processed"
items_file = f"{dataset_path}/items.csv"
pictures_file = f"{dataset_path}/item_pictures.csv"
items_df = pd.read_csv(items_file)
pictures_df = pd.read_csv(pictures_file)

# Instantiate DataPreprocessor
preprocessor = DataPreprocessor()

# Preprocess features
preprocessed_data = preprocessor.preprocess_features(items_df, pictures_df)

KeyError: 'item_id'

In [7]:
class DataPreprocessor:
     """
     Class for preprocessing data for machine learning models.
     """

     def __init__(self, items_file, pictures_file):
          """
          Initialize the DataPreprocessor object.

          Parameters:
          - items_file (str): Path to the CSV file containing item data.
          - pictures_file (str): Path to the CSV file containing picture data.
          """
          self.items_df = pd.read_csv(items_file)
          self.pictures_df = pd.read_csv(pictures_file)
          self.lemmatizer = WordNetLemmatizer()
          self.vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=100)
          self.image_model = VGG16(weights='imagenet', include_top=False)
          self.pca = PCA(n_components=100)  # Reduce image features to 100 dimensions

     def preprocess_text(self, text):
          """
          Preprocess text data by tokenization and lemmatization.

          Parameters:
          - text (str): Input text to be preprocessed.

          Returns:
          - str: Preprocessed text.
          """
          tokens = word_tokenize(text)
          lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
          return ' '.join(lemmatized_tokens)

     def preprocess_image_features(self, item_id):
          """
          Preprocess image features for a given item ID.

          Parameters:
          - item_id (str): ID of the item to preprocess images for.

          Returns:
          - numpy.ndarray: Array of preprocessed image features.
          """
          thumbnail_url = self.items_df[self.items_df['id'] == item_id]['thumbnail'].values[0]
          image_urls = self.pictures_df[self.pictures_df['item_id'] == item_id]['url'].tolist()
          image_urls = [str(thumbnail_url)] + [str(url) for url in image_urls[:1]]  # Convert URLs to string
          image_features = []
          for url in image_urls:
               if os.path.exists(url):
                    img = image.load_img(url, target_size=(224, 224))
                    img_array = image.img_to_array(img)
                    img_array = np.expand_dims(img_array, axis=0)
                    img_array = preprocess_input(img_array)
                    features = self.image_model.predict(img_array)
                    flattened_features = features.flatten()
                    image_features.append(flattened_features)
          if image_features:
               image_features = np.array(image_features)
               image_features = self.pca.fit_transform(image_features)
          else:
               image_features = np.zeros((1, 100))
          return image_features


     def preprocess_features(self):
          """
          Preprocess features for machine learning models.

          Returns:
          - pandas.DataFrame: Preprocessed features.
          """
          # Preprocess text features
          self.items_df['title'] = self.items_df['title'].apply(self.preprocess_text)
          text_features = self.vectorizer.fit_transform(self.items_df['title']).toarray()
          text_feature_names = self.vectorizer.get_feature_names_out()
          text_df = pd.DataFrame(text_features, columns=text_feature_names)

          # Preprocess image features
          image_features = []
          for item_id in self.items_df['id']:
               item_features = self.preprocess_image_features(item_id)
               image_features.append(item_features)
          image_features = np.vstack(image_features)
          image_feature_names = [f"image_feature_{i}" for i in range(image_features.shape[1])]
          image_df = pd.DataFrame(image_features, columns=image_feature_names)

          # Combine text and image features
          all_features = pd.concat([text_df, image_df], axis=1)

          # Select relevant features
          selected_features = ['price', 'warranty']

          all_features = pd.concat([all_features, self.items_df[selected_features]], axis=1)

          return all_features

In [8]:
# Instantiate DataPreprocessor
dataset_path = "../data/processed"
items_file = f"{dataset_path}/items.csv"
pictures_file = f"{dataset_path}/item_pictures.csv"
preprocessor = DataPreprocessor(items_file, pictures_file)

# Preprocess features
features = preprocessor.preprocess_features()

In [9]:
cols = list(features.columns)
len(cols)

202

In [10]:
dataset_path = "../data/processed"
def convert_and_save_dataframe(arr, df_name):
    dataframe = pd.DataFrame(arr)
    dataframe.to_csv(f"{dataset_path}/{df_name}.csv", index=False)

In [11]:
convert_and_save_dataframe(features, "features")

In [12]:
class FeatureExtractor:
    """
    Class for feature extraction from product titles and images.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.image_model = VGG16(weights='imagenet', include_top=False)
        self.pca = PCA(n_components=100)  # Reduce image features to 100 dimensions

    def extract_text_features(self, titles):
        """
        Extract features from product titles using NLP techniques.

        Parameters:
        - titles (list): List of product titles.

        Returns:
        - numpy.ndarray: Array of text features.
        """
        text_features = []
        for title in titles:
            tokens = word_tokenize(title)
            # Lemmatize tokens
            lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
            # Convert tokens to lowercase
            normalized_tokens = [token.lower() for token in lemmatized_tokens]
            text_features.append(normalized_tokens)
        return np.array(text_features)

    def extract_image_features(self, image_paths):
        """
        Extract features from product images using pre-trained CNNs.

        Parameters:
        - image_paths (list): List of paths to product images.

        Returns:
        - numpy.ndarray: Array of image features.
        """
        image_features = []
        for path in image_paths:
            if os.path.exists(path):  # Check if image file exists
                try:
                    img = image.load_img(path, target_size=(224, 224))
                    img_array = image.img_to_array(img)
                    img_array = np.expand_dims(img_array, axis=0)
                    img_array = preprocess_input(img_array)
                    features = self.image_model.predict(img_array)
                    flattened_features = features.flatten()
                    image_features.append(flattened_features)
                except Exception as e:
                    print(f"Error processing image {path}: {e}")
            else:
                print(f"Image file not found: {path}")
        if image_features:
            image_features = np.array(image_features)
            image_features = self.pca.fit_transform(image_features)  # Reduce dimensionality
        else:
            image_features = np.zeros((len(image_paths), 100))  # Return zeros if no valid images
        return image_features

In [13]:
# Extract text features
text_features = []
for _, row in features.iterrows():
    title = row['title']
    # condition_features = row[['condition_new', 'condition_not_specified', 'condition_used']].tolist()
    # listing_type_features = row[[col for col in features.columns if 'listing_type_id' in col]].tolist()
    # buying_mode_features = row[[col for col in features.columns if 'buying_mode' in col]].tolist()
    # seller_id = row['seller_id']
    warranty = row['warranty']
    # official_store_name = row['official_store_name']
    
    # Combine all text features
    # combined_text = f"{title} {seller_id} {warranty} {official_store_name}"
    combined_text = f"{title} {warranty}"
    # combined_text += " ".join(map(str, condition_features))
    # combined_text += " ".join(map(str, listing_type_features))
    # combined_text += " ".join(map(str, buying_mode_features))
    text_features.append(combined_text)

# Extract image features
image_paths = features['images'].explode().tolist()
extractor = FeatureExtractor()
image_features = extractor.extract_image_features(image_paths)

KeyError: 'title'

In [None]:
print("text_features:", text_features[0])
print("image_features:", image_features[0])

NameError: name 'text_features' is not defined