In [1]:
# Standard library imports.
import re
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports.
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources named below. The captured cell output shows that
# nltk.download() reports packages already present as "up-to-date".
# NOTE(review): presumably these back the stopwords / word_tokenize /
# WordNetLemmatizer imports above — confirm against the cells that use them.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

2024-03-27 21:39:55.276068: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carolinajimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

class DataPreprocessing:
    """
    Load the raw item CSV files and apply basic, in-place cleaning steps.

    The two dataframes are stored on the instance after `load_dataframes`;
    `remove_columns` and `handle_missing_values` operate on whichever
    dataframe is passed to them and modify it in place.
    """

    def __init__(self, dataset_path="../data/raw"):
        """
        Initializes the DataPreprocessing object.

        Parameters:
        dataset_path (str): Directory that holds the raw CSV files. Defaults
            to "../data/raw", the location that used to be hard-coded here.
        """
        # Both frames stay None until load_dataframes() is called.
        self.items_df = None
        self.item_pictures_df = None
        self.dataset_path = dataset_path

    def load_dataframes(self, items_path, item_pictures_path):
        """
        Load the datasets into the class.

        Despite their names, both arguments are file names *without* the
        ".csv" extension; each is resolved as
        "<dataset_path>/<name>.csv".

        Parameters:
        items_path (str): Base name of the items CSV inside `dataset_path`.
        item_pictures_path (str): Base name of the item pictures CSV inside
            `dataset_path`.
        """
        self.items_df = pd.read_csv(f"{self.dataset_path}/{items_path}.csv")
        self.item_pictures_df = pd.read_csv(f"{self.dataset_path}/{item_pictures_path}.csv")

    def remove_columns(self, dataframe, columns_to_remove):
        """
        Remove specified columns from the dataframe, in place.

        Parameters:
        dataframe (pandas.DataFrame): The dataframe from which columns need to be removed.
        columns_to_remove (list): List of column names to be removed.

        Returns:
        None. The passed dataframe itself is modified.
        """
        # In-place on purpose: callers pass self.items_df / self.item_pictures_df
        # and rely on those same objects being updated.
        dataframe.drop(columns=columns_to_remove, inplace=True)

    def handle_missing_values(self, dataframe):
        """
        Replace missing values in the dataframe with an empty string, in place.

        Parameters:
        dataframe (pandas.DataFrame): The dataframe in which missing values need to be handled.

        Returns:
        None. The passed dataframe itself is modified.
        """
        dataframe.fillna('', inplace=True)

In [3]:
# Build the preprocessor and read both raw CSVs (names are given without ".csv").
preprocessor = DataPreprocessing()
preprocessor.load_dataframes(
    items_path="items_MCO",
    item_pictures_path="item_pictures_MCO",
)

# Columns dropped from the items table.
columns_to_remove = [
    'thumbnail_id', 'catalog_product_id', 'permalink',
    'site_id', 'category_id', 'currency_id',
    'order_backend', 'price', 'original_price',
    'sale_price', 'available_quantity', 'official_store_id',
    'use_thumbnail_id', 'accepts_mercadopago', 'stop_time',
    'winner_item_id', 'catalog_listing', 'discounts',
    'promotions', 'inventory_id', 'store_pick_up',
    'free_shipping', 'logistic_type', 'mode',
    'tags', 'benefits', 'promise',
    'quantity', 'amount', 'rate',
    'seller_id', 'initial_quantity', 'warranty',
    'differential_pricing', 'variation_filters', 'variations_data',
    'official_store_name', 'location', 'seller_contact',
]
preprocessor.remove_columns(
    dataframe=preprocessor.items_df,
    columns_to_remove=columns_to_remove,
)

# Columns dropped from the item pictures table (the name is rebound on purpose,
# so it ends the cell holding this second list, as before).
columns_to_remove = [
    'site_id',
    'category_id',
    'secure_url',
    'size',
    'max_size',
    'quality',
]
preprocessor.remove_columns(
    dataframe=preprocessor.item_pictures_df,
    columns_to_remove=columns_to_remove,
)

# Fill remaining missing values in both tables (in place).
preprocessor.handle_missing_values(dataframe=preprocessor.items_df)
preprocessor.handle_missing_values(dataframe=preprocessor.item_pictures_df)


In [4]:
# Default directory for processed outputs; used when no output_dir is passed.
dataset_path = "../data/processed"
def convert_and_save_dataframe(arr, df_name, output_dir=None):
    """
    Convert `arr` to a DataFrame and write it as "<dir>/<df_name>.csv".

    Parameters:
    arr: Any input accepted by `pandas.DataFrame(...)` (an existing
        DataFrame, a list of rows, etc.).
    df_name (str): Output file name without the ".csv" extension.
    output_dir (str, optional): Destination directory. When omitted, the
        module-level `dataset_path` is used, looked up at call time.

    Returns:
    None. The only effect is the CSV file written to disk (without the index).
    """
    target_dir = dataset_path if output_dir is None else output_dir
    # to_csv does not create missing folders, so make sure the target exists;
    # exist_ok keeps repeated runs of the cell from failing.
    os.makedirs(target_dir, exist_ok=True)
    dataframe = pd.DataFrame(arr)
    dataframe.to_csv(f"{target_dir}/{df_name}.csv", index=False)

In [5]:
# Persist both cleaned tables as CSV files in the processed-data directory.
convert_and_save_dataframe(arr=preprocessor.items_df, df_name="items")
convert_and_save_dataframe(arr=preprocessor.item_pictures_df, df_name="item_pictures")