In [4]:
import requests
import pandas as pd
import json
from io import StringIO
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Cargar el dataframe.
url = 'https://raw.githubusercontent.com/anyoneai/e-commerce-open-data-set/master/products.json'
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    json_str = json.dumps(data)
    json_data = StringIO(json_str)
    df = pd.read_json(json_data)
    print('DataFrame creado con éxito:')
else:
    print(f'Error al hacer la solicitud: Código de estado {response.status_code}')

DataFrame creado con éxito:


In [9]:
# Filtrar el dataset por las categorias con mayor frecuencia.
all_categories = pd.Series([category for sublist in df['category'] for category in sublist])
category_counts = all_categories.value_counts()
categories_over_100 = category_counts[category_counts > 100].index.tolist()

def has_high_freq_category(category_list):
    return any(category in categories_over_100 for category in category_list)

filtered_df = df[df['category'].apply(has_high_freq_category)]

In [8]:
print(filtered_df.columns)

Index(['sku', 'name', 'type', 'price', 'upc', 'category', 'shipping',
       'description', 'manufacturer', 'model', 'url', 'image'],
      dtype='object')


In [None]:
'name', 'description', 'model', 'url', 'manufacturer'

In [13]:
def normalize_columns(filtered_df, columns):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def normalize_text(text):
        text = str(text)
        tokens = word_tokenize(text)
        tokens = [word.lower() for word in tokens if word.isalpha()]
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)

    for column in columns:
        filtered_df.loc[:, column] = filtered_df[column].apply(normalize_text)

    return filtered_df


In [15]:
normalized_df = normalize_columns(filtered_df, ['name', 'description', 'model', 'manufacturer'])
print(normalized_df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, column] = filtered_df[column].apply(normalize_text)


      sku                           name      type  price          upc  \
0   43900           duracell aaa battery  HardGood   5.49  41333424019   
1   48530  duracell aa coppertop battery  HardGood   5.49  41333415017   
2  127687            duracell aa battery  HardGood   7.49  41333825014   
3  150115       energizer max battery aa  HardGood   4.99  39800011329   
4  185230             duracell c battery  HardGood   8.99  41333440019   

                                            category shipping  \
0  [{'id': 'pcmcat312300050015', 'name': 'Connect...     5.49   
1  [{'id': 'pcmcat312300050015', 'name': 'Connect...     5.49   
2  [{'id': 'pcmcat312300050015', 'name': 'Connect...     5.49   
3  [{'id': 'pcmcat312300050015', 'name': 'Connect...     5.49   
4  [{'id': 'pcmcat312300050015', 'name': 'Connect...     5.49   

                                         description manufacturer model  \
0  compatible select electronic device aaa size d...     duracell         
1  energy dura