### 1. Construcción del dataset que usaremos

En este notebook vamos a combinar los distintos daraframes de review de amazon originales en uno nuevo que contenga diversas categorias

In [1]:
# Importamos lo que vayamos a necesitar
import pandas as pd
import cudf
import numpy as np

In [17]:
# Esta funcion mezclara todo el df que se le pase para tener los datos mas repartidos
def shuffle_df(data, limit = None):
    index_list = np.array(data.index)
    np.random.shuffle(np.reshape(index_list, (-1, 1)))    
    data = data.loc[index_list[:limit], :] if limit else data.loc[index_list[:], :]
    data.reset_index()
    return data

In [10]:
# Con esta function extraeremos el df:
#     - file_name: Nombre del archivo
#     - main_category: Categoria principal para la clasificacion
#     - limit: numero maximo de filas que tendra nuesto df
def extract_df(file_name, main_category, limit = 10000):
    data = pd.read_json(file_name, lines=True)
    data = data [['reviewText', 'overall' , 'helpful']]  
    data.rename(columns={"reviewText": "review", "overall": "rating"}, inplace=True)
    data['category'] = main_category

    # Procesamos el atributo helpful para que sea un valor numerico normalizado
    aux = np.zeros(len(data))
    for i, it in enumerate(data['helpful']):
        try:
            aux[i] = (0 if it[1] == 0 else it[0] / it[1])
        except:
            print(it)
            aux[i] = 0
    data['helpful'] = aux
    
    return shuffle_df(data, limit)

In [11]:
#  Definimos una lista con los archivos que queremos importar
files_to_read = [
    { 'file_name': './datasets/reviews_Amazon_Instant_Video_5.json.gz', 'main_category': 'Amazon instant videos'},
    { 'file_name': './datasets/reviews_Musical_Instruments_5.json.gz', 'main_category': 'Musical instruments'},
    { 'file_name': './datasets/reviews_Patio_Lawn_and_Garden_5.json.gz', 'main_category': 'Patio lawn/garden'},
    { 'file_name': './datasets/reviews_Automotive_5.json.gz', 'main_category': 'Automotive'},
    { 'file_name': './datasets/reviews_Digital_Music_5.json.gz', 'main_category': 'Digital music'},
    { 'file_name': './datasets/reviews_Office_Products_5.json.gz', 'main_category': 'Office products'},
    { 'file_name': './datasets/reviews_Pet_Supplies_5.json.gz', 'main_category': 'Pet supplies'},
    { 'file_name': './datasets/reviews_Baby_5.json.gz', 'main_category': 'Baby'}
]

In [12]:
# Vamos a leemos todos los arvchivos y los concatenamos
frames = [ extract_df(**f) for f in files_to_read ]
result = pd.concat(frames)

In [15]:
# Mezclamos el df
result = shuffle_df(result)

In [16]:
# Comprobamos que se ha mezclado bien, al vomprobar que las categorias estan mezcladas.
result

Unnamed: 0,review,rating,helpful,category
49813,A fun way to bling up your desk and make sure ...,4,1.0,Office products
12226,I continue to love this show. Raylan and the r...,5,0.0,Amazon instant videos
12226,Arrived in super flash time. Like another rev...,5,0.0,Patio lawn/garden
100386,This treat ball works as expected. I used Temp...,4,0.5,Pet supplies
89572,I know it's extrange but it works! It is easy ...,4,0.0,Baby
...,...,...,...,...
3861,I have been using these for about 2 years now;...,5,0.0,Musical instruments
11965,This unit does everything it says it does. I e...,5,0.0,Automotive
11965,"For years, I had been using Smead hanging fold...",4,0.0,Office products
11824,Pros:- You don't need to touch the dead rat- L...,3,0.0,Patio lawn/garden


In [20]:
# Exportamos el modelo a un csv para su posterior utilizacion
result.to_csv('./datasets/reviews.csv', index=False)