# Overview [25/01/2023]

*version 1*

Unified Code:
* Delete duplicates
* Delete users with fewer than `min_reviews` interactions (set to 3 in the code below)

# Unified code

In [None]:
#================================================================#
#============= Proceso de extraccion y tratamiento ==============#
#================================================================#

#====================== Import de librerias =====================#

import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import datetime
import plotly.express as px
import plotly.graph_objects as go


#============ Definicion de valores de configuracion ============#

# URLs of the datasets to download
urls = ["https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Musical_Instruments.csv"]

# Names of the downloaded files, as read back from the working directory
files = ["Musical_Instruments.csv"]

# Filtering thresholds: minimum reviews per user and minimum users per product
min_reviews = 3
min_usuarios = 3

# Logical column role -> column name used in the DataFrames
col_names = dict(
    col_id_product="asin",
    col_id_reviewer="reviewerID",
    col_unix_time="timestamp",
    col_year="year",
    col_rating="overall",
)

#============ Definicion de funciones de extraccion y tratamiento ============#

def helper(arg):

    '''
    Run a shell command and report on stdout whether it succeeded.

    arg: command line, passed verbatim to os.system (i.e. run through the shell).
    Returns None in every case; the outcome is only printed.
    '''

    status = os.system(arg)
    if status == 0:
        print(arg," completed")
    else:
        # Best-effort helper: report the failure instead of silently ignoring it,
        # so that a failed download does not go unnoticed.
        print(arg, " failed, status:", status)
    return None

def get_dataset_basic_info(df, nombre):

    '''
    Print a banner with `nombre` followed by the number of distinct reviewers,
    the number of distinct products and the number of rows (reviews) of df.

    The column names are looked up in the module-level `col_names` mapping.
    Returns None (the result of print).
    '''

    banner = "#" * len(nombre)
    reviewer_ids = df[col_names["col_id_reviewer"]].unique()
    product_ids = df[col_names["col_id_product"]].unique()

    # The pieces are printed with print's default single-space separator
    pieces = [
        "\n", banner, "\n", nombre, "\n", banner, "\n\n",
        "Numero total de clientes: ", reviewer_ids.size, "\n",
        "Numero total de productos: ", product_ids.size, "\n",
        "Numero total de reviews: ", df.shape[0], "\n",
    ]
    return print(*pieces)


def load_data_raw():

    '''
    Download every URL in the module-level `urls` list with wget and load the
    CSV files listed in `files` into a single DataFrame.

    The files are read without a header row; the columns are named, in order,
    product id, reviewer id, rating and unix time (names taken from `col_names`).
    When several files are configured their rows are concatenated.

    Returns the DataFrame with a fresh 0..n-1 index.
    Raises ValueError if `files` is empty.
    '''

    cols = [col_names["col_id_product"], col_names["col_id_reviewer"], col_names["col_rating"], col_names["col_unix_time"]]

    # Download step
    # NOTE(review): --no-check-certificate turns off TLS certificate verification.
    for url in urls:
        helper("wget " +url+ " --no-check-certificate")

    if not files:
        raise ValueError("No input files configured in 'files'")

    # Read every configured file; previously only the last one read was returned.
    frames = [pd.read_csv(archivo, delimiter=",", names=cols) for archivo in files]

    # ignore_index keeps the row labels unique across files
    return pd.concat(frames, ignore_index=True)


def _keep_groups_with_min_count(df, group_col, counted_col, minimum):
    '''
    Return the rows of df whose group_col value has at least `minimum`
    non-null counted_col entries.
    '''
    counts = df.groupby(group_col)[counted_col].count()
    survivors = counts[counts >= minimum].index
    return df[df[group_col].isin(survivors)]


def treat_dataset_src(df, min_reviews, info=False):

    '''
    Clean the raw ratings dataset and return the result as a new DataFrame.

    Steps:
      1. Convert the rating column to numeric, turn the unix timestamp into a
         'YYYY-MM-DD HH:MM:SS' string and add a year column.
      2. Keep a single review per (reviewer, product) pair: the most recent one.
      3. Repeatedly drop products reviewed by fewer than `min_usuarios` users
         (module-level setting) and users with fewer than `min_reviews` reviews,
         until a full pass removes nothing.

    df: raw DataFrame with the columns named in `col_names`; it is not modified.
    min_reviews: minimum number of reviews a user needs to be kept.
    info: when truthy, print the dataset size after each step.
    '''

    product_col = col_names["col_id_product"]
    reviewer_col = col_names["col_id_reviewer"]
    rating_col = col_names["col_rating"]
    time_col = col_names["col_unix_time"]
    year_col = col_names["col_year"]

    # Work on a copy: assigning columns on the caller's frame would change it,
    # and a second call would then fail on the already converted timestamps.
    df = df.copy()

    # Type conversion
    df[rating_col] = pd.to_numeric(df[rating_col].replace(',','', regex=True))
    # Vectorised conversion. The timestamps are interpreted as UTC so the result
    # does not depend on the timezone of the machine running the code.
    moments = pd.to_datetime(df[time_col], unit="s")
    df[time_col] = moments.dt.strftime('%Y-%m-%d %H:%M:%S')
    df[year_col] = moments.dt.year

    # Duplicates: sort newest first so duplicated() flags every older review of
    # the same (reviewer, product) pair, then drop those rows by index label.
    # Assumes the index labels of df are unique.
    newest_first = df[[reviewer_col, product_col, time_col]].sort_values(by=time_col, ascending=False, kind="mergesort")
    is_older_duplicate = newest_first.duplicated(subset=[reviewer_col, product_col])
    df = df.drop(index=is_older_duplicate[is_older_duplicate].index)

    if info:
        get_dataset_basic_info(df,"Informacion tras eliminar duplicados")

    iteracion = 1
    while True:
        rows_before = len(df)

        # Keep only products reviewed by at least min_usuarios users
        df = _keep_groups_with_min_count(df, product_col, reviewer_col, min_usuarios)

        if info:
            get_dataset_basic_info(df,f"Iteracion: {iteracion}. Informacion tras eliminar productos comprados por menos de {min_usuarios} personas")

        # Keep only users with at least min_reviews reviews
        df = _keep_groups_with_min_count(df, reviewer_col, rating_col, min_reviews)

        if info:
            get_dataset_basic_info(df,f"Iteracion: {iteracion}. Informacion tras eliminar usuarios con menos de {min_reviews} reviews")

        iteracion+=1

        # Removing users can push a product below its threshold (and vice versa),
        # so stop only once a complete pass leaves the dataset untouched.
        if len(df) == rows_before:
            break

    return df


#============ Visualización ============#

def barplot_reviews(df):

    '''
    Show a bar chart of how many users have written x reviews.

    df: ratings DataFrame with the reviewer and rating columns named in `col_names`.
    Returns whatever fig.show() returns.
    '''

    reviewer_col = col_names["col_id_reviewer"]
    rating_col = col_names["col_rating"]

    # Reviews per user, then number of users for each review count
    reviews_per_user = df.groupby(reviewer_col)[rating_col].count()
    users_per_count = reviews_per_user.value_counts().sort_index()

    fig = go.Figure()

    fig.add_trace(go.Bar(x=users_per_count.index,
                         y=users_per_count.values,
                         marker_color='rgb(55, 83, 109)'
                         ))

    # Axis titles use the title.text / title.font form; the legacy `titlefont`
    # shorthand is deprecated and rejected by newer plotly releases.
    fig.update_layout(
        title='¿Cuantos usuarios tienen x reviews?',
        yaxis=dict(
            title=dict(text='# de usuarios', font=dict(size=16)),
            tickfont=dict(size=14),
        ),
        xaxis=dict(
            title=dict(text='# de reviews', font=dict(size=16)),
            tickfont=dict(size=14),
        )
    )

    return fig.show()

def timeline_reviews(df):

    '''
    Show a bar chart with the number of reviews per year.

    df: ratings DataFrame with the year and rating columns named in `col_names`.
    Returns whatever fig.show() returns.
    '''

    year_col = col_names["col_year"]
    rating_col = col_names["col_rating"]

    # Number of (non-null) ratings per year
    reviews_per_year = df.groupby(year_col)[rating_col].count()

    fig = go.Figure()

    fig.add_trace(go.Bar(x=reviews_per_year.index,
                         y=reviews_per_year.values,
                         marker_color='rgb(55, 83, 109)'
                         ))

    # Axis titles use the title.text / title.font form; the legacy `titlefont`
    # shorthand is deprecated and rejected by newer plotly releases.
    fig.update_layout(
        title='¿Cuantas reviews tenemos por año?',
        yaxis=dict(
            # The bars count reviews, not users
            title=dict(text='# de reviews', font=dict(size=16)),
            tickfont=dict(size=14),
        ),
        xaxis=dict(
            title=dict(text='Año', font=dict(size=16)),
            tickfont=dict(size=14),
        )
    )

    return fig.show()



In [None]:
# Download the configured dataset and load it into a DataFrame
df=load_data_raw()

wget https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Musical_Instruments.csv --no-check-certificate  completed


In [None]:
# Print the number of reviewers, products and reviews of the raw data
get_dataset_basic_info(df,"Descarga de la información en bruto RAW")


 ####################################### 
 Descarga de la información en bruto RAW 
 ####################################### 

 Numero total de clientes:  903330 
 Numero total de productos:  112222 
 Numero total de reviews:  1512530 



In [None]:
# Clean the raw data; the third argument (info=True) prints the dataset size after each step
df1=treat_dataset_src(df, min_reviews, True)


 #################################### 
 Informacion tras eliminar duplicados 
 #################################### 

 Numero total de clientes:  903330 
 Numero total de productos:  112222 
 Numero total de reviews:  1470564 


 ################################################################################### 
 Iteracion: 1. Informacion tras eliminar productos comprados por menos de 3 personas 
 ################################################################################### 

 Numero total de clientes:  869854 
 Numero total de productos:  53698 
 Numero total de reviews:  1393509 


 ####################################################################### 
 Iteracion: 1. Informacion tras eliminar usuarios con menos de 3 reviews 
 ####################################################################### 

 Numero total de clientes:  95690 
 Numero total de productos:  45995 
 Numero total de reviews:  504658 


 #####################################################################

In [None]:
# Print the number of reviewers, products and reviews of the processed data
get_dataset_basic_info(df1,"Definición del dataset inicial tras proceamiento SRC")


 #################################################### 
 Definición del dataset inicial tras proceamiento SRC 
 #################################################### 

 Numero total de clientes:  88135 
 Numero total de productos:  26641 
 Numero total de reviews:  461254 



In [None]:
# Plot how many users have each number of reviews in the processed data
barplot_reviews(df1)

In [None]:
# Plot the number of reviews per year in the processed data
timeline_reviews(df1)

In [None]:
# Duplicate check: count the ratings of every (user, item) pair and keep the pairs rated more than once
df_ratings_item_user = (
    df1.groupby(['reviewerID', 'asin'])['overall']
    .agg(cuenta='count')
    .sort_values(['cuenta'], ascending=[False])
    .reset_index()
)
df_ratings_item_user = df_ratings_item_user[df_ratings_item_user['cuenta'] > 1]
print("\nvalidation:")
print('number of users with multiple ratings of the same item: ' + str(df_ratings_item_user['reviewerID'].count()))
print(df_ratings_item_user)


validation:
number of users with multiple ratings of the same item: 0
Empty DataFrame
Columns: [reviewerID, asin, cuenta]
Index: []


In [None]:
# Check: list the users that still have fewer than min_reviews reviews (expected to be empty)
aux = (
    df1.groupby([col_names["col_id_reviewer"]])[col_names["col_rating"]]
    .count()
    .reset_index()
)
aux2 = aux.loc[aux[col_names["col_rating"]] < min_reviews].reset_index()
aux2

Unnamed: 0,index,reviewerID,overall


In [None]:
# Check: list the products that still have fewer than min_usuarios users (expected to be empty)
aux = (
    df1.groupby([col_names["col_id_product"]])[col_names["col_id_reviewer"]]
    .count()
    .reset_index()
)
aux2 = aux.loc[aux[col_names["col_id_reviewer"]] < min_usuarios].reset_index()
aux2

Unnamed: 0,index,asin,reviewerID


In [None]:
# Row count of the processed data, plus duplicate rows over all columns and over (item, user, rating)
print(f"Total: {len(df1.index)}")
print(f"Duplicates1: {df1.duplicated(subset=['asin', 'reviewerID', 'overall', 'timestamp', 'year']).sum()}")
print(f"Duplicates2: {df1.duplicated(subset=['asin', 'reviewerID', 'overall']).sum()}")

Total: 461254
Duplicates1: 0
Duplicates2: 0


# Joan graficas


In [None]:
#============ Visualización ============#

def _scatter_rating_stats(df, col_names, num_reviews, id_key, y_col, entity):
    '''
    Shared implementation of the four scatter plots below.

    Computes the count ('cuenta') and mean ('mean') of the ratings per
    reviewer or product, keeps the groups with more than num_reviews ratings
    and shows a scatter plot of them.

    id_key: key of col_names holding the grouping column.
    y_col: 'cuenta' or 'mean', the statistic drawn on the y axis; the other
           statistic drives both the colour and the size of the markers.
    entity: plural label of the grouped entity ('reviewers' or 'products').
    Returns whatever fig.show() returns.
    '''
    id_col = col_names[id_key]
    if y_col == 'cuenta':
        hue_col, metric = 'mean', 'Count'
    else:
        hue_col, metric = 'cuenta', 'Mean'

    stats = (
        df.groupby([id_col])[col_names["col_rating"]]
        .agg(cuenta='count', mean='mean')
        .sort_values(['cuenta', 'mean'], ascending=[False, False])
        .reset_index()
    )
    total = stats.shape[0]
    shown = stats[stats['cuenta'] > num_reviews]

    fig = px.scatter(shown, x=id_col, y=y_col, color=hue_col, size=hue_col)

    # The title compares like with like: groups shown vs. groups available
    # (it used to compare the number of groups against the number of rating rows).
    # Axis titles use title.text / title.font; the legacy `titlefont` shorthand
    # is deprecated and rejected by newer plotly releases.
    fig.update_layout(
        title=f"{metric} of reviews for {entity} with more than {num_reviews} reviews. Showing {shown.shape[0]} of {total} {entity}",
        yaxis=dict(
            title=dict(text=f"ratings {metric.lower()}", font=dict(size=16)),
            tickfont=dict(size=14),
        ),
        xaxis=dict(
            title=dict(text=entity, font=dict(size=16)),
            tickfont=dict(size=14),
        )
    )
    return fig.show()


def scatter_user_ratings_count(df, col_names, num_reviews):
    '''Scatter plot of the number of ratings per reviewer, for reviewers with more than num_reviews ratings.'''
    return _scatter_rating_stats(df, col_names, num_reviews, "col_id_reviewer", 'cuenta', 'reviewers')


def scatter_user_ratings_mean(df, col_names, num_reviews):
    '''Scatter plot of the mean rating per reviewer, for reviewers with more than num_reviews ratings.'''
    return _scatter_rating_stats(df, col_names, num_reviews, "col_id_reviewer", 'mean', 'reviewers')


def scatter_product_ratings_mean(df, col_names, num_reviews):
    '''Scatter plot of the mean rating per product, for products with more than num_reviews ratings.'''
    return _scatter_rating_stats(df, col_names, num_reviews, "col_id_product", 'mean', 'products')


def scatter_product_ratings_count(df, col_names, num_reviews):
    '''Scatter plot of the number of ratings per product, for products with more than num_reviews ratings.'''
    return _scatter_rating_stats(df, col_names, num_reviews, "col_id_product", 'cuenta', 'products')


# Procesado matriz por filas y columnas


La secuencia sería eliminar el código que falla:

zero_positions = np.asarray(np.where(rating_mat.A==0)).T  #devuelve los indices traspuestos de cada posicion
print(rating_mat.A)
print(zero_positions)
print(dims[0])

y sustituir este código:

items2compute = []
for user in trange(dims[0]):
    aux = zero_positions[zero_positions[:, 0] == user][:, 1] #devuelve el item
    items2compute.append(aux[aux >= dims[0]])

por éste:

In [None]:

   # Generate the test dataset: for every user, collect the item columns whose value is 0
    items2compute = []
    items_zero_per_user = []
    for user in trange(dims[0]):
        # Row of this user restricted to the columns from dims[0]+1 onwards
        # NOTE(review): the loop this replaces kept columns >= dims[0]; confirm the +1 offset is intended
        aux1 = rating_mat[user, (dims[0]+1):]        
        # Positions with value 0 in the slice (presumably rating_mat is a SciPy sparse matrix and .A densifies it)
        items_zero_per_user = np.where(aux1.A==0)
        # Shift the positions back by the slice offset
        # NOTE(review): adding an int to the tuple returned by np.where presumably relies on dims[0] being a NumPy scalar; confirm
        aux = items_zero_per_user[:] + (dims[0]+1)
        # aux[1] holds the shifted column indices
        items2compute.append(aux[1])