In [1]:
# ETL australian_user_reviews.csv
import json
import ast
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import pyarrow as pa
import pyarrow.parquet as pq

nltk.download('vader_lexicon')

# Define la función para realizar el análisis de sentimiento
def analyze_sentiment(text):
	analyzer = SentimentIntensityAnalyzer()
	sentiment = analyzer.polarity_scores(text)
	if sentiment['compound'] >= 0.05:
		return 2
	elif sentiment['compound'] <= -0.05:
		return 0
	else:
		return 1

user_reviews = []

# Abre el archivo y recorrerlo para agregar las reseñas a la lista
with open('../data_sources/json/australian_user_reviews.json', encoding='utf-8') as f:
	for line in f:
		object = json.loads(json.dumps(ast.literal_eval(line)))
		user_reviews.append(object)

# Crea el dataframe a partir de la lista
df_user_reviews = pd.DataFrame(user_reviews)

# Normaliza la columna reviews
normalized = pd.json_normalize(user_reviews, record_path=['reviews'], meta=['user_id'] )

# Elimina las filas vacias
normalized = normalized.dropna()

# Eliminar duplicados
clean_items = normalized.drop_duplicates(keep='first')

clean_items['review'] = clean_items['review'].apply(analyze_sentiment)
clean_items['review'].fillna(1, inplace=True)

# Convierte el DataFrame de Pandas a una tabla Arrow
table = pa.Table.from_pandas(clean_items)

# Especifica la ruta del archivo Parquet
file = '../data_sources/parquet/users_reviews.parquet'

# Escribe la tabla Arrow en un archivo Parquet
pq.write_table(table, file, compression='gzip')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/andreeesh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_items['review'] = clean_items['review'].apply(analyze_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_items['review'].fillna(1, inplace=True)
