def sentiment_analysis( año : int ): Según el año de lanzamiento, se devuelve una lista con la cantidad de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento.

In [220]:
import pandas as pd
import numpy as np

In [222]:
user_revs=pd.read_csv("game_reviews2.csv", usecols=['user_id','item_id', 'Sentiment_Result'])

In [221]:
# Revisar tipo de datos y cantidad de nulos
print(user_revs.info())
print(user_revs.isna().sum())
# Eliminar nulos 
user_revs.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 59332
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           59305 non-null  object 
 1   item_id           59305 non-null  float64
 2   Sentiment_Result  59305 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.8+ MB
None
user_id             0
item_id             0
Sentiment_Result    0
dtype: int64


In [223]:
# Leer dataframe de videojuegos con las columnas necesarias
columns = ['id', 'release_date']
games = pd.read_csv('steam_games.csv', usecols=columns).rename(columns={'id':'item_id'})
games.sample(15)

Unnamed: 0,release_date,item_id
191,2008-10-17,1230.0
26360,2015-06-10,373171.0
15990,2017-08-29,628150.0
5613,2015-02-21,454791.0
7188,2016-05-05,499370.0
17977,2017-04-30,588710.0
29075,2014-05-09,244410.0
7600,2016-10-11,457330.0
1395,2013-06-04,229621.0
28938,2014-05-29,234787.0


In [224]:
# Revisar tipos de datos y cantidad de nulos 
print(games.info())
print(games.isna().sum())
games.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   release_date  30068 non-null  object 
 1   item_id       32133 non-null  float64
dtypes: float64(1), object(1)
memory usage: 502.2+ KB
None
release_date    2067
item_id            2
dtype: int64


In [225]:
# Hacer merge (left join) de los dos df
df_sa = user_revs.merge(games, on='item_id', how='left')
# Imputar nulos en la columna fecha con la moda
df_sa.fillna(df_sa['release_date'].mode()[0], inplace=True)
df_sa.sample(10)

Unnamed: 0,user_id,item_id,Sentiment_Result,release_date
57021,Twerking49,242050.0,1,2013-11-19
38448,76561198002610796,226700.0,1,2007-10-10
25460,76561198065661382,200260.0,1,2012-09-07
47960,DerpyAssassin,98200.0,1,2011-05-26
19246,Tarvo69,730.0,1,2012-08-21
11604,76561198076357381,238960.0,1,2007-10-10
30625,76561198024728550,9900.0,1,2010-02-02
44007,ZesK0,4000.0,1,2006-11-29
24909,jtrkyr,261640.0,1,2014-10-14
3174,ducksswamp,4000.0,1,2006-11-29


In [226]:
# Convertir la columna release_date a dtype datetime
df_sa = safe_date_convert(df_sa, 'release_date')
df_sa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   user_id           59333 non-null  object        
 1   item_id           59333 non-null  object        
 2   Sentiment_Result  59333 non-null  int64         
 3   release_date      59333 non-null  object        
 4   date_fixed        59333 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 2.3+ MB


In [228]:
print(df_sa['date_fixed'].dtype)
# Verificar si hay valores nulos o faltantes
print(df_sa['date_fixed'].isnull().sum())
# Convertir la columna "date_fixed" al tipo de dato "datetime" si es necesario
df_sa['date_fixed'] = pd.to_datetime(df_sa['date_fixed'], format='%Y-%m-%d', errors='coerce')

datetime64[ns]
0


In [229]:
# Obtener el año de lanzamiento
df_sa['year_released'] = df_sa['date_fixed'].dt.year
# Quedarse solo con las columnas que se necesitan
df_sa = df_sa[['Sentiment_Result','year_released']]


In [230]:
df_sa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Sentiment_Result  59333 non-null  int64
 1   year_released     59333 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 695.4 KB


In [231]:
# Calcular los porcentajes de reviews positivos, negativos y neutros
sentiment_analysis = pd.DataFrame()
sentiment_analysis['Negative'] = df_sa[df_sa['Sentiment_Result']==0].groupby('year_released').agg('count')
sentiment_analysis['Neutral'] = df_sa[df_sa['Sentiment_Result']==1].groupby('year_released').agg('count')
sentiment_analysis['Positive'] = df_sa[df_sa['Sentiment_Result']==2].groupby('year_released').agg('count')


In [None]:
# Guardar el df final en un csv que consumirá la API
sentiment_analysis.to_csv('sentiment_analysis.csv')

In [None]:
# Función que retorna el porcentaje de reviews negativos, positivos
# y neutros en un año especificado
def sentiment_analysis(año : str):
    df = pd.read_csv('dataquery/sentiment_analysis.csv')
    df['year_released'] = df['year_released'].astype(str)
    if df['year_released'].str.contains(año).any():
        return df[df['year_released'] == año].to_json(orient='records')
    else:
        return 'Year not found'