In [2]:
import sys
sys.path.append("../Lib")
import utils as ut
import pandas as pd
from textblob import TextBlob
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Cargando datos limpios
df_reviews = pd.read_parquet('../Datasets/user_reviews.parquet.gz')
df_games = pd.read_parquet('../Datasets/steam_games.parquet.gz')
df_items = pd.read_parquet('../Datasets/users_items.parquet.gz')


Vamos a crear una nueva columna llamada 'analisis_sentimiento', reemplazando la columna 'reviews_review'. Realizaremos un análisis de sentimientos de los comentarios en la columna 'reviews', clasificándolos de la siguiente manera: 0 si es negativo, 1 si es neutral o no hay reseña, y 2 si es positivo.

In [4]:
# Aplicar la función de análisis de sentimientos
df_reviews['sentiment_analysis'] = df_reviews['review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity if pd.notna(x) and x != 'No data' else None)

def map_sentiment(score):
    if score is None:
        return 1  # Si la reseña está ausente o 'Sin datos', toma el valor de 1
    elif score < 0:
        return 0  # Malo
    elif score == 0:
        return 1  # Neutral
    else:
        return 2  # Positivo

# Aplicar la función de mapeo a la columna 'sentiment_analysis'
df_reviews['sentiment_analysis'] = df_reviews['sentiment_analysis'].apply(map_sentiment)

### PlayTimeGenre 
Crearemos un dataframe auxiliar que nos muestra género, año, horas reproducidas por género.

In [5]:
playtime_item = df_items[['playtime_forever', 'item_id']]
playtime_item

Unnamed: 0,playtime_forever,item_id
0,6,10
1,7,30
2,4733,300
3,1853,240
4,333,3830
...,...,...
3246370,677,304930
3246371,43,227940
3246372,3,388490
3246373,4,521570


In [6]:
# Extraiga las columnas 'item_id', 'genres' y 'release_year' en el marco de datos df_games
df_genre_year=df_games[['item_id', 'genres', 'release_year']]
df_genre_year.head()

Unnamed: 0,item_id,genres,release_year
0,761140,Action,2018
1,761140,Casual,2018
2,761140,Indie,2018
3,761140,Simulation,2018
4,761140,Strategy,2018


Uniendo los marcos de datos df_genre_year y playtime_item y obtenga el marco de datos df_playtime_genre para nuestra función futura

In [7]:
df_playtime_genre=df_genre_year.merge(playtime_item, on='item_id')
df_playtime_genre

Unnamed: 0,item_id,genres,release_year,playtime_forever
0,282010,Action,1997,5
1,282010,Action,1997,13
2,282010,Action,1997,110
3,282010,Action,1997,13
4,282010,Action,1997,8
...,...,...,...,...
7795607,80,Action,2004,203
7795608,80,Action,2004,244
7795609,80,Action,2004,7
7795610,80,Action,2004,9


In [8]:
# Agrupa filas por géneros y años y suma el tiempo jugado
playtime_genre = df_playtime_genre.groupby(['genres', 'release_year'])['playtime_forever'].sum().reset_index()
playtime_genre

Unnamed: 0,genres,release_year,playtime_forever
0,Action,1983,3473
1,Action,1984,384
2,Action,1988,16001
3,Action,1989,607
4,Action,1990,18335
...,...,...,...
657,Web Publishing,2017,9382
658,Werewolves,no data,3128
659,Zombies,2015,2728
660,Zombies,no data,78741


### UserForGenre
Usuario por género Crearemos un dataframe auxiliar que nos muestra género, año, usuario, horas jugadas por usuario por año en cada género.

In [9]:
user_item = df_items[['playtime_forever', 'user_id', 'item_id']]
user_item.head()

Unnamed: 0,playtime_forever,user_id,item_id
0,6,76561197970982479,10
1,7,76561197970982479,30
2,4733,76561197970982479,300
3,1853,76561197970982479,240
4,333,76561197970982479,3830


In [10]:
user_for_genre=df_genre_year.merge(user_item, on='item_id')
user_for_genre

Unnamed: 0,item_id,genres,release_year,playtime_forever,user_id
0,282010,Action,1997,5,UTNerd24
1,282010,Action,1997,13,saint556
2,282010,Action,1997,110,chidvd
3,282010,Action,1997,13,aerpub
4,282010,Action,1997,8,lucifer666678
...,...,...,...,...,...
7795607,80,Action,2004,203,76561198208507532
7795608,80,Action,2004,244,76561198221578852
7795609,80,Action,2004,7,EnVyIsSmexy
7795610,80,Action,2004,9,76561198283312749


Suponemos que las horas están en minutos (ya que hay valores por año que superan con creces el número de horas de un año) y convertimos el tiempo a horas.

In [11]:
user_for_genre['playtime_hours'] = user_for_genre['playtime_forever'] / 60
user_for_genre['playtime_hours'] = user_for_genre['playtime_hours'].round(2)

In [12]:
user_for_genre = user_for_genre.groupby(['genres', 'user_id', 'release_year'])['playtime_hours'].sum().reset_index()
user_for_genre

Unnamed: 0,genres,user_id,release_year,playtime_hours
0,Action,--000--,2009,88.81
1,Action,--000--,2010,0.37
2,Action,--000--,2011,108.69
3,Action,--000--,2012,1822.44
4,Action,--000--,2013,6.05
...,...,...,...,...
3607790,e-sports,zgeddon,2014,2.38
3607791,e-sports,zuilde,2014,0.12
3607792,e-sports,zwanzigdrei,2014,8.53
3607793,e-sports,zzeee,2014,0.02


Para ahorrar memoria en funciones y modelos futuros, tomamos una muestra aleatoria de datos.

In [13]:
random_rows = 3000000
user_for_genre = user_for_genre.sample(n=random_rows, random_state=42)
user_for_genre

Unnamed: 0,genres,user_id,release_year,playtime_hours
2121544,Massively Multiplayer,76561198047380370,2009,60.77
2671977,RPG,thekellygang,2010,1.63
550984,Action,ongkungx2,2013,234.02
1220887,Driving,SvDKILLSWITCH,no data,1.57
182758,Action,76561198065093772,2016,192.75
...,...,...,...,...
3568776,Utilities,farmaobitcoin,2013,0.02
1593041,Free to Play,memegod,2015,24.95
2835305,Simulation,76561198028561555,2014,1.92
3054867,Simulation,pooopnugets,2016,29.53


In [14]:
user_for_genre = user_for_genre.groupby(['genres', 'user_id', 'release_year'])['playtime_hours'].sum().reset_index()
user_for_genre

Unnamed: 0,genres,user_id,release_year,playtime_hours
0,Action,--000--,2009,88.81
1,Action,--000--,2010,0.37
2,Action,--000--,2011,108.69
3,Action,--000--,2012,1822.44
4,Action,--000--,2014,63.79
...,...,...,...,...
2999995,e-sports,zgeddon,2014,2.38
2999996,e-sports,zuilde,2014,0.12
2999997,e-sports,zwanzigdrei,2014,8.53
2999998,e-sports,zzeee,2014,0.02


### UsersRecommend
Crearemos un marco de datos auxiliar que nos muestre las columnas item_name recomendadas publicadas y sentiment_analysis para obtener las funciones de recomendación.

In [15]:
it_recommend = df_games[['item_id', 'item_name']]
it_recommend

Unnamed: 0,item_id,item_name
0,761140,Lost Summoner Kitty
1,761140,Lost Summoner Kitty
2,761140,Lost Summoner Kitty
3,761140,Lost Summoner Kitty
4,761140,Lost Summoner Kitty
...,...,...
84206,681550,Maze Run VR
84207,681550,Maze Run VR
84208,681550,Maze Run VR
84209,681550,Maze Run VR


In [16]:
u_recommend = df_reviews[['item_id', 'posted', 'recommend', 'sentiment_analysis']]
u_recommend

Unnamed: 0,item_id,posted,recommend,sentiment_analysis
0,1250,2011-11-05,True,2
1,22200,2011-07-15,True,2
2,43110,2011-04-21,True,2
3,251610,2014-06-24,True,2
4,227300,2013-09-08,True,0
...,...,...,...,...
59300,70,2024-07-10,True,2
59301,362890,2024-07-08,True,2
59302,273110,2024-07-03,True,2
59303,730,2024-07-20,True,2


In [17]:
user_recommend = it_recommend.merge(u_recommend, on='item_id')
user_recommend

Unnamed: 0,item_id,item_name,posted,recommend,sentiment_analysis
0,282010,Carmageddon Max Pack,2024-01-12,True,1
1,282010,Carmageddon Max Pack,2024-01-12,True,1
2,282010,Carmageddon Max Pack,2024-01-12,True,1
3,70,Half-Life,2015-10-28,True,0
4,70,Half-Life,2011-01-27,True,0
...,...,...,...,...,...
184067,80,Counter-Strike: Condition Zero,2014-10-26,False,1
184068,80,Counter-Strike: Condition Zero,2013-12-08,True,2
184069,80,Counter-Strike: Condition Zero,2015-01-05,True,2
184070,80,Counter-Strike: Condition Zero,2014-10-31,True,1


### sentiment_analysis
Ahora vamos a crear un marco de datos auxiliar donde podremos ver el análisis de sentimiento por año.

In [18]:
release_year = df_games[['item_id', 'release_year']]
release_year = release_year[release_year['release_year'] != 'no data']
release_year

Unnamed: 0,item_id,release_year
0,761140,2018
1,761140,2018
2,761140,2018
3,761140,2018
4,761140,2018
...,...,...
84200,610660,2018
84201,610660,2018
84202,610660,2018
84203,658870,2017


In [19]:
sentiment = df_reviews[['item_id', 'sentiment_analysis']]
sentiment

Unnamed: 0,item_id,sentiment_analysis
0,1250,2
1,22200,2
2,43110,2
3,251610,2
4,227300,0
...,...,...
59300,70,2
59301,362890,2
59302,273110,2
59303,730,2


In [20]:
sentiment_year = release_year.merge(sentiment, on='item_id')
sentiment_year = sentiment_year[['release_year', 'sentiment_analysis']]
sentiment_year['release_year'] = pd.to_numeric(sentiment_year['release_year'], errors='coerce')
sentiment_year

Unnamed: 0,release_year,sentiment_analysis
0,1997,1
1,1997,1
2,1997,1
3,1998,0
4,1998,0
...,...,...
127720,2004,1
127721,2004,2
127722,2004,2
127723,2004,1


Grabando resultado de las funciones requeridas a formato "csv" y "parquet"

In [21]:
dfs = [playtime_genre, user_for_genre, user_recommend, sentiment_year, df_reviews]
# Nombres correspondientes a cada DataFrame
names = ['playtime_genre', 'user_for_genre', 'user_recommend','sentiment_year', 'reviews_clean']

ut.save_to_csv(dfs, names)

DataFrame 'playtime_genre' saved as '../Data/csv/playtime_genre.csv'


DataFrame 'user_for_genre' saved as '../Data/csv/user_for_genre.csv'
DataFrame 'user_recommend' saved as '../Data/csv/user_recommend.csv'
DataFrame 'sentiment_year' saved as '../Data/csv/sentiment_year.csv'
DataFrame 'reviews_clean' saved as '../Data/csv/reviews_clean.csv'


In [22]:
ut.save_to_pq(dfs, names)

DataFrame 'playtime_genre' save as '../Data/parquet/playtime_genre.parquet'
DataFrame 'user_for_genre' save as '../Data/parquet/user_for_genre.parquet'
DataFrame 'user_recommend' save as '../Data/parquet/user_recommend.parquet'
DataFrame 'sentiment_year' save as '../Data/parquet/sentiment_year.parquet'
DataFrame 'reviews_clean' save as '../Data/parquet/reviews_clean.parquet'
