# ETL Games
Importamos pandas y comprobamos que el CSV se lee correctamente antes de continuar con la transformación de los datos.

In [1]:
import pandas as pd
import ast


In [2]:
# Load the raw Steam games catalogue from the CSV in the working directory.
df_games = pd.read_csv(filepath_or_buffer='steam_games.csv')

In [3]:
# Preview the first rows to confirm the CSV was read as expected.
df_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",http://steamcommunity.com/app/761140/reviews/?...,['Single-player'],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...",http://steamcommunity.com/app/643980/reviews/?...,"['Single-player', 'Multi-player', 'Online Mult...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"['Free to Play', 'Simulation', 'Sports', 'Casu...",http://steamcommunity.com/app/670290/reviews/?...,"['Single-player', 'Multi-player', 'Online Mult...",Free to Play,False,670290,Poolians.com
3,彼岸领域,"['Action', 'Adventure', 'Casual']",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"['Action', 'Adventure', 'Casual']",http://steamcommunity.com/app/767400/reviews/?...,['Single-player'],0.99,False,767400,彼岸领域
4,Trickjump Games Ltd,"['Action', 'Adventure', 'Simulation']",Battle Royale Trainer,Battle Royale Trainer,http://store.steampowered.com/app/772540/Battl...,2018-01-04,"['Action', 'Adventure', 'Simulation', 'FPS', '...",http://steamcommunity.com/app/772540/reviews/?...,"['Single-player', 'Steam Achievements']",3.99,False,772540,Trickjump Games Ltd


In [None]:
# Eliminamos las columnas que en principio no vamos a utilizar

In [4]:
# Drop the columns this ETL does not use in later steps.
# The result is reassigned to `df_games` rather than combining `inplace=True`
# with an assignment: with `inplace=True`, `DataFrame.drop` returns None, so
# the previous `games = ...` binding was always None.
unused_columns = ['publisher', 'url', 'reviews_url', 'tags', 'specs', 'price', 'developer']
df_games = df_games.drop(columns=unused_columns)

In [5]:
# Check how the DataFrame is shaping up: columns, dtypes and non-null counts.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22530 entries, 0 to 22529
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        22530 non-null  object
 1   app_name      22530 non-null  object
 2   title         22530 non-null  object
 3   release_date  22530 non-null  object
 4   early_access  22530 non-null  bool  
 5   id            22530 non-null  int64 
dtypes: bool(1), int64(1), object(4)
memory usage: 902.2+ KB


# Trabajamos con Genre 


In [6]:
# Parse the stringified Python lists stored in `genres` into real list objects
# so they can be exploded into one row per genre.
def _parse_genres(value):
    """Parse `value` with `ast.literal_eval` if it is a string; otherwise return it unchanged.

    Non-string values (already-parsed lists, missing values such as NaN) are
    passed through untouched instead of making `ast.literal_eval` raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df_games['genres'] = df_games['genres'].apply(_parse_genres)

In [7]:
# Expand each list of genres so that every genre gets its own row;
# the other columns are repeated for each genre of the same game.
df_games = df_games.explode(column='genres')


In [8]:
# Preview the first rows to see how the genres column looks now.
df_games.head()

Unnamed: 0,genres,app_name,title,release_date,early_access,id
0,Action,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140
0,Casual,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140
0,Indie,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140
0,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140
0,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140


In [9]:
# Creamos la columna year, que vamos a necesitar y lo sacamos de la columna release_date
import numpy as np

In [10]:
# Derive `year` (as a 4-digit string) from `release_date`.
# Unparseable dates are coerced to NaT, and `.dt.strftime` maps NaT to NaN, so
# the `dropna()` step that follows can actually remove those rows.
# The previous per-row check `x != np.nan` was always True (NaN never compares
# equal to anything), which stored the literal string 'NaT' for invalid dates.
release_dates = pd.to_datetime(df_games['release_date'], errors='coerce')
df_games['year'] = release_dates.dt.strftime('%Y')

In [11]:
# Preview the first rows to check the newly added year column.
df_games.head()

Unnamed: 0,genres,app_name,title,release_date,early_access,id,year
0,Action,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140,2018
0,Casual,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140,2018
0,Indie,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140,2018
0,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140,2018
0,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,False,761140,2018


In [12]:
# `year` now carries the information we need, so remove `release_date` to
# avoid confusion; any remaining null values are dropped in the next step.
df_games = df_games.drop('release_date', axis='columns')

In [13]:
# Discard every row that still has a missing value in any column.
df_games = df_games.dropna(axis=0, how='any')

In [14]:
# Preview the first rows after removing release_date and dropping null rows.
df_games.head()

Unnamed: 0,genres,app_name,title,early_access,id,year
0,Action,Lost Summoner Kitty,Lost Summoner Kitty,False,761140,2018
0,Casual,Lost Summoner Kitty,Lost Summoner Kitty,False,761140,2018
0,Indie,Lost Summoner Kitty,Lost Summoner Kitty,False,761140,2018
0,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,False,761140,2018
0,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,False,761140,2018


In [15]:
# Show the (rows, columns) dimensions of the DataFrame at this point.
df_games.shape

(55612, 6)

In [16]:
# Summarise columns, dtypes and non-null counts before exporting.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55612 entries, 0 to 22529
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        55612 non-null  object
 1   app_name      55612 non-null  object
 2   title         55612 non-null  object
 3   early_access  55612 non-null  bool  
 4   id            55612 non-null  int64 
 5   year          55612 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 2.6+ MB


In [17]:
# With a lighter DataFrame and no null values left, write the final CSV.
# The index is omitted from the output file.
df_games.to_csv(path_or_buf='games_final.csv', index=False)