def developer( desarrollador : str ): Cantidad de items y porcentaje de contenido Free por año según empresa desarrolladora. Ejemplo de salida:

In [3]:
import pandas as pd
import numpy as np
import re

In [89]:
# Leer dataframe con columnas necesarias
columns = ['id', 'price', 'developer','title', 'release_date']
games = pd.read_csv('steam_games.csv', usecols=columns).rename(columns={'id':'item_id'})
games.sample(10)

Unnamed: 0,title,release_date,price,item_id,developer
2974,USFIV: Challengers Wild Pack 2,2014-12-16,3.99,331356.0,Capcom
21514,Super Jagua,2016-09-15,0.99,514890.0,cbritez
12245,Forever Home Soundtrack,2017-11-07,2.99,749190.0,Pixel Blade Games
10092,DC Wonder: Unlimited,2017-06-14,4.99,652160.0,Nucleax
8661,,,Free To Play,588290.0,
4204,Rocksmith® 2014 – Hit Singles Song Pack,2015-08-18,9.99,369807.0,Ubisoft - San Francisco
23178,Project G,2016-05-13,7.99,465670.0,Nebula Interactive
28179,Door Kickers,2014-10-20,19.99,248610.0,KillHouse Games
11950,Tametsi,2017-10-18,2.99,709920.0,Grip Top Games
27009,Flamberge Demo,2015-01-01,,358010.0,Hydezeke


In [90]:
# Función para corregir el precio, en el que aparecen valores numéricos y en formato string
def fix_price(df):

    errors_list = []
    for i in df['price']:
        try:
            float(i)
        except:
            errors_list.append(i)

    errors = set(errors_list)
    #uniques_not_free = ['Starting at $499.00', 'Starting at $449.00']
    df['price_fixed'] = df['price'].apply(lambda x: 0 if x in errors 
                                                        else 499.0 if x=='Starting at $499.00'
                                                        else 449.0 if x=='Starting at $449.00'
                                                        else x)
    df['price_fixed'] = df['price_fixed'].astype(float)
    return df

In [91]:
# Función para convertir la columna fecha en dtype datetime
def safe_date_convert(df, date_column):
    def convert(x):
        try:
            return pd.to_datetime(x)
        except ValueError:
            # Try to find a year pattern in the string
            year_pattern = re.search(r'\b\d{4}\b', x)
            if year_pattern:
                # Convert the found year into a datetime format
                return pd.to_datetime(year_pattern.group(0), format='%Y')
            else:
                # If no year pattern is found, return NaT
                return pd.NaT

    df['date_fixed'] = df[date_column].astype(str).apply(convert)
    return df

In [92]:
# Transformar la columna price a numérico
games=fix_price(games)
# Transformas la columna de fechas a dtype datetime
games = safe_date_convert(games, 'release_date')

  return pd.to_datetime(x)


In [93]:
# Obtener el año de lanzamiento
games['release_year'] = games['date_fixed'].dt.year
# Eliminar columnas innecesarias
games.drop(columns=['release_date','price','date_fixed'],inplace=True)
games.drop(columns=['title'], inplace=True)
games.head()

Unnamed: 0,item_id,developer,price_fixed,release_year
0,761140.0,Kotoshiro,4.99,2018.0
1,643980.0,Secret Level SRL,0.0,2018.0
2,670290.0,Poolians.com,0.0,2017.0
3,767400.0,彼岸领域,0.99,2017.0
4,773570.0,,2.99,


In [94]:
# Imputar los nulos de precio como 0 y los na de release_year por la moda
# Los nulos de desarrollador no puedo imputarlos, así que los elimino.
games.dropna(subset=['developer'], inplace=True)
games['price_fixed'].fillna(0,inplace=True)
games['release_year'].fillna(games['release_year'].mode().iloc[0],inplace=True)

In [95]:
# Verificar y eliminar duplicado
games.duplicated().sum()
games.drop_duplicates(inplace=True)

In [96]:
# Agrupar por desarrollador para calcular el numero de videojuegos lanzados por año
dev = games.groupby(['developer', 'release_year'], as_index=False).agg({'item_id':'count'}).rename(columns={'item_id':'item_count'})
# Calcular número de juegos gratis
free_games = games[games['price_fixed']==0].groupby(['developer','release_year'], as_index=False)['item_id'].agg('count').rename(columns={'item_id':'free'})
# Merge de los dos df (left join)
dev = pd.merge(dev, free_games, on=['developer', 'release_year'], how='left')
# Imputar nulos de la columna free con 0
dev['free'].fillna(0,inplace=True)
# Calcular porcentaje de videojuegos gratis por desarrollador por año
dev['porcentaje_free'] = dev['free'] / dev['item_count'] * 100
# Sample
dev.sample(10)

Unnamed: 0,developer,release_year,item_count,free,porcentaje_free
704,"Amaranth Games, LLC",2016.0,1,0.0,0.0
6460,"J. Fletcher,Michael Flynn",2016.0,2,0.0,0.0
8434,Myoubouh Corp,2016.0,1,0.0,0.0
1787,"Blue Tongue Entertainment,Blitworks",2017.0,2,0.0,0.0
1638,Black Jacket Studios,2009.0,1,0.0,0.0
11003,Savant,2017.0,4,0.0,0.0
4200,Eric Billingsley,2011.0,1,0.0,0.0
5567,Gunsquad,2017.0,4,2.0,50.0
8612,Neognosis,2016.0,1,0.0,0.0
14529,disco.zone,2017.0,1,1.0,100.0


In [97]:
# Convertir el año a entero
dev['release_year'] = dev['release_year'].astype(int)
# Convertir los nombres de desarrolladores a minúscula y eliminar espacios en blanco
dev['developer'] = dev['developer'].str.strip().str.lower()
# Redondear porcentaje de videojuegos gratis
dev['porcentaje_free'] = dev['porcentaje_free'].round(2)
# Eliminar la columna free
dev.drop(columns=['free'], inplace=True)
dev.sample(10)

Unnamed: 0,developer,release_year,item_count,porcentaje_free
3347,dead mage,2015,2,0.0
10528,"richmakegame,colin northway",2014,1,0.0
3299,david ruland,2017,1,0.0
12983,totallynotreptillian,2016,1,0.0
1864,boomzap entertainment,2016,6,16.67
3144,dagger games,2010,1,0.0
14188,yakiniku banzai,2017,1,0.0
5907,horizon studio,2017,1,0.0
5046,gfi russia,2008,1,0.0
2652,colin northway with art by thomas shahan,2013,1,0.0


In [98]:
# Guardar el df final a un csv que consumirá la API
dev.to_csv('df-funcion-5.csv',index=False)

In [101]:
# Función que retorna la cantidad de videojuegos lanzados en un año dado
# y el porcentaje de videojuegos gratis
def developer(desarrollador : str):
    df = pd.read_csv('df-funcion-5.csv')
    desarrollador = desarrollador.strip().lower()
    if df['developer'].str.contains(desarrollador).any():
        data = df[df['developer'] == desarrollador]
        data = data.sort_values('release_year', ascending=False)
        response = data[['release_year','item_count','porcentaje_free']].to_dict(orient='records')
        return response
    else:
        return 'Error: Developer not found'

In [102]:
# Probar la función
developer('capcom')

[{'release_year': 2017, 'item_count': 40, 'porcentaje_free': 2.5},
 {'release_year': 2016, 'item_count': 3, 'porcentaje_free': 0.0},
 {'release_year': 2016, 'item_count': 9, 'porcentaje_free': 11.11},
 {'release_year': 2015, 'item_count': 33, 'porcentaje_free': 0.0},
 {'release_year': 2014, 'item_count': 21, 'porcentaje_free': 0.0},
 {'release_year': 2013, 'item_count': 14, 'porcentaje_free': 0.0},
 {'release_year': 2011, 'item_count': 8, 'porcentaje_free': 0.0},
 {'release_year': 2009, 'item_count': 3, 'porcentaje_free': 0.0},
 {'release_year': 2008, 'item_count': 2, 'porcentaje_free': 0.0}]