# Games metadata
The dataset: https://www.kaggle.com/datasets/fronkongames/steam-games-dataset?select=games.json.

This notebook cleans the dataset and saves the result to `dataset/games_metadata_cleaned.csv`.

# Imports

In [4]:
%reset -f

import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare dataset

In [5]:
# Read the raw Steam metadata (a JSON object keyed by appid) into a dict.
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which can fail or mis-decode non-ASCII text in a UTF-8 file.
with open('../dataset/games_metadata.json', encoding='utf-8') as f:
    games_metadata = json.load(f)

len(games_metadata)

85103

Remove games with 20 or fewer reviews in total (positive + negative).

In [6]:
# Collect the ids to drop first; deleting entries while iterating over the
# dict itself would raise a RuntimeError.
rem = []
for appid, gm in games_metadata.items():
    review_count = gm['positive'] + gm['negative']
    if review_count <= 20:
        rem.append(appid)

# Remove the collected games from the metadata in place.
for key in rem:
    games_metadata.pop(key)

len(games_metadata)

32932

Load the JSON metadata into a pandas DataFrame.

In [7]:
# Module-level accumulators filled by convert_json_to_pandas(). `appids` and
# `descriptions` are read again by the TF-IDF cell further down, so they are
# kept global rather than local to the function.
appids = []
prices = []
positive = []
negative = []
positive_percs = []
average_playtimes = []
descriptions = []

def convert_json_to_pandas(metadata=None):
    """Flatten the games metadata mapping into a numeric pandas DataFrame.

    As a side effect, the module-level lists (``appids``, ``prices``,
    ``positive``, ``negative``, ``positive_percs``, ``average_playtimes`` and
    ``descriptions``) are emptied and refilled, one entry per game, in the
    iteration order of ``metadata``. Resetting them on every call keeps the
    function re-runnable: previously a second call appended to the old
    contents and produced duplicated rows.

    Parameters
    ----------
    metadata : dict, optional
        Mapping of appid (anything ``int()`` accepts) to a game record. Each
        record must provide ``price``, ``positive``, ``negative``,
        ``average_playtime_forever``, ``about_the_game`` and
        ``detailed_description``. Defaults to the global ``games_metadata``.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``appid``, ``price``,
        ``positive_perc`` and ``average_plt``.

    Raises
    ------
    KeyError
        If a record lacks one of the required fields.
    """
    if metadata is None:
        metadata = games_metadata

    # Clear in place (not rebind) so other references to these lists stay valid.
    for accumulator in (appids, prices, positive, negative,
                        positive_percs, average_playtimes, descriptions):
        accumulator.clear()

    for appid, gm in metadata.items():
        appids.append(int(appid))
        prices.append(gm['price'])
        positive.append(gm['positive'])
        negative.append(gm['negative'])
        # Share of positive reviews, with 1 added to both the positive count
        # and the total so the ratio is always defined and never exactly zero.
        positive_percs.append((1 + gm['positive']) / (1 + gm['positive'] + gm['negative']))
        average_playtimes.append(gm['average_playtime_forever'])

        # Keep whichever of the two description fields is longer;
        # ties go to the detailed description.
        atg = gm['about_the_game']
        dd = gm['detailed_description']
        descriptions.append(atg if len(atg) > len(dd) else dd)

    return pd.DataFrame({
        'appid': appids,
        'price': prices,
        'positive_perc': positive_percs,
        'average_plt': average_playtimes,
    })

In [8]:
# Build the base feature frame (appid, price, positive_perc, average_plt)
# from the metadata that is left after the low-review filter.
games = convert_json_to_pandas()
games

Unnamed: 0,appid,price,positive_perc,average_plt
0,655370,0.99,0.915254,0
1,1139950,0.00,0.864407,0
2,1469160,0.00,0.642336,0
3,1659180,10.99,0.758621,0
4,1178150,14.99,0.927711,0
...,...,...,...,...
32927,2160220,7.99,0.960000,0
32928,2487350,17.76,0.949126,0
32929,2642700,7.64,0.623188,0
32930,2674190,2.54,0.909091,0


In [16]:
# Keep wide frames on one line when they are printed as plain text.
pd.set_option('display.expand_frame_repr', False)

# Summary statistics for the numeric columns, one row per column.
summary_columns = ["price", "positive_perc", "average_plt"]
numeric_summary = games[summary_columns].describe().T
print(numeric_summary)

                 count        mean          std       min       25%       50%         75%       max
price          32932.0    9.405919    11.044177  0.000000  1.790000  5.990000   14.990000     299.9
positive_perc  32932.0    0.767837     0.172347  0.037037  0.671233  0.806452    0.903226       1.0
average_plt    32932.0  266.511569  1822.692519  0.000000  0.000000  0.000000  174.000000  145727.0


Vectorize the descriptions with TF-IDF and attach the resulting features to the dataframe.

In [6]:
# Fit TF-IDF on the game descriptions, capping the vocabulary at 100 terms,
# and densify the result so it can be wrapped in a DataFrame.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions).toarray()

# One row per game (indexed by appid), one `tfidf_<i>` column per term.
tfidf_df = pd.DataFrame(
    tfidf_matrix,
    index=appids,
    columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
)

tfidf_df

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_90,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99
655370,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.182488,0.000000,0.000000,0.000000,0.000000,0.243500,0.000000,0.000000,0.271614,0.293291
1139950,0.207205,0.142348,0.000000,0.090211,0.000000,0.166137,0.054517,0.000000,0.079458,0.000000,...,0.000000,0.000000,0.000000,0.127388,0.247487,0.084773,0.125806,0.000000,0.063041,0.136144
1469160,0.000000,0.000000,0.000000,0.046001,0.000000,0.000000,0.417002,0.088347,0.040518,0.054282,...,0.000000,0.000000,0.000000,0.000000,0.063101,0.086457,0.128305,0.201286,0.289319,0.069424
1659180,0.091695,0.000000,0.000000,0.000000,0.000000,0.147042,0.048251,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.111449,0.112746,0.000000,0.150060,0.111346,0.174681,0.278977,0.180745
1178150,0.000000,0.000000,0.109616,0.000000,0.000000,0.065291,0.428504,0.000000,0.062454,0.000000,...,0.000000,0.000000,0.000000,0.100126,0.097262,0.000000,0.000000,0.077564,0.247749,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160220,0.000000,0.000000,0.048968,0.000000,0.046765,0.116669,0.287134,0.060833,0.167397,0.112131,...,0.133846,0.000000,0.044214,0.044729,0.000000,0.059532,0.066260,0.069299,0.000000,0.047803
2487350,0.000000,0.098376,0.096381,0.062344,0.000000,0.000000,0.452118,0.059867,0.109825,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.085518,0.000000,0.000000,0.204596,0.043567,0.047044
2642700,0.000000,0.000000,0.000000,0.000000,0.063206,0.039422,0.258724,0.041111,0.037709,0.101037,...,0.000000,0.129854,0.119518,0.060455,0.000000,0.040231,0.059704,0.000000,0.000000,0.000000
2674190,0.133215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.136876,...,0.000000,0.175916,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162119,0.000000


In [7]:
# Index the games by appid so the join lines each game up with its TF-IDF row
# (tfidf_df is indexed by appid as well).
# NOTE: not re-runnable as-is -- after this cell 'appid' is the index, so a
# second run of set_index('appid') raises KeyError.
games = games.set_index('appid').join(tfidf_df)
games

Unnamed: 0_level_0,price,positive_perc,average_plt,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,...,tfidf_90,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655370,0.99,0.915254,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.182488,0.000000,0.000000,0.000000,0.000000,0.243500,0.000000,0.000000,0.271614,0.293291
1139950,0.00,0.864407,0,0.207205,0.142348,0.000000,0.090211,0.000000,0.166137,0.054517,...,0.000000,0.000000,0.000000,0.127388,0.247487,0.084773,0.125806,0.000000,0.063041,0.136144
1469160,0.00,0.642336,0,0.000000,0.000000,0.000000,0.046001,0.000000,0.000000,0.417002,...,0.000000,0.000000,0.000000,0.000000,0.063101,0.086457,0.128305,0.201286,0.289319,0.069424
1659180,10.99,0.758621,0,0.091695,0.000000,0.000000,0.000000,0.000000,0.147042,0.048251,...,0.000000,0.000000,0.111449,0.112746,0.000000,0.150060,0.111346,0.174681,0.278977,0.180745
1178150,14.99,0.927711,0,0.000000,0.000000,0.109616,0.000000,0.000000,0.065291,0.428504,...,0.000000,0.000000,0.000000,0.100126,0.097262,0.000000,0.000000,0.077564,0.247749,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160220,7.99,0.960000,0,0.000000,0.000000,0.048968,0.000000,0.046765,0.116669,0.287134,...,0.133846,0.000000,0.044214,0.044729,0.000000,0.059532,0.066260,0.069299,0.000000,0.047803
2487350,17.76,0.949126,0,0.000000,0.098376,0.096381,0.062344,0.000000,0.000000,0.452118,...,0.000000,0.000000,0.000000,0.000000,0.085518,0.000000,0.000000,0.204596,0.043567,0.047044
2642700,7.64,0.623188,0,0.000000,0.000000,0.000000,0.000000,0.063206,0.039422,0.258724,...,0.000000,0.129854,0.119518,0.060455,0.000000,0.040231,0.059704,0.000000,0.000000,0.000000
2674190,2.54,0.909091,0,0.133215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.175916,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162119,0.000000


We split the data into bins according to price and average playtime. After testing, I found these bin edges to work best.

In [8]:
# Bin edges are right-inclusive: price <= 0, (0, 10], (10, 20], above 20.
price_bins = [-float('inf'), 0.00, 10.00, 20.00, float('inf')]
price_labels = [f'price_{i}' for i in range(len(price_bins) - 1)]

# Replace the raw price with one indicator column per bin, then make every
# column float.
games = (
    games
    .assign(price_bin=pd.cut(games['price'], bins=price_bins, labels=price_labels))
    .drop(columns=['price'])
    .pipe(pd.get_dummies, columns=['price_bin'])
    .astype('float')
)

games

Unnamed: 0_level_0,positive_perc,average_plt,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99,price_bin_price_0,price_bin_price_1,price_bin_price_2,price_bin_price_3
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655370,0.915254,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.243500,0.000000,0.000000,0.271614,0.293291,0.0,1.0,0.0,0.0
1139950,0.864407,0.0,0.207205,0.142348,0.000000,0.090211,0.000000,0.166137,0.054517,0.000000,...,0.247487,0.084773,0.125806,0.000000,0.063041,0.136144,1.0,0.0,0.0,0.0
1469160,0.642336,0.0,0.000000,0.000000,0.000000,0.046001,0.000000,0.000000,0.417002,0.088347,...,0.063101,0.086457,0.128305,0.201286,0.289319,0.069424,1.0,0.0,0.0,0.0
1659180,0.758621,0.0,0.091695,0.000000,0.000000,0.000000,0.000000,0.147042,0.048251,0.000000,...,0.000000,0.150060,0.111346,0.174681,0.278977,0.180745,0.0,0.0,1.0,0.0
1178150,0.927711,0.0,0.000000,0.000000,0.109616,0.000000,0.000000,0.065291,0.428504,0.000000,...,0.097262,0.000000,0.000000,0.077564,0.247749,0.000000,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160220,0.960000,0.0,0.000000,0.000000,0.048968,0.000000,0.046765,0.116669,0.287134,0.060833,...,0.000000,0.059532,0.066260,0.069299,0.000000,0.047803,0.0,1.0,0.0,0.0
2487350,0.949126,0.0,0.000000,0.098376,0.096381,0.062344,0.000000,0.000000,0.452118,0.059867,...,0.085518,0.000000,0.000000,0.204596,0.043567,0.047044,0.0,0.0,1.0,0.0
2642700,0.623188,0.0,0.000000,0.000000,0.000000,0.000000,0.063206,0.039422,0.258724,0.041111,...,0.000000,0.040231,0.059704,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0
2674190,0.909091,0.0,0.133215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.162119,0.000000,0.0,1.0,0.0,0.0


In [9]:
# Bin edges are right-inclusive: <= 0, (0, 10], (10, 100], (100, 1000], above 1000.
average_plt_bins = [-float('inf'), 0, 10, 100, 1000, float('inf')]
average_plt_labels = [f'average_plt_{i}' for i in range(len(average_plt_bins) - 1)]

# Replace the raw average playtime with one indicator column per bin, then
# make every column float.
games = (
    games
    .assign(average_plt_bin=pd.cut(games['average_plt'], bins=average_plt_bins, labels=average_plt_labels))
    .drop(columns=['average_plt'])
    .pipe(pd.get_dummies, columns=['average_plt_bin'])
    .astype('float')
)

games

Unnamed: 0_level_0,positive_perc,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,...,tfidf_99,price_bin_price_0,price_bin_price_1,price_bin_price_2,price_bin_price_3,average_plt_bin_average_plt_0,average_plt_bin_average_plt_1,average_plt_bin_average_plt_2,average_plt_bin_average_plt_3,average_plt_bin_average_plt_4
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655370,0.915254,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.293291,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1139950,0.864407,0.207205,0.142348,0.000000,0.090211,0.000000,0.166137,0.054517,0.000000,0.079458,...,0.136144,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1469160,0.642336,0.000000,0.000000,0.000000,0.046001,0.000000,0.000000,0.417002,0.088347,0.040518,...,0.069424,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1659180,0.758621,0.091695,0.000000,0.000000,0.000000,0.000000,0.147042,0.048251,0.000000,0.000000,...,0.180745,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1178150,0.927711,0.000000,0.000000,0.109616,0.000000,0.000000,0.065291,0.428504,0.000000,0.062454,...,0.000000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160220,0.960000,0.000000,0.000000,0.048968,0.000000,0.046765,0.116669,0.287134,0.060833,0.167397,...,0.047803,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2487350,0.949126,0.000000,0.098376,0.096381,0.062344,0.000000,0.000000,0.452118,0.059867,0.109825,...,0.047044,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2642700,0.623188,0.000000,0.000000,0.000000,0.000000,0.063206,0.039422,0.258724,0.041111,0.037709,...,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2674190,0.909091,0.133215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
# Write the final feature table; index=True keeps appid (the frame's index)
# as the first column of the CSV.
games.to_csv("../dataset/games_metadata_cleaned.csv", index=True)