In [40]:
import os
import json
from pandas import json_normalize
import pandas as pd
import datetime

In [93]:
# Load the raw games JSON into memory.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8).
first_file = './M244_Project/data/games.json'
with open(first_file, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

In [94]:
# Keys that are removed from every game record before the DataFrame is built.
unnecessary_vars = [
    'packages',
    'screenshots',
    'movies',
    'header_image',
    'website',
    'support_url',
    'notes',
    'support_email',
    'required_age',
    'metacritic_url',
    'detailed_description',
    'about_the_game',
    'achievements',
    'full_audio_languages',
    'dlc_count',
    'supported_languages',
    'developers',
]

In [95]:
# Flatten the {app_id: game_info} mapping into one record per game.
games = []
for app_id, game_info in json_data.items():
    # Start from the game's own fields, minus the ones listed in unnecessary_vars.
    record = {key: value for key, value in game_info.items() if key not in unnecessary_vars}

    # 'tags' is only unpacked when it is a dict; any other value yields empty lists.
    tags = game_info.get('tags', {})
    if isinstance(tags, dict):
        record['tags'] = list(tags.keys())
        record['tag_frequencies'] = list(tags.values())
    else:
        record['tags'] = []
        record['tag_frequencies'] = []

    record['app_id'] = app_id
    games.append(record)

# One row per game record.
df = pd.DataFrame(games)

In [25]:
# Write the frame as it stands at this point in the notebook to 'games.csv'
# in the current working directory.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an extra first column — confirm that is what readers of this file expect.
df.to_csv('games.csv')

In [96]:
# Keep only games with a positive median playtime, restricted to the columns
# used by the rest of the analysis.
analysis_columns = [
    'name', 'release_date', 'price', 'windows', 'mac', 'linux', 'publishers',
    'genres', 'positive', 'negative', 'estimated_owners',
    'median_playtime_forever', 'peak_ccu',
]
has_playtime = df['median_playtime_forever'] > 0

# A single .loc selection followed by .copy() gives an independent frame, so
# the column assignments in later cells write to this frame itself rather than
# to a derived slice (the pattern that triggers SettingWithCopyWarning).
df = df.loc[has_playtime, analysis_columns].copy()

In [99]:
# Data wrangling: keep only the year component of the release date.
# format='mixed' lets pandas infer the date format for each value individually.
parsed_release = pd.to_datetime(df['release_date'], format='mixed')
df['release_year'] = parsed_release.dt.year

In [100]:
def get_system(df, col_list):
    """Count, per row, how many of the columns in ``col_list`` are set.

    Each column named in ``col_list`` is cast to ``int`` and written back, then
    a new ``'compatible_systems'`` column holding the row-wise sum of those
    columns is added.

    Args:
        df: DataFrame containing every column named in ``col_list``.
        col_list: Names of the columns to cast and sum.

    Returns:
        The same ``df`` object that was passed in (it is modified in place).
    """
    # Cast one column at a time, writing each result back before the next.
    for flag_col in col_list:
        int_flags = df[flag_col].astype(int)
        df[flag_col] = int_flags

    n_supported = df[col_list].sum(axis=1)
    df['compatible_systems'] = n_supported
    return df

# Add the per-game count of supported systems.
system_columns = ['windows', 'mac', 'linux']
df = get_system(df, system_columns)

In [101]:
# Share of positive counts among all (positive + negative) counts.
total_reviews = df['positive'] + df['negative']
df['positive_rate'] = df['positive'] / total_reviews

In [102]:
# Store estimated_owners as a categorical column (categories inferred from the data).
df['estimated_owners'] = df['estimated_owners'].astype('category')

In [103]:
def _keep_first_genre(genre_list):
    """Return a one-element slice of ``genre_list`` if it has more than one entry, else return it unchanged."""
    if len(genre_list) > 1:
        return genre_list[:1]
    return genre_list


# Reduce every multi-genre entry to its first genre only.
df['genres'] = df['genres'].apply(_keep_first_genre)

In [104]:
# Remove the raw columns now that the derived columns have been computed from them.
consumed_columns = ['release_date', 'windows', 'mac', 'linux', 'positive', 'negative']
df = df.drop(columns=consumed_columns)

In [105]:
# Summary statistics for every column: include='all' makes describe() cover
# non-numeric columns as well as numeric ones.
# NOTE(review): the unique/top/freq statistics for object columns need hashable
# cell values. 'genres' is handled as a sequence earlier in this notebook, so
# confirm it (and any other list-valued column) does not make this call raise.
table_summary_stats = df.describe(include='all')