In [1]:
import gzip
import string
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats in DataFrame/Series output with three decimals.
pd.set_option('display.float_format', _three_decimals)

In [2]:
def readGz(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Path of the gzip file to read.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression.

    Notes
    -----
    The file is opened inside a ``with`` block so the handle is closed as soon
    as the generator is exhausted (or closed early), instead of being left
    open for the garbage collector.
    """
    with gzip.open(path, 'r') as g:
        for l in g:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on trusted dumps; if every line is a plain Python
            # literal, ast.literal_eval would be a safer drop-in -- TODO confirm.
            yield eval(l)

In [3]:
# Materialise both gzip dumps in memory, one evaluated record per line.
usersItems = list(readGz("australian_users_items.json.gz"))
userReviews = list(readGz("australian_user_reviews.json.gz"))

In [4]:
# Flatten the nested records: one (user id, game name, playtime) tuple per
# entry in each user's 'items' list.
features = [
    (user['user_id'], item['item_name'], item['playtime_forever'])
    for user in usersItems
    for item in user['items']
]

In [33]:
# One row per (user, game) tuple, ordered from highest to lowest playtime and
# re-indexed 0..n-1 after the sort.
df = (
    pd.DataFrame(features, columns=['User', 'Game', 'Playtime(min)'])
    .sort_values(by='Playtime(min)', ascending=False)
    .reset_index(drop=True)
)

# Split rows into copies with zero recorded playtime and all remaining copies.
zero_playtime = df['Playtime(min)'] == 0
df_0 = df.loc[zero_playtime]
df_1 = df.loc[~zero_playtime]
df.head(10)

Unnamed: 0,User,Game,Playtime(min)
0,wolop,Garry's Mod,642773
1,Evilutional,Mabinogi,635295
2,76561198019826668,Call of Duty: Black Ops - Multiplayer,632452
3,76561198039832932,Garry's Mod,613411
4,tsunamitad,Universe Sandbox,600068
5,jimmynoe,Garry's Mod,551719
6,shinomegami,EVE Online,530882
7,lildoughnut,Garry's Mod,501498
8,ThisIsWhereIGetOff,Garry's Mod,495058
9,76561197977470391,Half-Life 2: Deathmatch,493791


In [22]:
# Number of distinct values in the 'User' column (missing values, if any,
# count as one value, exactly like len(unique())).
df['User'].nunique(dropna=False)

70912

In [7]:
# Number of distinct values in the 'Game' column (missing values, if any,
# count as one value, exactly like len(unique())).
df['Game'].nunique(dropna=False)

10947

In [16]:
# Row count of df_1, i.e. game copies whose playtime is not zero.
len(df_1)

3285246

In [17]:
# Row count of df_0, i.e. game copies whose playtime is exactly zero.
len(df_0)

1867963

In [15]:
# Row count of the full table: every (user, game) copy, played or not.
len(df)

5153209

In [18]:
# Summary statistics for df_1 (copies with non-zero playtime). With default
# arguments describe() reports only the numeric columns of a mixed-dtype frame.
df_1.describe()

Unnamed: 0,Playtime(min)
count,3285246.0
mean,1555.251
std,6721.032
min,1.0
25%,44.0
50%,205.0
75%,808.0
max,642773.0


In [19]:
# Median playtime over copies with non-zero playtime (df_1); shown alongside
# the mean because the mean is pulled up by a small number of very large values.
df_1['Playtime(min)'].median()

205.0

In [59]:
# Build the per-copy table used for game-level aggregation: every 'User' value
# is replaced by the constant 1, so summing that column per game counts copies.
#
# .assign() returns a NEW frame. A plain `df_game = df` only creates a second
# name for the same object, so writing to df_game['User'] would also overwrite
# the user ids stored in `df` and silently break any cell that reads
# df['User'] afterwards or on a re-run.
df_game = df.assign(User=1)
df_game.head(10)

Unnamed: 0,User,Game,Playtime(min)
0,1,Garry's Mod,642773
1,1,Mabinogi,635295
2,1,Call of Duty: Black Ops - Multiplayer,632452
3,1,Garry's Mod,613411
4,1,Universe Sandbox,600068
5,1,Garry's Mod,551719
6,1,EVE Online,530882
7,1,Garry's Mod,501498
8,1,Garry's Mod,495058
9,1,Half-Life 2: Deathmatch,493791


In [28]:
# Collapse to one row per game by summing every remaining column, then rank
# the games by their summed playtime (largest first) with a fresh 0..n-1 index.
per_game_totals = df_game.groupby(by='Game').sum().reset_index()
df_game = (
    per_game_totals
    .sort_values(by='Playtime(min)', ascending=False)
    .reset_index(drop=True)
)
df_game.head(10)

Unnamed: 0,Game,User,Playtime(min)
0,Counter-Strike: Global Offensive,43776,785184267
1,Garry's Mod,43301,448366616
2,Terraria,29239,154974541
3,The Elder Scrolls V: Skyrim,22285,136678626
4,Warframe,25807,124027703
5,Counter-Strike: Source,24220,112612047
6,Left 4 Dead 2,37044,102189423
7,PAYDAY 2,23729,99763914
8,Sid Meier's Civilization V,15303,82380684
9,Rust,16201,81120416


In [47]:
# Row count of the full per-copy table (zero-playtime rows included).
totalPlayed = df.shape[0]
# Array of [game, summed 'User' value] pairs, kept in df_game's current row order.
mostPopular = df_game.loc[:, ['Game', 'User']].to_numpy()

In [56]:
# Walk mostPopular in order, collecting game names until the running total of
# their counts first exceeds half of totalPlayed. The game that crosses the
# threshold is included in the set.
return1 = set()
count = 0
for game_name, copies in mostPopular:
    return1.add(game_name)
    count += copies
    if count > totalPlayed/2:
        break

In [58]:
# Display the set of selected game names (a set, so display order is not the
# order in which the names were added).
return1

{'7 Days to Die',
 'APB Reloaded',
 'ARK: Survival Evolved',
 'Ace of Spades',
 'AdVenture Capitalist',
 'Age of Empires II: HD Edition',
 'Age of Empires Online',
 'Age of Empires® III: Complete Collection',
 'Age of Mythology: Extended Edition',
 'Age of Wonders III',
 'AirMech',
 'Alan Wake',
 'Alien: Isolation',
 'Aliens vs. Predator',
 "America's Army: Proving Grounds",
 'Amnesia: The Dark Descent',
 'Anno 2070',
 'ArcheAge',
 'Arma 2',
 'Arma 2: DayZ Mod',
 'Arma 2: Operation Arrowhead',
 'Arma 2: Operation Arrowhead Beta (Obsolete)',
 'Arma 3',
 "Assassin's Creed Brotherhood",
 "Assassin's Creed II",
 "Assassin's Creed IV Black Flag",
 "Assassin's Creed Revelations",
 "Assassin's Creed Unity",
 "Assassin's Creed® III",
 'Assetto Corsa',
 'Audiosurf',
 'Aura Kingdom',
 'Awesomenauts',
 'BLOCKADE 3D',
 'BRINK',
 'Banished',
 'Bastion',
 'Batman: Arkham Asylum GOTY Edition',
 'Batman: Arkham City GOTY',
 'Batman: Arkham City™',
 'Batman™: Arkham Knight',
 'Batman™: Arkham Origins',