# Extract Features from the Steam Dataset

Credit: Professor Julian McAuley CSE158 Homework Source Code

In [1]:
import numpy as np
import pandas as pd
import json
import ast
import os
import statistics
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Read In Dataset

Read in the user_items dataset and the steam_games dataset line by line, and store them in two lists.

In [2]:
# Resolve the data files relative to the directory that holds the notebook.
notebook_path = os.path.abspath("Steam_Data_Processor.ipynb")
notebook_dir = os.path.dirname(notebook_path)
user_item_file_path = os.path.join(notebook_dir, "data/australian_users_items.json")
steam_games_file_path = os.path.join(notebook_dir, "data/steam_games.json")

In [3]:
# Each line of the file is parsed as one Python literal (one record per line).
with open(user_item_file_path, 'r') as data:
    users_items = [ast.literal_eval(record_line) for record_line in data]

In [4]:
# Each line of the file is parsed as one Python literal (one record per line).
with open(steam_games_file_path, 'r') as file:
    games_data = [ast.literal_eval(record_line) for record_line in file]

In [5]:
# Index games by their 'id'; records without an id are set aside separately.
games_dict = {}
games_no_id = []
for game in games_data:
    if 'id' not in game:
        games_no_id.append(game)
        continue
    # Seed the aggregate fields that the extraction step fills in later.
    game['popularity'] = 0
    game['total_playtime'] = 0
    games_dict[game['id']] = game

# Feature Extraction for both Users and Items

Extract a list of features for users (users_meta_data):
- how many games a user owned under each developer
- how much time the user spent on each developer's games
- what genres does the user prefer (genres - count, playtime)
- what tags does the user prefer (tags - count, playtime)
- what specs does the user prefer (specs - count, playtime)
- total amount of money spent on all games (total_cost)
- total amount of time spent on all games (total_playtime)

Extract a list of features for items (games_dict):
- name (app_name)
- developer (developer)
- early access (early_access)
- genres
- id
- median playtime (median_playtime)
- metascore
- owners
- popularity
- price
- publisher
- release date (release_date)
- reviews url (reviews_url)
- sentiment
- specs
- tags
- title
- total_playtime
- url

In [6]:
def extractItemPrice(item_id, games_dict):
    """Return the price of a game, or 0 when no float price is recorded.

    Args:
        item_id: key of the game in ``games_dict``.
        games_dict: mapping of item id to the game's metadata dict.

    Returns:
        The stored ``'price'`` when it is exactly a ``float``; otherwise 0
        (missing price, or a price stored as any other type).

    Raises:
        KeyError: if ``item_id`` is not in ``games_dict``.
    """
    price = games_dict[item_id].get('price')
    # Only genuine floats count; every other value is treated as free.
    return float(price) if type(price) is float else 0

In [7]:
def extractItemPlaytimeAndPopularity(item_id, item, games_dict):
    """Accumulate one ownership record into a game's aggregate statistics.

    Adds the record's ``'playtime_forever'`` to the game's
    ``'total_playtime'`` and increments its ``'popularity'`` by one, creating
    either field (starting from 0) when the game entry does not have it yet.

    Args:
        item_id: key of the game in ``games_dict``.
        item: a user's ownership record; must contain ``'playtime_forever'``.
        games_dict: mapping of item id to the game's metadata dict; the entry
            for ``item_id`` is updated in place.

    Raises:
        KeyError: if ``item_id`` is not in ``games_dict`` or ``item`` has no
            ``'playtime_forever'``.
    """
    # The membership test must be made on the game's own entry, not on the
    # id-keyed catalogue; testing the catalogue always took the "first time"
    # path and overwrote the running totals instead of adding to them.
    game = games_dict[item_id]
    game['total_playtime'] = game.get('total_playtime', 0) + item['playtime_forever']
    game['popularity'] = game.get('popularity', 0) + 1

In [8]:
def collectOwners(item, item_id, user_id, games_dict):
    """Record a user's playtime in the game's ``'owners'`` mapping.

    Args:
        item: a user's ownership record; must contain ``'playtime_forever'``.
        item_id: key of the game in ``games_dict``.
        user_id: key under which the playtime is stored.
        games_dict: mapping of item id to the game's metadata dict; the
            ``'owners'`` dict of the entry is created if absent and updated
            in place.
    """
    game = games_dict[item_id]
    playtime = item['playtime_forever']
    # A later record for the same user replaces the earlier playtime.
    game.setdefault('owners', {})[user_id] = playtime

In [9]:
def extractItemCategoricalData(field, item, item_id, user_id, games_dict, users_meta_data):
    """Tally a game's categorical values into the owning user's profile.

    For every value listed under ``field`` in the game's entry, the user's
    ``[field]['count']`` is incremented by one and ``[field]['playtime']`` is
    increased by the record's ``'playtime_forever'``. Nothing happens when the
    game has no such field.

    Args:
        field: name of the categorical field; the same name is used to read
            the game's entry and to select the user's sub-dict.
        item: a user's ownership record; must contain ``'playtime_forever'``.
        item_id: key of the game in ``games_dict``.
        user_id: key of the user in ``users_meta_data``.
        games_dict: mapping of item id to the game's metadata dict.
        users_meta_data: mapping of user id to that user's feature dict,
            updated in place.
    """
    game = games_dict[item_id]
    if field not in game:
        return

    user_field = users_meta_data[user_id][field]
    counts = user_field['count']
    playtimes = user_field['playtime']

    for value in game[field]:
        if value not in counts:
            # First time this user owns a game carrying this value.
            counts[value] = 1
            playtimes[value] = item['playtime_forever']
        else:
            counts[value] += 1
            playtimes[value] += item['playtime_forever']

In [10]:
def _accumulateDeveloper(item, item_id, user_id, games_dict, users_meta_data):
    """Tally the game's singular 'developer' value into the user's 'developers' stats."""
    developer = games_dict[item_id].get('developer')
    if developer is None:
        return
    # A plain string is a single developer name; it must not be iterated
    # character by character.
    if isinstance(developer, (list, tuple, set)):
        names = developer
    else:
        names = [developer]

    developer_stats = users_meta_data[user_id]['developers']
    counts = developer_stats['count']
    playtimes = developer_stats['playtime']
    playtime = item['playtime_forever']
    for name in names:
        counts[name] = counts.get(name, 0) + 1
        playtimes[name] = playtimes.get(name, 0) + playtime


def extract(item, user_id, games_dict, users_meta_data):
    """Fold one ownership record into both the user and the game statistics.

    Adds the record's playtime to the user's ``'total_playtime'``, registers
    the game in ``games_dict`` (with only a ``'title'``) when it is unknown,
    updates the game's playtime/popularity/owners, adds the game's price to
    the user's ``'total_cost'``, and tallies the game's genres, tags, specs
    and developer into the user's profile.

    Args:
        item: a user's ownership record; reads ``'item_id'``,
            ``'playtime_forever'`` and, for unknown games, ``'item_name'``.
        user_id: key of the user in ``users_meta_data``; the entry must
            already exist.
        games_dict: mapping of item id to the game's metadata dict, updated
            in place.
        users_meta_data: mapping of user id to that user's feature dict,
            updated in place.
    """
    item_id = item['item_id']
    users_meta_data[user_id]['total_playtime'] += item['playtime_forever']

    if item_id not in games_dict:
        # Game is owned by a user but absent from the catalogue: keep a stub.
        games_dict[item_id] = {
            'title': item['item_name']
        }

    extractItemPlaytimeAndPopularity(item_id, item, games_dict)
    collectOwners(item, item_id, user_id, games_dict)
    users_meta_data[user_id]['total_cost'] += extractItemPrice(item_id, games_dict)
    extractItemCategoricalData('genres', item, item_id, user_id, games_dict, users_meta_data)
    extractItemCategoricalData('tags', item, item_id, user_id, games_dict, users_meta_data)
    extractItemCategoricalData('specs', item, item_id, user_id, games_dict, users_meta_data)
    # Kept for any record that carries a plural 'developers' collection.
    extractItemCategoricalData('developers', item, item_id, user_id, games_dict, users_meta_data)
    # The game record inspected in this notebook stores the developer under
    # the singular key 'developer', which the call above never matches; without
    # this step the user's per-developer features stay empty.
    _accumulateDeveloper(item, item_id, user_id, games_dict, users_meta_data)

In [11]:
def usersItemsMetaDataExtractor(users_items, games_dict):
    """Build the per-user feature dictionary from the raw user/item records.

    Each user gets an entry holding empty ``'count'``/``'playtime'`` tallies
    for genres, tags, specs and developers, zeroed ``'total_playtime'`` and
    ``'total_cost'``, and the record's ``'items_count'``; every owned item is
    then passed to ``extract`` to fill those in.

    Args:
        users_items: iterable of user records, each with ``'user_id'``,
            ``'items_count'`` and ``'items'``.
        games_dict: mapping of item id to the game's metadata dict; passed
            through to ``extract``, which updates it in place.

    Returns:
        Dict mapping each user id to that user's feature dict.
    """
    categorical_fields = ('genres', 'tags', 'specs', 'developers')
    users_meta_data = {}
    for user in users_items:
        user_id = user['user_id']

        # NOTE(review): a user_id that appears more than once has its profile
        # rebuilt from scratch by the later record, while extract() is still
        # called again for its items.
        profile = {
            field: {'count': {}, 'playtime': {}}
            for field in categorical_fields
        }
        profile['total_playtime'] = 0
        profile['total_cost'] = 0.0
        # Stored on the user's own entry; writing it at the top level would
        # add a bogus 'items_count' "user" that every iteration overwrites.
        profile['items_count'] = user['items_count']
        users_meta_data[user_id] = profile

        for item in user['items']:
            extract(item, user_id, games_dict, users_meta_data)

    return users_meta_data

In [12]:
# Build the per-user features. This call also updates games_dict in place:
# owners, total_playtime and popularity are written onto each game, and
# title-only entries are added for owned games missing from the catalogue.
users_meta_data = usersItemsMetaDataExtractor(users_items, games_dict)

In [13]:
def medianItemPlaytime(item):
    """Return the median playtime among owners who actually played the game.

    Args:
        item: a game's metadata dict; its optional ``'owners'`` entry maps
            user ids to playtimes.

    Returns:
        The median of the strictly positive playtimes, or 0 when the game has
        no ``'owners'`` entry or no owner with a positive playtime.
    """
    if 'owners' not in item:
        return 0

    # Owners with zero playtime are excluded from the median.
    played = [playtime for playtime in item['owners'].values() if playtime > 0]
    if not played:
        return 0
    return statistics.median(played)

In [14]:
# Attach the median owner playtime to every game entry.
for game_info in games_dict.values():
    game_info['median_playtime'] = medianItemPlaytime(game_info)

In [15]:
# Spot-check the extracted metadata of a single game (id '70').
games_dict['70']

{'app_name': 'Half-Life',
 'developer': 'Valve',
 'early_access': False,
 'genres': ['Action'],
 'id': '70',
 'median_playtime': 160,
 'metascore': 96,
 'owners': {'76561197970982479': 0,
  'js41637': 0,
  'Riot-Punch': 0,
  'doctr': 108,
  'NitemarePK': 0,
  'themanwich': 32,
  'maplemage': 10,
  'corrupted_soul': 227,
  'cadmusthreepointoh': 36,
  '76561198089393905': 0,
  'WeiEDKrSat': 1,
  'thequeenpanda': 633,
  'death-hunter': 27,
  'bbaekhyun': 0,
  'Fr0stedLine': 1489,
  'diego9031': 3,
  'UTNerd24': 16,
  'kube134': 4,
  'Posiblydead': 33,
  'sad-commie': 0,
  '76561198030567998': 21,
  'therealmorty': 890,
  'itslonk': 8,
  '76561198056857968': 451,
  'xfluttersx': 622,
  'Xx-Woods': 99,
  'jonasdbomb': 0,
  'Sanctus94': 0,
  '76561197990792016': 1841,
  'AVATAR715': 4,
  'Magjiikal': 168,
  'ohnospaghet': 392,
  'exiaez': 0,
  'EizanAratoFujimaki': 1395,
  'ZyhgarThestarweaver': 33,
  'washington_': 33,
  'CobRazor': 539,
  'PabloSanches': 95,
  '76561198069713121': 0,
  '13

In [16]:
# Persist the per-user features as JSON.
with open('data/users_meta_data.json', mode='w') as users_out:
    json.dump(users_meta_data, fp=users_out)

In [17]:
# Persist the per-game features as JSON.
with open('data/items_meta_data.json', mode='w') as items_out:
    json.dump(games_dict, fp=items_out)