In [142]:
#!pip install tensorflow
#!pip install dotenv
#!pip install sklearn
#!pip install requests

In [130]:
#Packages used for this project (tensorflow must be installed prior to running this cell using i.e. !pip install)
import os
import requests
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import time
from dotenv import load_dotenv
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
#Extracting the data for one game to look at its general structure for extracting data of 1000 games later on.
app_id = '1284210'
url = f'https://store.steampowered.com/api/appdetails/?appids={app_id}'
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    if app_id in data and 'data' in data[app_id]:
        game_data = data[app_id]['data']
        json_str = json.dumps(game_data, indent=4)
        print(json_str)
    else:
        print('Failed to retrieve game information.')
else:
    print('API request failed.')

{
    "type": "game",
    "name": "Guild Wars 2",
    "steam_appid": 1284210,
    "required_age": 0,
    "is_free": true,
    "dlc": [
        1996840,
        2008610,
        2450010,
        2486290
    ],
    "detailed_description": "<img src=\"https://cdn.akamai.steamstatic.com/steam/apps/1284210/extras/GW2-01-GettingStarted-EN__1___1_.gif?t=1687884630\" /><br><br>Guild Wars 2's open world is all about discovery and exploration. Check your content guide for suggestions when you set out on your adventures, consult your compass to find interesting landmarks\u2026or just pick your favorite direction to travel in and let adventure find you. Tyria is full of characters with their own stories and goals, and you'll be rewarded for helping them out\u2014or thwarting their plans\u2014by completing renown hearts and dynamic events. Read our new player guide for more tips!<br><br><img src=\"https://cdn.akamai.steamstatic.com/steam/apps/1284210/extras/GW2-02-CustomizeYourGameplay-EN__1___1_.g

In [3]:
#Extracting the complete list of apps on Steam (this code only returns a list of app names and their ids, further queries required)
url = 'http://api.steampowered.com/ISteamApps/GetAppList/v0002/?format=json'
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    game_list = data['applist']['apps']
    with open('./data/JSON/game_list.json', 'w') as file:
        json.dump(game_list, file)
        print('Game list saved successfully.')
else:
    print('Failed to retrieve the game list.')


Game list saved successfully.


In [48]:
curated_game_list = []
exception_indexes = []

with open('./data/JSON/game_list.json', 'r') as games_file:
    game_list = json.load(games_file)
#25200
#172611 Is the number of entries on game_list
i = 163900
#while i < len(game_list) - 1:
while i < 172611:
    current_game = {}
    game_name = game_list[i]['name']
    
    if game_name != '' and game_name[0] != '\\' and game_name[0] != '/':
        try:
            app_id = str(game_list[i]['appid'])
            url = f'https://store.steampowered.com/api/appdetails/?appids={app_id}'
            response = requests.get(url)
            time.sleep(1.6)  # Add a delay of 1.6 seconds between API requests
            # Remove the BOM from the response content
            response_text = response.content.decode('utf-8-sig')

            data = json.loads(response_text)
        except:
            exception_indexes.append(i)
            i += 1
            continue
        
        try:
            if app_id in data and 'data' in data[app_id]:
                game_data = data[app_id]['data']
                if game_data['type'] == 'game' and 'categories' in game_data and 'genres' in game_data:
                    current_game['name'] = game_data['name']
                    current_game['steam_appid'] = game_data['steam_appid']
                    current_game['metascore'] = game_data.get('metacritic', {}).get('score', -1)
                    current_game['categories'] = game_data['categories']
                    current_game['genres'] = game_data['genres']
                    current_game['recommendations'] = game_data.get('recommendations', -1)
                    curated_game_list.append(current_game)
        except:
            i += 1
            continue
    i += 1
    
filename = './data/JSON/curated_game_list.json'

with open(filename, "r") as file:
    data = json.load(file)

data.extend(curated_game_list)

with open(filename, "w") as file:
    json.dump(data, file)
    
print("Extended the curated_game_list")

#with open(filename, 'w') as file:
#    json.dump(curated_game_list, file)

Extended the curated_game_list


In [56]:
#Getting the user's library in a similar fashion as the creation of the games JSON
def getUserLibrary(steam_id, api_key):
    user_library_info = []
    url = f'https://api.steampowered.com/IPlayerService/GetOwnedGames/v1/?key={api_key}&steamid={steam_id}&format=json&include_appinfo=True'
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        game_list = data['response']['games']
        
        # Filter the games with playtime_forever > 300
        filtered_games = [game for game in game_list if game['playtime_forever'] > 300]
        
        #Only save the games that meet certain requirements
        for i in range(len(filtered_games)):
            current_user_game = {}
            app_id = str(filtered_games[i]["appid"])
            url = f'https://store.steampowered.com/api/appdetails/?appids={app_id}'
            response = requests.get(url)
            data = response.json()
            time.sleep(1.6)
            if app_id in data and 'data' in data[app_id] and data != None:
                game_data = data[app_id]['data']
                if game_data['type'] == 'game' and 'categories' in game_data and 'genres' in game_data:
                    current_user_game['name'] = game_data['name']
                    current_user_game['steam_appid'] = game_data['steam_appid']
                    current_user_game['metascore'] = game_data.get('metacritic', {}).get('score', -1)
                    current_user_game['categories'] = [category['description'] for category in game_data['categories']]
                    current_user_game['genres'] = [genre['description'] for genre in game_data['genres']]
                    current_user_game['recommendations'] = game_data.get('recommendations', -1)
                    user_library_info.append(current_user_game)
        # Save user's library to a JSON file
        with open('./data/JSON/user_library_Survax.json', 'w') as file:
            json.dump(user_library_info, file, indent=4)
            print("User's library saved to user_library_Survax.json")
    else:
        print(f"API request failed with status code {response.status_code}.")

In [57]:
#Use environment variables for security
load_dotenv()
os.environ['STEAM_USER_ID'] = os.getenv("STEAM_USER_ID")
os.environ['STEAM_API_KEY'] = os.getenv("STEAM_API_KEY")
getUserLibrary(os.environ['STEAM_USER_ID'], os.environ['STEAM_API_KEY'])

User's library saved to user_library.json


In [106]:
def preprocessInputFeatures(X):
    # Preprocess the 'categories' column
    categories_encoded = []
    for row in X['categories']:
        if isinstance(row[0], dict):  # Check if the row contains dictionaries
            category_descriptions = [cat['description'] for cat in row]
        else:  # If the row contains strings, use them directly
            category_descriptions = row
        categories_encoded.append(category_descriptions)
        
    #This approach of checking if there are dictionaries or strings is a way to hack around a bug I encountered
    #The data is a list of dictionaries but for some reason at times it's a list of strings
    
    # Preprocess the 'genres' column
    genres_encoded = []
    for row in X['genres']:
        if isinstance(row[0], dict):  # Check if the row contains dictionaries
            genre_descriptions = [genre['description'] for genre in row]
        else:  # If the row contains strings, use them directly
            genre_descriptions = row
        genres_encoded.append(genre_descriptions)

    # One-hot encode the 'categories' column
    categories_encoder = MultiLabelBinarizer()
    categories_encoded = categories_encoder.fit_transform(categories_encoded)
    # Convert to DataFrame
    categories_df = pd.DataFrame(categories_encoded, columns=categories_encoder.classes_)
    
    # One-hot encode the 'genres' column
    genres_encoder = MultiLabelBinarizer()
    genres_encoded = genres_encoder.fit_transform(genres_encoded)
    # Convert to DataFrame
    genres_df = pd.DataFrame(genres_encoded, columns=genres_encoder.classes_)

    # Concatenate the encoded features with the remaining columns in X
    X_encoded = pd.concat([categories_df, genres_df], axis=1)

    return X_encoded, categories_encoder, genres_encoder

In [137]:
def buildTrainRecommendationModel():
    # Load and preprocess the 1000 games dataset
    with open('./data/JSON/curated_game_list.json', 'r') as file:
        games_data = json.load(file)

    # Convert JSON data to pandas DataFrame
    df = pd.DataFrame(games_data)

    # Prepare the data for training
    X = df.drop(columns=['name', 'steam_appid'])   # Input features

    # Preprocess the input features
    X_encoded, categories_encoder, genres_encoder = preprocessInputFeatures(X)

    # Prepare the target variable
    y = df['steam_appid']

    # Create a dictionary to map game IDs to integer indices
    unique_game_ids = y.unique()
    #game_id_map = {game_id: index for index, game_id in enumerate(y)}
    
    game_id_map = {game_id: index for index, game_id in enumerate(unique_game_ids)}

    # Convert game IDs to integer indices
    y_encoded = np.array([game_id_map[game_id] for game_id in y])
    
    print("game_id_map length: ", len(game_id_map))
    print(y_encoded.min(), y_encoded.max())

    # Define the model architecture and compile it
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_encoded.shape[1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(len(game_id_map), activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


    # Train the model
    #model.fit(X_encoded, y_encoded, epochs=10, batch_size=32)
    model.fit(X_encoded, y_encoded, epochs=15, batch_size=64)
    
    # Get the input column names
    input_columns = X.columns.tolist()

    return model, categories_encoder, genres_encoder, X_encoded, game_id_map

In [138]:
def prepareUserLibrary(user_library, categories_encoder, genres_encoder, input_columns):
    game_features_list = []

    for game_data in user_library:
        # Get the categories and genres for the game
        categories = [str(category['id']) for category in game_data['categories']]
        genres = [str(genre['id']) for genre in game_data['genres']]

        # One-hot encode the categories
        categories_encoded = categories_encoder.transform([categories])

        # One-hot encode the genres
        genres_encoded = genres_encoder.transform([genres])

        # Create a dictionary with the desired columns
        game_features = {}
        game_features.update(dict(zip(categories_encoder.classes_, categories_encoded[0])))
        game_features.update(dict(zip(genres_encoder.classes_, genres_encoded[0])))

        # Add the game features to a list
        game_features_list.append(game_features)

    # Create a DataFrame from the list of game features
    user_library_df = pd.DataFrame(game_features_list, columns=input_columns)
    
    # Fill NaN values with zeros (since the encoders would not generate columns for non-existing features)
    user_library_df.fillna(0, inplace=True)

    return user_library_df

In [139]:
#This function sioi used to make the input dataframe and user library dataframe the same shape
def align_features(X_train_encoded, X_user_encoded):
    # Get the feature names from the training data and user data
    train_features = set(X_train_encoded.columns)
    user_features = set(X_user_encoded.columns)

    # Get the features present in the training data but not in the user data
    missing_features = train_features - user_features

    # Add these missing features to the user data with a default value of 0
    for feature in missing_features:
        X_user_encoded[feature] = 0

    # Reorder the columns of the user data to match the training data
    X_user_encoded = X_user_encoded[X_train_encoded.columns]

    return X_user_encoded

In [118]:
# Load the raw JSON data
with open('./data/JSON/curated_game_list.json', 'r') as f:
    data = json.load(f)

# Convert the list of dictionaries into a DataFrame
df = pd.json_normalize(data)

# Check the first few rows to understand the structure
print(df.head())

# Create the mapping dictionary
game_id_to_name_map = pd.Series(df['name'].values, index=df['steam_appid']).to_dict()

                 name  steam_appid  metascore  \
0    Now Testing: 407      1609310         -1   
1          LAST CRASH      1609370         -1   
2            DataJack      1609390         -1   
3  Cemetery Warrior V      1609440         -1   
4       The Frog Game      1609470         -1   

                                          categories  \
0        [{'id': 2, 'description': 'Single-player'}]   
1  [{'id': 2, 'description': 'Single-player'}, {'...   
2        [{'id': 2, 'description': 'Single-player'}]   
3  [{'id': 2, 'description': 'Single-player'}, {'...   
4        [{'id': 2, 'description': 'Single-player'}]   

                                              genres  recommendations  \
0  [{'id': '1', 'description': 'Action'}, {'id': ...             -1.0   
1  [{'id': '1', 'description': 'Action'}, {'id': ...             -1.0   
2  [{'id': '1', 'description': 'Action'}, {'id': ...             -1.0   
3  [{'id': '1', 'description': 'Action'}, {'id': ...             -1.0   
4  

In [140]:
def makeRecommendations(model, user_library_features, X_train_encoded, game_id_map):
    # Convert user_library_features to float
    user_library_features = user_library_features.astype(float)

    # Align the features of the user's library with the training data
    user_library_aligned = align_features(X_train_encoded, user_library_features)

    # Make predictions based on the user's library
    predictions = model.predict(user_library_aligned)
    
    # Create a reverse mapping from index to game_id
    index_to_game_id = {index: game_id for game_id, index in game_id_map.items()}

    # Get the indices of the top 10 predicted values across all predictions
    top10_indices = np.argpartition(predictions.flatten(), -10)[-10:]
    
    # Get the unique game IDs corresponding to the top 10 indices
    unique_top10_game_ids = np.unique([index_to_game_id[idx % len(index_to_game_id)] for idx in top10_indices])
    
    recommended_game_names = [game_id_to_name_map[int(game_id)] for game_id in unique_top10_game_ids]
    
    print('Recommended Game Names:')
    print(recommended_game_names)

    print('Recommended App IDs:')
    print(unique_top10_game_ids)

In [141]:
model, categories_encoder, genres_encoder, X_encoded, game_id_map = buildTrainRecommendationModel()

# Preprocess the user's library
with open('./data/JSON/user_library.json', 'r') as file:
    user_library = json.load(file)

# Prepare the user's library data for prediction
user_library_features, _, _ = preprocessInputFeatures(pd.DataFrame(user_library))

makeRecommendations(model, user_library_features, X_encoded, game_id_map)

game_id_map length:  70717
0 70716
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Recommended Game Names:
['Portal', 'Portal 2', 'Fallout: New Vegas', 'LEGO® Marvel™ Super Heroes', 'Nex Machina', 'ENGAGE', 'Cook, Serve, Delicious! 3?!', 'Shovel Knight Showdown', 'Louie', 'Portal with RTX']
Recommended App IDs:
[    400     620   22380  249130  404540  449130 1000030 1116770 1172480
 2012840]


  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_encoded[feature] = 0
  X_user_e

In [100]:
df['name'] = df['name'].astype(str)
df['steam_appid'] = df['steam_appid'].astype(int)

# Check for duplicate rows based on a subset of columns
duplicates = df.duplicated(subset=['name', 'steam_appid'])

# Display the duplicate rows
print(df[duplicates])

# Remove duplicate rows
df = df.drop_duplicates(subset=['name', 'steam_appid'])

# Display the first few rows of the dataframe after removing duplicates
print(df.head())

# Save the cleaned data back to a JSON file
df.to_json('./data/JSON/curated_game_list.json', orient='records', lines=True)


                                                name  steam_appid  metascore  \
13215  Shadow of the Tomb Raider: Definitive Edition       750920         77   
13216  Shadow of the Tomb Raider: Definitive Edition       750920         77   
13217  Shadow of the Tomb Raider: Definitive Edition       750920         77   
13218  Shadow of the Tomb Raider: Definitive Edition       750920         77   
13219  Shadow of the Tomb Raider: Definitive Edition       750920         77   
...                                              ...          ...        ...   
82156  Shadow of the Tomb Raider: Definitive Edition       750920         77   
82940                        Train Simulator Classic        24010         -1   
83438                  Dark Messiah of Might & Magic         2100         72   
83654                                 System Shock 2       238210         92   
87976                Tom Clancy's Rainbow Six® Siege       359550         -1   

                                       