In [28]:
import pandas as pd
import matplotlib.pyplot as plt

Read the data, using the Kaggle dataset overview as a guide for parsing, then store it in a DataFrame.

In [30]:
import pandas as pd
import json

# Load the raw export: a JSON object keyed by AppID, one record per game
with open('games.json', 'r', encoding='utf-8') as fin:
    dataset = json.load(fin)

# (DataFrame column, JSON key) pairs whose values are copied across unchanged.
# NOTE(review): 'DiscountDLC count' is filled from dlc_count; the label looks
# like a header slip but is left as-is here -- confirm before renaming.
PLAIN_FIELDS = [
    ('Name', 'name'),
    ('Release date', 'release_date'),
    ('Estimated owners', 'estimated_owners'),
    ('Peak CCU', 'peak_ccu'),
    ('Required age', 'required_age'),
    ('Price', 'price'),
    ('DiscountDLC count', 'dlc_count'),
    ('About the game', 'detailed_description'),
    ('Supported languages', 'supported_languages'),
    ('Full audio languages', 'full_audio_languages'),
    ('Reviews', 'reviews'),
    ('Header image', 'header_image'),
    ('Website', 'website'),
    ('Support url', 'support_url'),
    ('Support email', 'support_email'),
    ('Windows', 'windows'),
    ('Mac', 'mac'),
    ('Linux', 'linux'),
    ('Metacritic score', 'metacritic_score'),
    ('Metacritic url', 'metacritic_url'),
    ('User score', 'user_score'),
    ('Positive', 'positive'),
    ('Negative', 'negative'),
    ('Score rank', 'score_rank'),
    ('Achievements', 'achievements'),
    ('Recommendations', 'recommendations'),
    ('Notes', 'notes'),
    ('Average playtime forever', 'average_playtime_forever'),
    ('Average playtime two weeks', 'average_playtime_2weeks'),
    ('Median playtime forever', 'median_playtime_forever'),
    ('Median playtime two weeks', 'median_playtime_2weeks'),
]

# (DataFrame column, JSON key) pairs for collections flattened to comma-separated text
JOINED_FIELDS = [
    ('Developers', 'developers'),
    ('Publishers', 'publishers'),
    ('Categories', 'categories'),
    ('Genres', 'genres'),
    ('Tags', 'tags'),
    ('Screenshots', 'screenshots'),
    ('Movies', 'movies'),
]


def _flatten(values, stringify=False):
    """Join a collection into one comma-separated string; falsy input yields ''."""
    if not values:
        return ''
    if stringify:
        return ','.join(str(value) for value in values)
    return ','.join(values)


# One flat dictionary per game, in the order the games appear in the file
games_data = []
for app_id, game in dataset.items():
    # Tags are the only collection whose members are passed through str() first
    joined = {
        column: _flatten(game[key], stringify=(key == 'tags'))
        for column, key in JOINED_FIELDS
    }

    record = {'AppID': app_id}
    record.update((column, game[key]) for column, key in PLAIN_FIELDS)
    record.update(joined)
    games_data.append(record)

# Build the DataFrame in a single step from the collected records
df = pd.DataFrame(games_data)

Preprocess the text in the `About the game` column and save the result in the `processed_description` column.

Preprocessing does:
- convert to lower case
- remove anything that is not a letter
- remove extra whitespace
- tokenize by splitting on whitespace
- remove stopwords belonging to nltk.corpus.stopwords.words('english')
- Lemmatize with WordNetLemmatizer()

In [31]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Fetch each NLTK resource in turn; nltk reports when a package is already up to date
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

def clean_text(text):
    """
    Normalise raw text to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, accented letters, ...) is treated as a word separator, and
    leading/trailing/repeated whitespace is collapsed.
    """
    lowered = text.lower()

    # Swap each unwanted character for a space so neighbouring words stay apart
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)

    # split() with no arguments drops empty pieces, which collapses whitespace runs
    words = letters_only.split()
    return ' '.join(words)

# Shared NLP resources, built on first use so they are created once rather
# than once per description.
_STOP_WORDS = None
_LEMMATIZER = None


def _get_nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first call."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def process_description(text):
    """
    Turn a raw game description into a space-separated string of lemmas.

    Steps: clean the text with clean_text, split it on whitespace, drop NLTK
    English stop words, and lemmatize what remains with WordNetLemmatizer.

    Args:
        text: Raw description string.

    Returns:
        The processed tokens joined by single spaces ('' if nothing survives).
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Simple tokenization: clean_text leaves words separated by single spaces
    tokens = clean_text(text).split()

    # Filter stop words and lemmatize in a single pass over the tokens
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmas)

def process_game_data(df):
    """
    Return a copy of ``df`` with an added 'processed_description' column.

    The new column holds process_description applied to each 'About the game'
    value (converted with str first); null values become an empty string.
    The input frame itself is left untouched.
    """
    def _describe(raw):
        # Null descriptions are never sent through the text pipeline
        if pd.notnull(raw):
            return process_description(str(raw))
        return ''

    # Work on a copy so the caller's frame does not gain the new column
    result = df.copy()
    result['processed_description'] = result['About the game'].apply(_describe)
    return result

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ben/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ben/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [32]:
# Run the description preprocessing over the whole games DataFrame
processed_df = process_game_data(df) # takes about a minute and 20 seconds.

In [33]:
# Preview the processed descriptions to sanity-check the preprocessing
print(processed_df['processed_description'])

0        galactic bowling exaggerated stylized bowling ...
1        law look showdown atop train last fight good l...
2        jolt project army new robotics project jolt co...
3        henosis mysterious platform puzzler player pro...
4        game play hacker arranged deal gangster protag...
                               ...                        
97405    femdom game world fascinating series video gam...
97406    join discord server game enter charming world ...
97407    mission brief deployed heavily guarded compoun...
97408    welcome escape garage thrilling escape game se...
97409    scan brain lobe organize amp clear dangerous n...
Name: processed_description, Length: 97410, dtype: object


Building the index with more fields than we actually need takes about seven and a half minutes. My guess is that most of that time goes to storing the description text.

In [40]:
from whoosh.index import create_in
from whoosh.fields import *
import shutil
import os

# Define the schema for game data.
# NOTE(review): short_description is declared here but no value is supplied
# for it when documents are added below -- confirm whether it is still wanted.
schema = Schema(
    app_id=ID(stored=True),
    name=TEXT(stored=True),
    # description=TEXT(stored=True),
    processed_description=TEXT(stored=True),
    short_description=TEXT(stored=True),
    developers=KEYWORD(stored=True, commas=True),
    publishers=KEYWORD(stored=True, commas=True),
    categories=KEYWORD(stored=True, commas=True),
    genres=KEYWORD(stored=True, commas=True),
    tags=KEYWORD(stored=True, commas=True),
    price=NUMERIC(stored=True, numtype=float),
    release_date=TEXT(stored=True),
    languages=KEYWORD(stored=True, commas=True),
    metacritic_score=NUMERIC(stored=True)
)

# Create index directory
if os.path.exists("index"):
    shutil.rmtree("index")  # Removes directory and all contents for overwriting previous index

os.mkdir("index")

# Create the index
ix = create_in("index", schema)

print("adding docs to writer")
# Use the writer as a context manager: it commits when the block finishes
# normally and cancels (releasing the index write lock) if adding a document
# raises, so a failed run does not leave a half-open writer behind.
with ix.writer() as writer:
    # Add documents from the processed DataFrame, one per game
    for _, game in processed_df.iterrows():
        writer.add_document(
            app_id=str(game['AppID']),
            name=game['Name'],
            # description=game['About the game'],
            processed_description=game['processed_description'],
            developers=game['Developers'],
            publishers=game['Publishers'],
            categories=game['Categories'],
            genres=game['Genres'],
            tags=game['Tags'],
            price=float(game['Price']),
            release_date=game['Release date'],
            languages=game['Supported languages'],
            metacritic_score=int(game['Metacritic score'])
        )

    # The commit itself happens when the with-block exits
    print("commiting to index")

adding docs to writer
commiting to index


Example query using Whoosh. I think a good next step is to modify the MyIndexReader from assignment three so that it returns postings (or something along those lines) for terms in the processed descriptions. We also need the description length and the collection length in order to perform Dirichlet smoothing. For the PowerPoint, it could also be interesting to make some basic plots of the distribution of word frequencies, the distribution of categories/genres, etc.

In [41]:
# Example search usage:
from whoosh.qparser import QueryParser

# The default search field must be one the schema defines. The raw
# "description" field is commented out of the schema, so search the indexed
# "processed_description" field instead.
# NOTE(review): that field holds lemmatized, stop-word-free text; consider
# running queries through process_description so terms match the same form.
with ix.searcher() as searcher:
    query = QueryParser("processed_description", ix.schema).parse("multiplayer action")
    results = searcher.search(query)
    # Print the stored name and AppID of each hit
    for r in results:
        print(f"{r['name']} - {r['app_id']}")