In [None]:
pip install pandas requests tqdm

In [None]:
pip install pandas requests tqdm beautifulsoup4 lxml

In [None]:
import pandas as pd
import requests
import json
import time
import os
from tqdm import tqdm
from bs4 import BeautifulSoup

# --- Configuration ---
# CSV read by main(); it must contain an 'id' column holding Steam App IDs.
INPUT_FILENAME = 'games_list.csv'
# CSV written by main() with one row per successfully fetched game.
OUTPUT_FILENAME = 'game_features.csv'
# main() sleeps this long after every App ID, whether or not the fetch succeeded.
API_CALL_DELAY_SECONDS = 1.5  # Delay between requests to be respectful to Steam's servers.

def _fetch_api_details(app_id: int) -> dict | None:
    """Return the structured appdetails record for *app_id*, or None if unavailable."""
    api_url = f"https://store.steampowered.com/api/appdetails?appids={app_id}&cc=us"

    try:
        api_response = requests.get(api_url, timeout=15)
        api_response.raise_for_status()
        # A non-JSON body makes .json() raise a ValueError subclass; depending on the
        # installed requests version that is not necessarily a RequestException,
        # so ValueError is caught explicitly to keep one bad response from
        # aborting the whole batch.
        api_data = api_response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        tqdm.write(f"Error fetching API data for App ID {app_id}: {e}")
        return None

    # The payload is keyed by the App ID as a string; validate every level so a
    # malformed response is skipped instead of raising.
    entry = api_data.get(str(app_id)) if isinstance(api_data, dict) else None
    game_data = entry.get('data') if isinstance(entry, dict) and entry.get('success') else None
    if not isinstance(game_data, dict):
        tqdm.write(f"Warning: API data not found for App ID {app_id}. Skipping.")
        return None

    genres_list = game_data.get('genres', [])
    developer_genres = ', '.join(g['description'] for g in genres_list) if genres_list else 'N/A'

    if game_data.get('is_free', False):
        price_usd = 0.0
    elif 'price_overview' in game_data:
        # 'final' is divided by 100 to express the price in whole currency units.
        price_usd = game_data['price_overview'].get('final', 0) / 100.0
    else:
        price_usd = 'N/A'

    return {
        'id': app_id,
        'name': game_data.get('name', 'N/A'),
        'developer_genres': developer_genres,
        'price_usd': price_usd,
        'metacritic_score': game_data.get('metacritic', {}).get('score', 'N/A'),
        'release_date': game_data.get('release_date', {}).get('date', 'N/A'),
        'developers': ', '.join(game_data.get('developers', ['N/A'])),
        'publishers': ', '.join(game_data.get('publishers', ['N/A'])),
    }


def _scrape_user_tags(app_id: int) -> str:
    """Return the store page's user tags as a JSON array string ('[]' on any failure)."""
    store_url = f"https://store.steampowered.com/app/{app_id}/"
    user_tags = '[]'  # Default to an empty JSON array string

    try:
        # This cookie is crucial to bypass age verification pages for mature games.
        cookies = {'birthtime': '568022401', 'wants_mature_content': '1'}
        store_response = requests.get(store_url, cookies=cookies, timeout=15)
        store_response.raise_for_status()

        soup = BeautifulSoup(store_response.text, 'lxml')

        # User tags are the hyperlinks carrying the CSS class 'app_tag'.
        tag_elements = soup.find_all('a', class_='app_tag')
        if tag_elements:
            tag_list = [tag.get_text(strip=True) for tag in tag_elements]
            user_tags = json.dumps(tag_list)
    except Exception as e:
        # Deliberately broad: scraping is best-effort, so a missing page, a layout
        # change or a network error only costs the tags, never the API record.
        tqdm.write(f"Warning: Could not scrape tags for App ID {app_id}. Reason: {e}")

    return user_tags


def get_game_details(app_id: int) -> dict | None:
    """
    Fetches details from the Steam API and scrapes the store page for user tags.

    Args:
        app_id: Steam App ID to look up.

    Returns:
        A dict with the keys 'id', 'name', 'developer_genres', 'price_usd',
        'metacritic_score', 'release_date', 'developers', 'publishers' and
        'user_tags' (a JSON array string), or None when the API request fails
        or reports no data for the App ID. A failed tag scrape still returns
        the API fields, with 'user_tags' set to '[]'.
    """
    game_details = _fetch_api_details(app_id)
    if game_details is None:
        # Without the API record there is nothing to attach tags to.
        return None

    game_details['user_tags'] = _scrape_user_tags(app_id)
    return game_details

def main():
    """
    Read App IDs from INPUT_FILENAME, fetch each game's details, and save them to OUTPUT_FILENAME.

    If the input file is missing, a small sample file is written instead and the
    function returns without fetching anything.
    """
    input_missing = not os.path.exists(INPUT_FILENAME)
    if input_missing:
        print(f"Error: Input file '{INPUT_FILENAME}' not found.")
        print(f"Creating a sample '{INPUT_FILENAME}' with some App IDs...")
        sample_rows = {
            'id': [620, 730, 578080],  # Portal 2, CS:GO, PUBG
            'some_other_column': ['will be ignored', 'also ignored', 'ignored'],
        }
        pd.DataFrame(sample_rows).to_csv(INPUT_FILENAME, index=False)
        print("Please populate this file with your desired Steam App IDs and run the script again.")
        return

    try:
        source_table = pd.read_csv(INPUT_FILENAME)
        if 'id' not in source_table.columns:
            print(f"Error: Input CSV '{INPUT_FILENAME}' must contain an 'id' column.")
            return
        # Drop blanks and duplicates before converting to plain Python ints.
        distinct_ids = source_table['id'].dropna().unique()
        app_ids = distinct_ids.astype(int).tolist()
    except Exception as e:
        print(f"Error reading or parsing '{INPUT_FILENAME}': {e}")
        return

    print(f"Found {len(app_ids)} unique App IDs to process.")

    fetched_records = []
    progress = tqdm(app_ids, desc="Fetching game data from Steam")
    for app_id in progress:
        record = get_game_details(app_id)
        if record:
            fetched_records.append(record)
        # Pause after every request, successful or not.
        time.sleep(API_CALL_DELAY_SECONDS)

    if not fetched_records:
        print("No data was successfully fetched. The output file will not be created.")
        return

    # Fix the column order of the output file.
    column_order = [
        'id', 'name', 'user_tags', 'developer_genres', 'price_usd',
        'metacritic_score', 'release_date', 'developers', 'publishers'
    ]
    output_df = pd.DataFrame(fetched_records)[column_order]

    output_df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')
    print(f"\n✅ Success! Processed {len(output_df)} games.")
    print(f"Data saved to '{OUTPUT_FILENAME}'.")

# Start the scrape only when this code runs as the top-level program,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

In [None]:
import pandas as pd
import numpy as np
import json
import os
import joblib
import argparse
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# --- Configuration ---
# Directories and the retrain flag come from argparse; only this threshold is hard-coded.
# main() skips any critic with fewer merged rating rows than this, and skips the
# regressor when fewer than half this many rows have a numeric score.
MIN_RATINGS_PER_CRITIC = 10

# preprocess_data and build_feature_pipeline are defined directly below;
# main() relies on both of them.
def preprocess_data(games_df, ratings_df):
    """Merge ratings with game features and derive model-ready columns.

    Args:
        games_df: Game features. Needs an 'id' (or 'appid') column plus
            'user_tags', 'developer_genres', 'developers', 'publishers',
            'metacritic_score', 'price_usd' and 'release_date'.
        ratings_df: Critic ratings. Needs 'game_id' and 'score' columns.

    Returns:
        One row per rating (left join on game_id == id) with these additions:
        'will_skip' (score missing or equal to 'skipped'), 'score_numeric',
        and 'release_year'. Text columns are cleaned into space-separated
        strings and the numeric columns are coerced to numbers (NaN on failure).
    """
    if 'appid' in games_df.columns:
        games_df = games_df.rename(columns={'appid': 'id'})
    merged_df = pd.merge(ratings_df, games_df, left_on='game_id', right_on='id', how='left')

    merged_df['will_skip'] = merged_df['score'].isnull() | (merged_df['score'] == 'skipped')
    merged_df['score_numeric'] = pd.to_numeric(merged_df['score'], errors='coerce')

    text_cols = ['user_tags', 'developer_genres', 'developers', 'publishers']
    for col in text_cols:
        # Ratings without a matching game get an empty tag list / empty text.
        merged_df[col] = merged_df[col].fillna('[]' if 'tags' in col else '')

    def parse_tags(tags_str):
        """Convert a serialized tag list into one space-separated string."""
        if not isinstance(tags_str, str):
            return ''
        # Try the text as strict JSON first. Swapping quotes up front would corrupt
        # valid JSON whose tags contain apostrophes (e.g. "Shoot 'Em Up") and lose
        # every tag of that game. The quote swap is kept only as a fallback for
        # Python-repr style lists such as "['RPG', 'Indie']".
        for candidate in (tags_str, tags_str.replace("'", "\"")):
            try:
                tags_list = json.loads(candidate)
            except ValueError:
                continue
            if isinstance(tags_list, list) and all(isinstance(tag, str) for tag in tags_list):
                return ' '.join(tags_list)
            return ''
        return ''

    merged_df['user_tags'] = merged_df['user_tags'].apply(parse_tags)

    for col in ['developer_genres', 'developers', 'publishers']:
        merged_df[col] = merged_df[col].str.replace(',', ' ')

    merged_df['metacritic_score'] = pd.to_numeric(merged_df['metacritic_score'], errors='coerce')
    merged_df['price_usd'] = pd.to_numeric(merged_df['price_usd'], errors='coerce')
    merged_df['release_year'] = pd.to_datetime(merged_df['release_date'], errors='coerce').dt.year
    return merged_df

def build_feature_pipeline():
    """Create the unfitted ColumnTransformer that turns game columns into model features.

    Numeric columns are median-imputed and standardized; each text column gets
    its own TF-IDF vectorizer. Any other column is dropped.
    """
    numeric_columns = ['metacritic_score', 'price_usd', 'release_year']
    impute_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # User tags get a larger vocabulary and English stop-word removal.
    tag_vectorizer = TfidfVectorizer(stop_words='english', max_features=100)

    column_steps = [
        ('num', impute_then_scale, numeric_columns),
        ('tags', tag_vectorizer, 'user_tags'),
    ]
    # The remaining text columns share the same smaller TF-IDF configuration.
    for step_name, column in (
        ('genres', 'developer_genres'),
        ('devs', 'developers'),
        ('pubs', 'publishers'),
    ):
        column_steps.append((step_name, TfidfVectorizer(max_features=50), column))

    return ColumnTransformer(transformers=column_steps, remainder='drop')


def _parse_tag_list(raw_tags):
    """Convert a serialized tag list into one space-separated string ('' if unparseable)."""
    if not isinstance(raw_tags, str):
        return ''
    # Strict JSON first; the quote swap is only a fallback for Python-repr style
    # lists, because applying it up front breaks tags that contain apostrophes.
    for candidate in (raw_tags, raw_tags.replace("'", "\"")):
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(parsed, list) and all(isinstance(tag, str) for tag in parsed):
            return ' '.join(parsed)
        return ''
    return ''


def _prepare_game_features(games_df):
    """Return one cleaned feature row per game, mirroring preprocess_data's column handling."""
    if 'appid' in games_df.columns:
        # preprocess_data accepts 'appid' as an alias for 'id'; do the same here.
        games_df = games_df.rename(columns={'appid': 'id'})
    features = games_df.drop_duplicates(subset=['id']).copy()

    text_cols = ['user_tags', 'developer_genres', 'developers', 'publishers']
    for col in text_cols:
        features[col] = features[col].fillna('[]' if 'tags' in col else '')
    features['user_tags'] = features['user_tags'].apply(_parse_tag_list)
    for col in ['developer_genres', 'developers', 'publishers']:
        features[col] = features[col].str.replace(',', ' ')

    features['metacritic_score'] = pd.to_numeric(features['metacritic_score'], errors='coerce')
    features['price_usd'] = pd.to_numeric(features['price_usd'], errors='coerce')
    features['release_year'] = pd.to_datetime(features['release_date'], errors='coerce').dt.year
    return features


def _skip_probabilities(clf_pipeline, features):
    """Return P(will_skip is True) per row, even if the classifier saw only one class."""
    probabilities = clf_pipeline.predict_proba(features)
    classes = list(clf_pipeline.steps[-1][1].classes_)
    if True in classes:
        return probabilities[:, classes.index(True)]
    # The critic never skipped a game in the training data, so there is no
    # "skip" column to read; report a probability of zero.
    return np.zeros(len(features))


def main(args):
    """Train or load per-critic models and write predictions for every game.

    Args:
        args: Namespace with 'data_dir' (holds games_details.csv and
            ratings.csv), 'models_dir' (joblib model cache), 'results_dir'
            (where critic_predictions.csv is written) and 'force_retrain'
            (ignore cached models when true).

    Critics with fewer than MIN_RATINGS_PER_CRITIC merged ratings are skipped,
    as are critics with too few numeric scores to train a regressor.
    """
    print("Loading data...")
    games_path = os.path.join(args.data_dir, 'games_details.csv')
    ratings_path = os.path.join(args.data_dir, 'ratings.csv')
    try:
        games_df = pd.read_csv(games_path)
        ratings_df = pd.read_csv(ratings_path)
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure your CSV files are in the '{args.data_dir}' directory.")
        return

    # Ensure output directories exist
    os.makedirs(args.models_dir, exist_ok=True)
    os.makedirs(args.results_dir, exist_ok=True)

    print("Preprocessing data...")
    data = preprocess_data(games_df, ratings_df)

    # Feature rows for every known game; predictions are generated for all of them.
    all_games_features = _prepare_game_features(games_df)

    # --- Model Training/Loading Loop ---
    critics = data['critic_id'].unique()
    all_predictions = []

    print(f"Found {len(critics)} critics. Starting model processing loop...")
    for critic_id in critics:
        critic_data = data[data['critic_id'] == critic_id]

        if len(critic_data) < MIN_RATINGS_PER_CRITIC:
            print(f"Skipping critic {critic_id}: not enough ratings.")
            continue

        clf_path = os.path.join(args.models_dir, f'{critic_id}_classifier.joblib')
        reg_path = os.path.join(args.models_dir, f'{critic_id}_regressor.joblib')

        # Check if models exist and we are not forcing a retrain
        if os.path.exists(clf_path) and os.path.exists(reg_path) and not args.force_retrain:
            print(f"Loading existing models for critic: {critic_id}")
            # NOTE: joblib.load unpickles arbitrary objects; only point
            # models_dir at files this script (or another trusted source) wrote.
            clf_pipeline = joblib.load(clf_path)
            reg_pipeline = joblib.load(reg_path)
        else:
            print(f"--- Training new models for critic: {critic_id} ---")

            # Train Classifier
            # Each pipeline gets its OWN preprocessor. Sharing one instance lets the
            # regressor's fit() refit the classifier's preprocessor on a different
            # subset of rows, leaving the in-memory classifier with mismatched features.
            X_class, y_class = critic_data, critic_data['will_skip']
            clf_pipeline = Pipeline(steps=[
                ('preprocessor', build_feature_pipeline()),
                ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')),
            ])
            clf_pipeline.fit(X_class, y_class)
            joblib.dump(clf_pipeline, clf_path)
            print(f"  -> Saved classifier to {clf_path}")

            # Train Regressor
            rated_games = critic_data.dropna(subset=['score_numeric'])
            if len(rated_games) < MIN_RATINGS_PER_CRITIC / 2:
                print(f"  -> Not enough rated games to train regressor. Skipping.")
                continue
            X_reg, y_reg = rated_games, rated_games['score_numeric']
            reg_pipeline = Pipeline(steps=[
                ('preprocessor', build_feature_pipeline()),
                ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
            ])
            reg_pipeline.fit(X_reg, y_reg)
            joblib.dump(reg_pipeline, reg_path)
            print(f"  -> Saved regressor to {reg_path}")

        # --- Generate Predictions for ALL games ---
        skip_probs = _skip_probabilities(clf_pipeline, all_games_features)
        pred_scores = reg_pipeline.predict(all_games_features)

        predictions_df = all_games_features[['id', 'name']].copy()
        predictions_df['critic_id'] = critic_id
        predictions_df['predicted_skip_probability'] = skip_probs
        predictions_df['predicted_score'] = pred_scores
        all_predictions.append(predictions_df)

    # --- Save Results ---
    if all_predictions:
        pd.concat(all_predictions, ignore_index=True).to_csv(
            os.path.join(args.results_dir, 'critic_predictions.csv'), index=False
        )
        print(f"\n✅ Predictions saved to '{args.results_dir}/critic_predictions.csv'")
    else:
        print("No predictions were generated; no output file was written.")

# Command-line entry point: collect the directory options and the retrain flag,
# then hand them to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train critic models and generate game predictions.")
    # Every option accepts both the underscore and the hyphen spelling so the
    # flags are consistent with one another; the original spellings still work.
    parser.add_argument('--data_dir', '--data-dir', dest='data_dir', type=str, default='data', help='Directory containing input CSV files.')
    parser.add_argument('--models_dir', '--models-dir', dest='models_dir', type=str, default='saved_models', help='Directory to save/load trained models.')
    parser.add_argument('--results_dir', '--results-dir', dest='results_dir', type=str, default='results', help='Directory to save output predictions.')
    parser.add_argument('--force-retrain', '--force_retrain', dest='force_retrain', action='store_true', help='Force retraining of all models, even if they exist.')

    # parse_known_args() is used instead of parse_args() because this code lives in
    # a notebook cell: the Jupyter kernel adds its own arguments to sys.argv, and
    # parse_args() would exit on them. Anything unrecognized is reported, not fatal.
    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        print(f"Warning: ignoring unrecognized arguments: {unknown_args}")
    main(args)