In [32]:
# !rm -rf ./saved_models/
!rm saved_models.zip

In [34]:
# -*- coding: utf-8 -*-
"""create_predictions.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Ocims907l3w9v02V_tbql8-hkNYfC5Fo
"""

import pandas as pd
import numpy as np
import json
import os
import joblib
import ast
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# --- Configuration for Local Environment ---
# Directory that run_process() reads 'games_details.csv' and 'ratings.csv' from.
DATA_DIR = 'data'
# Directory where the fitted preprocessor and the per-critic models are dumped.
MODELS_DIR = 'saved_models'
# Directory where the prediction / importance / best-tree CSV files are written.
RESULTS_DIR = 'results'

# Set to True if you want to rebuild all models from scratch
# (forwarded to run_process() as its ``force_retrain`` argument).
FORCE_RETRAIN = False

# Minimum ratings required to build a model for a critic
# (run_process() skips critics with fewer merged rating rows than this).
MIN_RATINGS_PER_CRITIC = 5

import ast
import json

def parse_tags(data_str):
    """Turn a serialized list of tags into one space-separated token string.

    The input may be a Python/JSON list literal (``"['Open World', 'Co-op']"``)
    or the same list encoded a second time as a quoted string. Spaces and
    hyphens inside each tag are replaced with underscores so that a
    multi-word tag survives whitespace tokenization as a single token.

    Args:
        data_str: The raw cell value. Anything that is not a ``str``
            (e.g. ``NaN`` or ``None``) is treated as "no tags".

    Returns:
        The processed tags joined by single spaces, or ``''`` when the input
        is not a string, cannot be parsed, or does not decode to a list.
        Numeric tags are converted with ``str``; elements of any other
        non-string type are skipped.
    """
    if not isinstance(data_str, str):
        return ''

    try:
        # First, try to evaluate the string as a Python literal.
        parsed_data = ast.literal_eval(data_str)

        # If the first parse still yields a string, the list was
        # double-encoded, so decode the inner payload as JSON.
        if isinstance(parsed_data, str):
            parsed_data = json.loads(parsed_data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # json.JSONDecodeError is a ValueError; MemoryError/RecursionError
        # can be raised by ast.literal_eval on pathological input.
        return ''

    if not isinstance(parsed_data, list):
        return ''

    processed_tags = []
    for tag in parsed_data:
        if isinstance(tag, bool):
            continue
        if isinstance(tag, (int, float)):
            # A bare numeric tag (e.g. 1980) must not crash on .replace().
            tag = str(tag)
        if not isinstance(tag, str):
            # None, nested containers, etc. carry no usable tag text.
            continue
        processed_tags.append(tag.replace(' ', '_').replace('-', '_'))
    return ' '.join(processed_tags)

def preprocess_data(games_df, ratings_df):
    """Merge ratings with game details and derive the modelling columns.

    Args:
        games_df: Game details. Must contain ``game_id``, ``user_tags``,
            ``developer_genres``, ``developers``, ``publishers``,
            ``metacritic_score``, ``price_usd`` and ``release_date``.
        ratings_df: One row per rating. Must contain ``game_id`` and
            ``score``; an ``id`` column, if present, is renamed to
            ``rating_id`` so it cannot be confused with a game identifier.

    Returns:
        A new DataFrame with one row per rating (left join on ``game_id``)
        in which:

        * ``will_skip`` is True when ``score`` is missing or equals the
          string ``'skipped'``;
        * ``score_numeric`` is ``score`` coerced to a number (NaN otherwise);
        * missing text features are ``''``, ``user_tags`` has been run
          through ``parse_tags`` and commas in the other text features are
          replaced with spaces;
        * ``metacritic_score`` / ``price_usd`` are numeric and
          ``release_year`` is the year parsed from ``release_date``.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    ratings_df = ratings_df.rename(columns={'id': 'rating_id'})

    # Keep one row per game before joining: a duplicated game_id in the
    # details table would otherwise duplicate every rating of that game
    # and give it extra weight in the per-critic training data.
    unique_games = games_df.drop_duplicates(subset=['game_id'])
    merged_df = pd.merge(ratings_df, unique_games, on='game_id', how='left')

    merged_df['will_skip'] = merged_df['score'].isnull() | (merged_df['score'] == 'skipped')
    merged_df['score_numeric'] = pd.to_numeric(merged_df['score'], errors='coerce')

    # Ratings whose game is missing from the details table get NaN after the
    # left join; the text vectorizers need strings, so blank those out.
    text_cols = ['user_tags', 'developer_genres', 'developers', 'publishers']
    for col in text_cols:
        merged_df[col] = merged_df[col].fillna('')

    merged_df['user_tags'] = merged_df['user_tags'].apply(parse_tags)

    # Comma-separated lists become whitespace-separated tokens.
    for col in ['developer_genres', 'developers', 'publishers']:
        merged_df[col] = merged_df[col].str.replace(',', ' ')

    merged_df['metacritic_score'] = pd.to_numeric(merged_df['metacritic_score'], errors='coerce')
    merged_df['price_usd'] = pd.to_numeric(merged_df['price_usd'], errors='coerce')
    merged_df['release_year'] = pd.to_datetime(
        merged_df['release_date'], errors='coerce', format='mixed'
    ).dt.year

    return merged_df

def build_feature_pipeline():
    """Build the (unfitted) ColumnTransformer that turns game columns into features.

    Numeric columns are median-imputed and standardized; each text column gets
    its own TF-IDF vectorizer. Columns not listed here are dropped.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_columns = ['metacritic_score', 'price_usd', 'release_year']
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # (transformer name, source column, TfidfVectorizer keyword arguments).
    # No ngram_range is passed, so every vectorizer uses the library default.
    text_specs = [
        ('tags', 'user_tags', {'stop_words': 'english', 'max_features': 100, 'min_df': 1}),
        ('genres', 'developer_genres', {'max_features': 50, 'min_df': 1}),
        ('devs', 'developers', {'max_features': 50, 'min_df': 1}),
        ('pubs', 'publishers', {'max_features': 50, 'min_df': 1}),
    ]

    column_steps = [('num', Pipeline(steps=numeric_steps), numeric_columns)]
    for step_name, column, vectorizer_options in text_specs:
        column_steps.append((step_name, TfidfVectorizer(**vectorizer_options), column))

    return ColumnTransformer(transformers=column_steps, remainder='drop')

# --- NEW FUNCTION ---
def validate_schema(ratings_df, games_df):
    """Check that both dataframes carry the columns the pipeline needs.

    Besides the ``game_id`` / ``appid`` migration checks, this also verifies
    the columns that the rest of this file reads unconditionally, so a
    missing column is reported up front instead of surfacing later as a
    ``KeyError``.

    Args:
        ratings_df: Contents of ``ratings.csv``.
        games_df: Contents of ``games_details.csv``.

    Returns:
        True when the schema is valid. False otherwise, after printing one
        line per problem found; the caller is expected to stop.
    """
    print("Validating data schema...")
    ratings_cols = set(ratings_df.columns)
    games_cols = set(games_df.columns)

    errors = []

    # Check 1: ratings.csv needs 'game_id'
    if 'game_id' not in ratings_cols:
        errors.append("ERROR: 'ratings.csv' is missing the required 'game_id' column.")

    # Check 2: games_details.csv needs 'game_id'
    if 'game_id' not in games_cols:
        errors.append("ERROR: 'games_details.csv' is missing the required 'game_id' column.")

    # Check 3: games_details.csv needs 'appid'
    if 'appid' not in games_cols:
        errors.append("ERROR: 'games_details.csv' is missing the required 'appid' column (which should be the renamed 'id' column).")

    # Check 4: games_details.csv should NOT have 'id'
    if 'id' in games_cols:
        errors.append("ERROR: 'games_details.csv' still contains an 'id' column. Please rename it to 'appid' and ensure the new 'game_id' column is present.")

    # Check 5: columns read without any existence guard when grouping by
    # critic and deriving the skip / score targets.
    for column in ('critic_id', 'score'):
        if column not in ratings_cols:
            errors.append(f"ERROR: 'ratings.csv' is missing the required '{column}' column.")

    # Check 6: columns used as model features or copied into the output.
    required_game_columns = (
        'name', 'user_tags', 'developer_genres', 'developers', 'publishers',
        'metacritic_score', 'price_usd', 'release_date',
    )
    for column in required_game_columns:
        if column not in games_cols:
            errors.append(f"ERROR: 'games_details.csv' is missing the required '{column}' column.")

    if errors:
        print("\n--- 🚨 SCHEMA VALIDATION FAILED 🚨 ---")
        for error in errors:
            print(f"- {error}")
        print("\nPlease correct your CSV files based on the errors above and run the script again.")
        print("Reminder: 'games_details.csv' must now have 'game_id' (to match ratings.csv) and 'appid' (the old id).")
        return False  # Indicate failure

    print("✅ Schema validation passed.")
    return True  # Indicate success
# --- END NEW FUNCTION ---


def _prepare_game_features(games_df):
    """Return one cleaned row per ``game_id`` with the columns the preprocessor reads."""
    games = games_df.drop_duplicates(subset=['game_id']).copy()

    for col in ['user_tags', 'developer_genres', 'developers', 'publishers']:
        games[col] = games[col].fillna('')
    games['user_tags'] = games['user_tags'].apply(parse_tags)
    for col in ['developer_genres', 'developers', 'publishers']:
        games[col] = games[col].str.replace(',', ' ')

    games['metacritic_score'] = pd.to_numeric(games['metacritic_score'], errors='coerce')
    games['price_usd'] = pd.to_numeric(games['price_usd'], errors='coerce')
    games['release_year'] = pd.to_datetime(
        games['release_date'], errors='coerce', format='mixed'
    ).dt.year
    return games


def _load_cached_model(path, n_features):
    """Return the estimator stored at *path* if it expects *n_features* inputs, else None."""
    if not os.path.exists(path):
        return None
    try:
        model = joblib.load(path)
    except Exception as e:
        # An unreadable file is not fatal: the caller simply retrains.
        print(f"Warning: Could not load saved model '{path}'. Error: {e}")
        return None
    # A model fitted against a different preprocessor cannot be reused.
    if getattr(model, 'n_features_in_', None) != n_features:
        return None
    return model


def _top_importances(model, feature_names, critic_id, model_type, top_n=5):
    """Return the *top_n* most important features of *model*, labelled for one critic."""
    importances = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values(by='importance', ascending=False).head(top_n)
    importances['critic_id'] = critic_id
    importances['model_type'] = model_type
    return importances


def _most_representative_tree(clf, features):
    """Return the index of the tree that agrees most often with the whole forest (-1 if none)."""
    forest_predictions = clf.predict(features)
    best_tree_index = -1
    max_accuracy = -1.0
    for i, tree_estimator in enumerate(clf.estimators_):
        # The forest's sub-estimators predict encoded class indices, so map
        # them back to the original labels before comparing with the forest.
        tree_predictions = clf.classes_.take(tree_estimator.predict(features).astype(int))
        accuracy = (forest_predictions == tree_predictions).mean()
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_tree_index = i
    return best_tree_index


def _skip_probabilities(clf, features):
    """Return P(will_skip is True) for every row, even if *clf* saw only one class."""
    probabilities = clf.predict_proba(features)
    # predict_proba has one column per entry of classes_; when the critic
    # never (or always) skipped there is a single column, so look the True
    # class up instead of assuming it sits at index 1.
    true_columns = np.flatnonzero(np.asarray(clf.classes_, dtype=bool))
    if true_columns.size == 0:
        return np.zeros(probabilities.shape[0])
    return probabilities[:, true_columns[0]]


def run_process(data_dir, models_dir, results_dir, force_retrain):
    """Run the whole pipeline: load, validate, fit (or reuse) models, predict, save.

    Args:
        data_dir: Directory containing ``games_details.csv`` and ``ratings.csv``.
        models_dir: Directory for ``preprocessor.joblib`` and the per-critic
            ``<critic_id>_classifier.joblib`` / ``<critic_id>_regressor.joblib``
            files. Created if missing.
        results_dir: Directory for ``critic_predictions.csv``,
            ``critic_feature_importances.csv`` and ``critic_best_trees.csv``.
            Created if missing.
        force_retrain: When True, the preprocessor and every model are rebuilt
            and overwritten. When False, a preprocessor already saved in
            ``models_dir`` is loaded, and a critic's saved models are reused
            if they match it; anything missing is trained and saved.

    Returns:
        None. Returns early (after printing the reason) when a CSV file is
        missing or schema validation fails.
    """
    print("Loading data...")
    games_path = os.path.join(data_dir, 'games_details.csv')
    ratings_path = os.path.join(data_dir, 'ratings.csv')
    try:
        games_df = pd.read_csv(games_path)
        ratings_df = pd.read_csv(ratings_path)
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure your CSV files are in the '{data_dir}' directory.")
        return

    # Run the schema check. If it fails, stop the script.
    if not validate_schema(ratings_df, games_df):
        return

    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    print("Preprocessing data...")
    data = preprocess_data(games_df.copy(), ratings_df.copy())

    # One cleaned row per game, keyed by 'game_id'. Used both to fit the
    # preprocessor and as the catalogue every critic is scored against.
    game_features = _prepare_game_features(games_df)

    preprocessor_path = os.path.join(models_dir, 'preprocessor.joblib')
    preprocessor = None
    if not force_retrain and os.path.exists(preprocessor_path):
        try:
            preprocessor = joblib.load(preprocessor_path)
            print("Loaded saved preprocessor (set force_retrain=True to rebuild).")
        except Exception as e:
            print(f"Warning: Could not load saved preprocessor. Error: {e}")

    # Saved critic models are only meaningful together with the preprocessor
    # they were trained against, so a fresh fit forces fresh models as well.
    reuse_models = preprocessor is not None
    if preprocessor is None:
        print("Fitting the feature preprocessor on all games...")
        preprocessor = build_feature_pipeline()
        preprocessor.fit(game_features)
        joblib.dump(preprocessor, preprocessor_path)
        print("Preprocessor fitted and saved.")

    try:
        feature_names = preprocessor.get_feature_names_out()
    except Exception as e:
        print(f"Warning: Could not get feature names from the preprocessor. Error: {e}")
        feature_names = None

    # Loop-invariant: the catalogue and its feature matrix are identical for
    # every critic, so build them once instead of once per critic.
    X_all_games_transformed = preprocessor.transform(game_features)
    n_features = X_all_games_transformed.shape[1]
    prediction_template = game_features[['game_id', 'name']]

    critics = data['critic_id'].unique()
    all_predictions = []
    all_importances = []
    all_best_trees = []

    print(f"\nFound {len(critics)} critics. Starting model processing loop...")
    for critic_id in critics:
        critic_data = data[data['critic_id'] == critic_id]

        if len(critic_data) < MIN_RATINGS_PER_CRITIC:
            print(f"Skipping critic {critic_id}: not enough ratings.")
            continue

        clf_path = os.path.join(models_dir, f'{critic_id}_classifier.joblib')
        reg_path = os.path.join(models_dir, f'{critic_id}_regressor.joblib')

        X_class_transformed = preprocessor.transform(critic_data)

        # The classifier and regressor are treated as a pair: the regressor
        # is only reused when the classifier from the same run is reused.
        clf = None
        reg = None
        if reuse_models:
            clf = _load_cached_model(clf_path, n_features)
            if clf is not None:
                reg = _load_cached_model(reg_path, n_features)

        if clf is not None:
            print(f"--- Reusing saved models for critic: {critic_id} ---")
        else:
            print(f"--- Training models for critic: {critic_id} ---")
            clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
            clf.fit(X_class_transformed, critic_data['will_skip'])
            joblib.dump(clf, clf_path)

            rated_games = critic_data.dropna(subset=['score_numeric'])
            if len(rated_games) >= MIN_RATINGS_PER_CRITIC / 2:
                reg = RandomForestRegressor(n_estimators=100, random_state=42)
                reg.fit(preprocessor.transform(rated_games), rated_games['score_numeric'])
                joblib.dump(reg, reg_path)
            elif os.path.exists(reg_path):
                # Do not leave a regressor from an older run next to the
                # classifier that was just retrained.
                os.remove(reg_path)

        if feature_names is not None:
            all_importances.append(
                _top_importances(clf, feature_names, critic_id, 'skip_prediction'))
            # Explicit None check: the truthiness of an ensemble estimator
            # depends on its __len__, not on whether it exists.
            if reg is not None:
                all_importances.append(
                    _top_importances(reg, feature_names, critic_id, 'score_prediction'))

        # Find and store the index of the most representative tree.
        try:
            best_tree_index = _most_representative_tree(clf, X_class_transformed)
            if best_tree_index != -1:
                all_best_trees.append({
                    'critic_id': critic_id,
                    'best_tree_index': best_tree_index
                })
        except Exception as e:
            print(f"Warning: Could not determine best tree for critic {critic_id}. Error: {e}")

        # Score every game in the catalogue for this critic.
        predictions_df = prediction_template.copy()
        predictions_df['critic_id'] = critic_id
        predictions_df['predicted_skip_probability'] = _skip_probabilities(clf, X_all_games_transformed)
        predictions_df['predicted_score'] = (
            reg.predict(X_all_games_transformed) if reg is not None else np.nan
        )
        all_predictions.append(predictions_df)

    if all_predictions:
        final_pred_df = pd.concat(all_predictions, ignore_index=True)
        pred_output_path = os.path.join(results_dir, 'critic_predictions.csv')
        final_pred_df.to_csv(pred_output_path, index=False)
        print(f"\n✅ Predictions saved to '{pred_output_path}'")
    else:
        print("\nNo predictions were generated.")

    if all_importances:
        final_imp_df = pd.concat(all_importances, ignore_index=True)
        imp_output_path = os.path.join(results_dir, 'critic_feature_importances.csv')
        final_imp_df.to_csv(imp_output_path, index=False)
        print(f"✅ Feature importances saved to '{imp_output_path}'")

    # Save the best tree indices to a CSV.
    if all_best_trees:
        final_tree_df = pd.DataFrame(all_best_trees)
        tree_output_path = os.path.join(results_dir, 'critic_best_trees.csv')
        final_tree_df.to_csv(tree_output_path, index=False)
        print(f"✅ Best tree indices saved to '{tree_output_path}'")

# Run the whole pipeline with the module-level configuration.
_run_options = {
    'data_dir': DATA_DIR,
    'models_dir': MODELS_DIR,
    'results_dir': RESULTS_DIR,
    'force_retrain': FORCE_RETRAIN,
}
run_process(**_run_options)

Loading data...
Validating data schema...
✅ Schema validation passed.
Preprocessing data...
Fitting the feature preprocessor on all games...
Preprocessor fitted and saved.

Found 10 critics. Starting model processing loop...
--- Training models for critic: 1 ---
--- Training models for critic: 2 ---
--- Training models for critic: 3 ---
--- Training models for critic: 4 ---
--- Training models for critic: 5 ---
--- Training models for critic: 6 ---
--- Training models for critic: 7 ---
--- Training models for critic: 8 ---
--- Training models for critic: 10 ---
--- Training models for critic: 9 ---

✅ Predictions saved to 'results/critic_predictions.csv'
✅ Feature importances saved to 'results/critic_feature_importances.csv'
✅ Best tree indices saved to 'results/critic_best_trees.csv'
