In [25]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer

# Load the new CSV file
df = pd.read_csv("SelectedGamesAdjustedData.csv")

print("New Data loaded successfully.")
print(f"Shape: {df.shape}")

New Data loaded successfully.
Shape: (304, 7)


In [26]:
# --- 1. Base Price Feature Engineering (Using the highly important 'Title' column) ---

# Calculate the median price for each game title when it is NOT on discount.
# This serves as the 'base retail price' of the game.
base_price_map = df[df['Is on Discount (1/0)'] == 0].groupby('Title')['Price (USD)'].median()

# Fallback: If a game only has discounted entries, use the overall median price of the game.
# This prevents NaN values.
base_price_map = base_price_map.combine_first(df.groupby('Title')['Price (USD)'].median())

# Map the calculated base price back to the main DataFrame
df['Base_Price'] = df['Title'].map(base_price_map)


# --- 2. Genre Feature Engineering ---

# Function to clean and parse the string representation of a list
def parse_list_string(list_string):
    if pd.isna(list_string):
        return []
    matches = re.findall(r"'(.*?)'", list_string)
    return [match.strip() for match in matches if match.strip()]

# Apply the parsing function
df['Genres'] = df['Genres'].apply(parse_list_string)

# Use MultiLabelBinarizer for the Genres column
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genres'])
genre_df = pd.DataFrame(genre_matrix, columns=[f'Genre_{c}' for c in mlb.classes_])

# Merge the new genre features and Base_Price, and drop the originals
df_processed = pd.concat([df.drop(columns=['Genres', 'Title']).reset_index(drop=True), genre_df], axis=1)

print("Feature engineering complete. Base_Price added and Genres binarized.")
print(df_processed[['Base_Price', 'Price (USD)']].head())

Feature engineering complete. Base_Price added and Genres binarized.
   Base_Price  Price (USD)
0      20.615        24.99
1      20.615        10.00
2      20.615        14.99
3      20.615         2.50
4      20.615        18.74


In [27]:
# 1. Define Target Variable (y)
y = df_processed['Price (USD)']

# 2. Define Features (X) - All columns except the target
X = df_processed.drop(columns=['Price (USD)'])

# 3. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"Features used (including Base_Price and Genres): {X_train.columns.tolist()}")

X_train shape: (243, 10)
Features used (including Base_Price and Genres): ['Has DLC (1/0)', 'Is F2P (1/0)', 'Is on Game Pass (1/0)', 'Is on Discount (1/0)', 'Base_Price', 'Genre_Adventure', 'Genre_Brawler', 'Genre_Platform', 'Genre_RPG', 'Genre_Strategy']


In [28]:
# Create the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Create a simple pipeline
pipeline = Pipeline(steps=[
    ('regressor', rf_model)
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the initial model
y_pred_initial = pipeline.predict(X_test)
mae_initial = mean_absolute_error(y_test, y_pred_initial)
r2_initial = r2_score(y_test, y_pred_initial)

print(f"--- Initial Model Evaluation with Base_Price Feature ---")
print(f"Mean Absolute Error (MAE): ${mae_initial:.2f}")
print(f"R-squared (R2) Score: {r2_initial:.4f}")

--- Initial Model Evaluation with Base_Price Feature ---
Mean Absolute Error (MAE): $1.28
R-squared (R2) Score: 0.9914


In [29]:
# Define the parameter grid to search
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [5, 10, 15, None],
    'regressor__min_samples_split': [2, 5]
}

# Use MAE as the scoring metric (we want to minimize it)
# We use 'neg_mean_absolute_error' because GridSearchCV always maximizes the score.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Setup GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross-validation
    scoring=mae_scorer,
    n_jobs=-1,
    verbose=1
)

print("Starting Hyperparameter Tuning...")

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# The best estimator found by the search
best_pipeline = grid_search.best_estimator_

print("\n--- Tuning Complete ---")
print(f"Best Parameters found: {grid_search.best_params_}")

Starting Hyperparameter Tuning...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

--- Tuning Complete ---
Best Parameters found: {'regressor__max_depth': None, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}


In [30]:
# Make predictions using the best tuned model
y_pred_tuned = best_pipeline.predict(X_test)

# Evaluate the final model
mae_final = mean_absolute_error(y_test, y_pred_tuned)
r2_final = r2_score(y_test, y_pred_tuned)

print(f"--- Final Tuned Model Evaluation ---")
print(f"Mean Absolute Error (MAE): ${mae_final:.2f}")
print(f"R-squared (R2) Score: {r2_final:.4f}")

# Check against the targets
if mae_final < 4 and r2_final > 0.8:
    print("\n✅ SUCCESS: Model meets both target criteria!")
else:
    print("\n❌ NOTE: Model did not meet both target criteria. Consider adding more base game data or exploring Gradient Boosting models.")

--- Final Tuned Model Evaluation ---
Mean Absolute Error (MAE): $1.28
R-squared (R2) Score: 0.9911

✅ SUCCESS: Model meets both target criteria!
