In [None]:
import pandas as pd
from io import StringIO
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import NuSVR
import pickle
from datasets import load_dataset
# Load the "wykonos/steam_games" dataset through the `datasets` library.
# NOTE(review): presumably this downloads from the Hugging Face Hub on first run — confirm network access.
hf_data = load_dataset("wykonos/steam_games")

In [2]:
# Take the 'train' split and export it to 'games_dataset.csv', which a later
# cell reads back with pandas.
train_data = hf_data['train']
train_data.to_csv('games_dataset.csv')

Creating CSV from Arrow format:   0%|          | 0/64 [00:00<?, ?ba/s]

12060816

In [3]:
def predict_names_with_scores(included_column_names, one_hot_df, model, train_with_info):
    """Print the best-scoring games that are not already in the labelled set.

    Parameters
    ----------
    included_column_names : list of str
        Column names used for training. Must contain 'name' and 'y'; every
        other entry is treated as a feature column. The list is NOT modified.
    one_hot_df : pandas.DataFrame
        Frame holding a 'name' column plus all feature columns.
    model : object
        Fitted estimator exposing ``predict(X)``.
    train_with_info : pandas.DataFrame
        Labelled rows; games whose 'name' appears here are not printed.

    Returns
    -------
    None
        Results are printed as ``"<name> - <score>"``, best score first. Only
        the 100 highest predictions are considered, so fewer than 100 lines
        are printed when some of them belong to the labelled set.
    """
    if 'y' not in included_column_names:
        print("'y' not found in included_column_names list")
        return

    # Copy before removing the target column: mutating the caller's list would
    # make a second call with the same list fail the 'y' check above.
    feature_columns = list(included_column_names)
    feature_columns.remove('y')

    one_hot_df_trimmed = one_hot_df[feature_columns]
    test_names = one_hot_df_trimmed['name']
    test_X = one_hot_df_trimmed.drop('name', axis=1)
    predictions = model.predict(test_X)

    # Highest predicted score first; sorted() is stable, so ties keep row order.
    names_with_preds = sorted(zip(test_names, predictions),
                              key=lambda pair: pair[1], reverse=True)

    # Looked up once instead of on every loop iteration.
    known_names = train_with_info['name'].values
    for name, score in names_with_preds[:100]:
        if name not in known_names:
            print(f'{name} - {score}')

In [4]:
# Allow up to 500 rows when a frame is displayed.
pd.set_option('display.max_rows', 500)

# Read the exported CSV, discard the 'metacritic_rating' column and turn the
# '|'-separated 'tags' string into a list of tags per row.
dataset = (
    pd.read_csv('games_dataset.csv')
    .drop(columns='metacritic_rating')
    .assign(tags=lambda frame: frame['tags'].str.split('|'))
)

In [5]:
# One-hot encode the per-row tag lists: one indicator column per distinct tag.
mlb = MultiLabelBinarizer()
tag_indicators = pd.DataFrame(mlb.fit_transform(dataset['tags']),
                              columns=mlb.classes_,
                              index=dataset.index)

# Build the result from a copy without 'tags' instead of popping the column
# from `dataset`: popping mutates `dataset`, so re-running this cell would
# raise a KeyError.
one_hot_df = dataset.drop(columns='tags').join(tag_indicators)

In [6]:
# Missing reviewer ratings count as 0; the rating is then divided by 10.
# Assign the result back rather than calling fillna(inplace=True) on a column
# selection: that chained in-place form warns on recent pandas and does not
# update the frame under Copy-on-Write.
# NOTE: re-running this cell divides the rating by 10 again.
one_hot_df['reviewer_rating'] = one_hot_df['reviewer_rating'].fillna(0) / 10

columns_to_scale = ['year', 'positivity_ratio', 'to_beat_main', 'to_beat_extra',
                    'to_beat_completionist', 'extra_content_length']
for column in columns_to_scale:
    # 'positivity_ratio' gaps become 0; every other column uses its own mean.
    if column == 'positivity_ratio':
        fill_value = 0
    else:
        fill_value = one_hot_df[column].mean()
    one_hot_df[column] = one_hot_df[column].fillna(fill_value)
    # Rescale each column independently with its own MinMaxScaler.
    one_hot_df[column] = MinMaxScaler().fit_transform(one_hot_df[[column]])

In [7]:
# Hand-labelled games in "name;y" form; `y` is the target value the model is
# fitted on further down.
SAMPLE_TEST_SET = """
name;y
Adam Wolfe;7
Alan Wake;8
Alpha Protocol™;9
Assassin's Creed 2;7
Assassin's Creed™: Director's Cut Edition;7
Back to the Future: Ep 2 - Get Tannen!;8
Back to the Future: Ep 3 - Citizen Brown;8
Bastion;9
Batman: Arkham Asylum Game of the Year Edition;7
Batman: Arkham City - Game of the Year Edition;7
Batman: Arkham City;7
BioShock™ Infinite;7
Black Mesa;7
Borderlands 2;7
"""

# Parse the semicolon-separated text into a two-column frame.
labelled_text = StringIO(SAMPLE_TEST_SET)
train_raw = pd.read_csv(labelled_text, sep=';')

# Inner-join on the game name to attach the feature columns to each label;
# the 'id' column is dropped from the result.
train_with_info = one_hot_df.merge(train_raw, on='name').drop(columns='id')

In [8]:
# Columns that are always kept: the identifier, the numeric features and the
# target 'y' (last).
fixed_column_names = ['name', 'year', 'reviewer_rating', 'positivity_ratio',
                      'to_beat_main', 'to_beat_extra', 'to_beat_completionist',
                      'extra_content_length', 'y']
feature_cutoff = 0

# Per-column totals over the labelled rows for every non-fixed column.
# NOTE(review): `[1:]` discards the first remaining column's total — confirm
# which column that is meant to exclude.
column_stats = train_with_info.drop(fixed_column_names, axis=1).sum(axis=0)[1:].sort_values()

# Raise the cutoff until the total number of columns is smaller than the
# number of labelled rows.
while feature_cutoff < len(train_with_info):
    good_columns = column_stats[column_stats >= feature_cutoff]
    if len(good_columns) + len(fixed_column_names) < len(train_with_info):
        included_column_names = list(good_columns.index)
        break
    feature_cutoff += 1
else:
    # Without this branch the cell would fail further down with a NameError
    # on `included_column_names` (e.g. when the merge matched too few rows).
    raise ValueError(
        f"No feature cutoff below {len(train_with_info)} leaves fewer columns "
        f"than labelled rows ({len(train_with_info)}); add more labelled games."
    )

# Final column order: fixed columns (without 'y'), selected columns, then 'y'.
leading_columns = [column for column in fixed_column_names if column != 'y']
included_column_names = leading_columns + included_column_names + ['y']
train_with_good_features = train_with_info[included_column_names]

In [9]:
# Separate the identifier and the target from the feature matrix.
names = train_with_good_features['name']
y = train_with_good_features['y']
X = train_with_good_features.drop(columns=['name', 'y'])

In [10]:
# Fit a degree-6 polynomial-kernel NuSVR on the full labelled set.
model = NuSVR(degree=6, kernel='poly').fit(X, y)

# No `scoring` is passed, so each fold is scored with the estimator's own
# default score. NOTE(review): for a regressor that is presumably R^2 rather
# than a classification accuracy, despite the label printed below — confirm.
accuracies = cross_val_score(model, X, y)
print("Cross Validation Accuracy:", sum(accuracies) / len(accuracies))

Cross Validation Accuracy: -0.7802676585511182


In [11]:
# Persist the fitted model; the context manager ensures the file is flushed
# and closed even if pickling fails.
with open('game_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [12]:
# Reload the pickled model. NOTE: unpickling can execute arbitrary code, so
# only load 'game_model.pkl' when it is a file you wrote yourself.
with open('game_model.pkl', 'rb') as model_file:
    game_model = pickle.load(model_file)

# Pass a copy of the column list so this cell can be re-run even if the callee
# modifies the list it receives.
predict_names_with_scores(list(included_column_names), one_hot_df, game_model, train_with_info)

Vertigo 2 - 8.220637687565002
Resident Evil 4 - 8.217496074596419
Touhou Hero of Ice Fairy - 8.216948632830395
Hi-Fi RUSH - 8.216556942661313
Pizza Tower - 8.211803043107487
Ember Knights - 8.199379083754081
Slime Rancher 2 - 8.199256320530814
PAC-MAN WORLD Re-PAC - 8.198757462686139
Nightmare of Decay - 8.198591765914102
Flatworld - 8.197771681012128
Supraland Six Inches Under - 8.197310572425096
Klonoa Phantasy Reverie Series - 8.196850556502675
Dungeon Munchies - 8.196539023228283
Bugsnax - 8.194938283335588
Will You Snail? - 8.194567039540722
Neon White - 8.191415589866764
The Upturned - 8.178306394679616
Paint the Town Red - 8.176170186649244
Touhou Endless Dream - 8.176084971969232
Everhood - 8.175910581202473
Ancient Dungeon - 8.175367395605377
ElecHead - 8.17306561225081
Rain on Your Parade - 8.172821631303151
Tree Simulator 2022 - 8.172603382356101
Psychonauts 2 - 8.172046669529898
Totally Accurate Battle Simulator - 8.171261304039234
Rhythm Doctor - 8.169212158547886
Elderand