In [14]:
import json
import scipy as sp
import scipy.sparse
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
import pandas as pd
import numpy as np
import pickle

from numpy.random import default_rng


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.feature_selection import RFE
sns.set()
rng = default_rng()
%matplotlib inline

In [20]:
# hero_selection = sp.sparse.load_npz("../data/hero_selection.npz")
# match_winner = np.load("../data/match_winner.npy")
# X = sp.sparse.load_npz("../data/big1_game_features.npz")
# y = np.load("../data/big1_match_winners.npy")
X = sp.sparse.load_npz("../data/big2_7080_game_features.npz")[:,:7080]
y = np.load("../data/big2_7080_match_winners.npy")

# y = (y * 2) - 1
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True) 

print(X.shape, y.shape, f"y mean: {y.mean()}")

(2162047, 7080) (2162047,) y mean: 0.5001389886528831


In [21]:
clf = LogisticRegression(random_state=21, max_iter=200).fit(x_train, y_train)
y_h = clf.predict(x_test)
acc = accuracy_score(y_test, y_h)

print(clf.score(x_test, y_test))
print(mean_squared_error(y_test, y_h))
print(r2_score(y_test, y_h))
print(confusion_matrix(y_test, y_h))
y_test_prob = clf.predict_proba(x_test) 
m = y_test_prob.max(axis=1)

# antall matches som classifieren er trygg på.
# Hadde vi valgt ut kun disse gamesa, hadde vi nok hatt bedre acc
print((m.mean() > m).sum())

0.5787747739413982
0.4212252260586018
-0.6849011106070642
[[187616 136578]
 [136635 187786]]
372982


In [22]:
joblib.dump(clf, "../models/logreg_7080features.joblib")


['../models/logreg_7080features.joblib']

# feature selection

In [23]:
def create_feature_index():
    heroes_path = "../data/hid_to_rid_dict.json"

    with open(heroes_path, 'r') as fp:
        heroes = json.load(fp)
        heroes = {int(k): int(v) for k, v in heroes.items()} 
        
    feature_set = set()
    for hero_id in heroes.keys():
        feature_set.add(("hero", hero_id))

        for hero_opp in heroes.keys():
            if hero_id == hero_opp:
                continue

            key = [hero_id, hero_opp]
            key.sort()
            feature_set.add(("pair_opp", tuple(key)))

        for hero_same in heroes.keys():
            if hero_id == hero_same:
                continue

            key = [hero_id, hero_same]
            key.sort()

            feature_set.add(("pair_same", tuple(key)))

    print("#feature_set:", len(feature_set))
    features_to_index = {}
    index_to_features = {}
    for k, v in enumerate(feature_set):
        features_to_index[v] = k
        index_to_features[k] = v
    
    return features_to_index, index_to_features
    
features_to_index, index_to_features = create_feature_index()

def rules_to_training_example(rules=None, draft=None):
    t0 = set(sorted(draft[::2], key=lambda x: x))
    t1 = set(sorted(draft[1::2], key=lambda x: x))

    t0_opp_pairs = [{x, y} for x in t0 for y in t1]
    t1_opp_pairs = [{x, y} for x in t1 for y in t0]

    training_example = np.zeros(14161)

    for rule, rule_i in rules.items():
        r_type = rule[0]
        r_heroes = rule[1]
        if type(r_heroes) is tuple:
            r_heroes = set(r_heroes)

        if r_type == "pair_same":
            if r_heroes.issubset(t0):
                training_example[rule_i] = 1.
            if r_heroes.issubset(t1):
                training_example[rule_i] = -1.

        elif r_type == "pair_opp":
            if r_heroes in t0_opp_pairs:
                training_example[rule_i] = 1.
            if r_heroes in t1_opp_pairs:
                training_example[rule_i] = -1.

        elif r_type == "hero":
            if r_heroes in t0:
                training_example[rule_i] = 1.
            if r_heroes in t1:
                training_example[rule_i] = -1.
        else:
            raise Exception(f"No rule with type {r_type}")

    return training_example

#feature_set: 14161


In [24]:
# logreg = LogisticRegression(random_state=21, max_iter=200)
# rfe = RFE(logreg, n_features_to_select=None, step=0.1, verbose=1)
# rfe = rfe.fit(x_train, y_train)


In [25]:
rfe = joblib.load("../models/rfe_logreg_7080.joblib")
rfe_ranking = rfe.ranking_
rfe_support = rfe.support_
chosen_rules = np.where(rfe_ranking == 1)[0]

# joblib.dump(rfe, "../models/rfe_logreg_7080.joblib")


In [11]:
chosen_rules[:12]

array([ 0,  2,  3,  4,  6,  7,  8,  9, 11, 16, 18, 20])

In [12]:
rules = [index_to_features[x] for x in chosen_rules]

7080

In [7]:
# new_index_to_features = {k:v for (k,v) in index_to_features.items() if rfe_support[k]}
new_index_to_features = {}
new_i = 0

for k,v in index_to_features.items():
    if rfe_support[k]:
        new_index_to_features[new_i] = v
        new_i += 1

new_features_to_index = {v:k for (k,v) in new_index_to_features.items()}