In [2]:
import os
import sys
import json
import time
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [3]:
sys.path.append("../src/")

In [4]:
from utilities import *

In [5]:
pd.set_option("display.max_columns", None)

os.environ["PYTHONWARNINGS"] = "ignore"

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
filepath = os.path.join(dir_dict["train_test"], f"train.parquet")
df = pd.read_parquet(filepath)

to_drop = ["event_date", "fight_id", "fighter_id", "opponent_id"]
df = df.drop(to_drop, axis=1)

In [8]:
target = "fight_fighter_win"
X, y = df.drop(target, axis=1), df[target]

In [9]:
def cross_validate(estimator, X=X, y=y):
    return np.mean(cross_val_score(estimator, X, y, cv=5, n_jobs=5))

### Initial Model

In [10]:
from sklearn.linear_model import LogisticRegression

In [10]:
lr = LogisticRegression(random_state=42)

In [11]:
cross_validate(lr)

0.5762390670553936

### Scale Data

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [13]:
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("lr", LogisticRegression(max_iter=500, random_state=42))
])
cross_validate(pipe)

0.6077259475218659

### Model Selection (Hyperparameter Tuning & Feature Selection)

In [12]:
from sklearn.feature_selection import SelectKBest, f_regression

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
sys.path.append("../misc/")

In [15]:
from clf_param_grid_list_generator import clfs

In [None]:
search_results = dict()
for name, est_dict in tqdm(clfs.items()):
    pipe = Pipeline([
        ("scaler", MinMaxScaler()),
        ("selector", SelectKBest(f_regression)),
        ("model", est_dict["model"])
    ])
    
    param_grid = {f"model__{k}":v for k, v in est_dict["param_grid"].items()}
    param_grid["selector__k"] = [i*10 for i in range(20,57)]
    
    search = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=500, 
                                cv=3, n_jobs=int(cpu_count() / 2), scoring ="accuracy", verbose=5)
    
    search.fit(X, y)
    search_results[name] = {
        "best_params": search.best_params_,
        "best_score": search.best_score_
    }
    
    with open("../assets/search_results.json", "w") as fh:
        json.dump(search_results, fh)
        
    time.sleep(120)

  0%|                                                                                                                                                                                         | 0/10 [00:00<?, ?it/s]

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [39]:
with open("../assets/search_results.json", "r") as fh:
    search_results = json.load(fh)

In [40]:
name, best_result = max(search_results.items(), key=lambda x: x[1]["best_score"])
name, best_result

('LogisticRegression',
 {'best_params': {'selector__k': 360,
   'model__solver': 'liblinear',
   'model__max_iter': 1000,
   'model__C': 1},
  'best_score': 0.6109297826621695})

In [41]:
best_params = best_result["best_params"]

In [32]:
selector = SelectKBest(f_regression, best_params.pop("selector__k"))

TypeError: __init__() takes from 1 to 2 positional arguments but 3 were given

In [28]:
clfs[name]["model"].set_params(**best_result["best_params"])

ValueError: Invalid parameter 'selector' for estimator LogisticRegression(max_iter=500, random_state=42). Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'].