# Determining variable correlation using IJS FASTENER
|- https://github.com/JozefStefanInstitute/FASTENER   
**The goal of this notebook is to find variables that are good for predicting `home_win` (whether the home team wins a game).**  
This notebook keeps the structure of the FASTENER project example; the FASTENER source folder (imported here as `fastener_src`) is also copied from FASTENER.    

The following steps are shown in the example:
* data preparation
* feature evaluation function `eval_fun` implementation, which is responsible for calculating information gain of a particular feature
* setting up the FASTENER (including basic description of the options)
* running the FASTENER loop
* reading the fastener results  
**Imports:**

In [1]:
# import preprocessing tools
from sklearn import preprocessing
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import plotly 
import plotly.graph_objects as go
from datetime import timedelta

sys.path.insert(0, "./../../")

from src.data_loader import add_season_rankings
from src.preprocessing.DataPreparator import DataPreparator

# FASTENER

# import learning/evaluation
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import cross_val_score

# typing
from typing import Dict, List, Callable, Any, Tuple, Optional, \
    Counter as CounterType, Set

# FASTENER specific imports
from fastener_src.random_utils import shuffle
from fastener_src import random_utils
from fastener_src.item import Item, EvalItem, Result, Population, flatten_population, FitnessFunction, \
    Genes, EvalItem, RandomFlipMutationStrategy, RandomEveryoneWithEveryone, \
    IntersectionMating, UnionMating, IntersectionMatingWithInformationGain, \
    IntersectionMatingWithWeightedRandomInformationGain, UnevaluatedPopulation, \
    MatingStrategy, MutationStrategy, MatingSelectionStrategy
from fastener_src import fastener

In [2]:
%%time
# Build the project's data preparator with both force-recompute flags disabled.
# NOTE(review): presumably this makes it reuse previously computed data (the cell
# output says "Loading game data from file ...") — confirm in DataPreparator.
data_p = DataPreparator(force_recompute_method_data=False, force_recompute_class_data=False)
# Fetch the games DataFrame (the displayed output has one row per GAME_ID).
all_games = data_p.get_games_df()
# Extend the frame via add_season_rankings.
# NOTE(review): presumably this is what adds the home_rank / visitor_rank columns
# seen at the end of the displayed frame — confirm in src.data_loader.
all_games = add_season_rankings(all_games)
display(all_games)


Loading game data from file ...


  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])
  teams = (home_teams.append(visitor_teams)).sort_values(by=["record_wins"])

Unnamed: 0,GAME_ID,play_count,home_team_id,visitor_team_id,home_record_wins,home_record_losses,season_name,visitor_team_city,visitor_team_nickname,home_final_score,...,visitor_recent_3PT_made,visitor_recent_3PT_missed,visitor_recent_ft_made,visitor_recent_ft_missed,visitor_recent_players_deployed,visitor_recent_rebound,visitor_recent_turnover,visitor_recent_foul,home_rank,visitor_rank
0,20000001,429,1610612752,1610612755,0,1,2000,Philadelphia,76ers,72,...,,,,,,,,,0,0
1,20000002,510,1610612751,1610612739,0,1,2000,Cleveland,Cavaliers,82,...,,,,,,,,,0,0
2,20000003,478,1610612753,1610612764,1,0,2000,Washington,Wizards,97,...,,,,,,,,,0,0
3,20000004,448,1610612737,1610612766,0,1,2000,Charlotte,Hornets,82,...,,,,,,,,,0,0
4,20000005,505,1610612761,1610612765,0,1,2000,Detroit,Pistons,95,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22960,21801226,474,1610612749,1610612760,60,22,2018,Oklahoma City,Thunder,116,...,12.466667,23.133333,15.333333,6.266667,10.466667,47.400000,13.266667,21.200000,11,8
22961,21801227,443,1610612759,1610612742,48,34,2018,Dallas,Mavericks,105,...,13.666667,26.266667,18.000000,5.866667,11.000000,46.333333,10.733333,21.133333,9,21
22962,21801228,434,1610612743,1610612750,54,28,2018,Minnesota,Timberwolves,99,...,10.133333,19.333333,17.800000,5.000000,10.866667,42.533333,12.666667,19.266667,10,9
22963,21801229,555,1610612746,1610612762,48,34,2018,Utah,Jazz,143,...,13.066667,20.533333,18.000000,6.200000,11.600000,47.733333,14.200000,21.266667,13,8


Wall time: 1.79 s


In [3]:
# Inspect the target column that the feature selection below tries to predict.
display(all_games["home_win"])

0        0
1        0
2        1
3        0
4        0
        ..
22960    0
22961    1
22962    1
22963    1
22964    1
Name: home_win, Length: 22965, dtype: int32

Selecting the subset of columns (candidate features) on which the calculations will be done.

In [4]:
# Candidate features describing the home team.
home_columns = [
    "home_team_id",
    "home_recent_TSP",
    "home_final_score_diff",
    "home_recent_home_game_ratio",
    "home_recent_win_ratio",
    "home_recent_points",
    "home_recent_fg_made",
    "home_recent_fg_missed",
    "home_recent_3PT_made",
    "home_recent_3PT_missed",
    "home_recent_ft_made",
    "home_recent_ft_missed",
    "home_recent_players_deployed",
    "home_recent_rebound",
    "home_recent_turnover",
    "home_recent_foul",
    "home_common_lineup",
    "home_rank",
]

# The same set of candidate features for the visiting team, in the same order.
visitor_columns = [
    "visitor_team_id",
    "visitor_recent_TSP",
    "visitor_final_score_diff",
    "visitor_recent_home_game_ratio",
    "visitor_recent_win_ratio",
    "visitor_recent_points",
    "visitor_recent_fg_made",
    "visitor_recent_fg_missed",
    "visitor_recent_3PT_made",
    "visitor_recent_3PT_missed",
    "visitor_recent_ft_made",
    "visitor_recent_ft_missed",
    "visitor_recent_players_deployed",
    "visitor_recent_rebound",
    "visitor_recent_turnover",
    "visitor_recent_foul",
    "visitor_common_lineup",
    "visitor_rank",
]

# Column order matters: gene index i used by FASTENER refers to selected_columns[i].
selected_columns = ["season_name", *home_columns, *visitor_columns]

# Preview of the feature frame and the target (taken before missing values are filled).
x_df = all_games[selected_columns]
y_series = all_games["home_win"]
display(x_df, y_series)

Unnamed: 0,season_name,home_team_id,home_recent_TSP,home_final_score_diff,home_recent_home_game_ratio,home_recent_win_ratio,home_recent_points,home_recent_fg_made,home_recent_fg_missed,home_recent_3PT_made,...,visitor_recent_3PT_made,visitor_recent_3PT_missed,visitor_recent_ft_made,visitor_recent_ft_missed,visitor_recent_players_deployed,visitor_recent_rebound,visitor_recent_turnover,visitor_recent_foul,visitor_common_lineup,visitor_rank
0,2000,1610612752,,,,,,,,,...,,,,,,,,,0,0
1,2000,1610612751,,,,,,,,,...,,,,,,,,,1,0
2,2000,1610612753,,,,,,,,,...,,,,,,,,,0,0
3,2000,1610612737,,,,,,,,,...,,,,,,,,,1,0
4,2000,1610612761,,,,,,,,,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22960,2018,1610612749,0.557041,8.133333,0.533333,0.666667,121.000000,43.666667,48.400000,13.800000,...,12.466667,23.133333,15.333333,6.266667,10.466667,47.400000,13.266667,21.200000,0,8
22961,2018,1610612759,0.555148,4.400000,0.533333,0.666667,111.133333,43.066667,45.866667,9.800000,...,13.666667,26.266667,18.000000,5.866667,11.000000,46.333333,10.733333,21.133333,0,21
22962,2018,1610612743,0.509244,-0.933333,0.400000,0.600000,104.200000,40.200000,48.600000,10.133333,...,10.133333,19.333333,17.800000,5.000000,10.866667,42.533333,12.666667,19.266667,0,9
22963,2018,1610612746,0.593993,1.266667,0.666667,0.666667,117.800000,42.133333,45.133333,11.133333,...,13.066667,20.533333,18.000000,6.200000,11.600000,47.733333,14.200000,21.266667,0,8


0        0
1        0
2        1
3        0
4        0
        ..
22960    0
22961    1
22962    1
22963    1
22964    1
Name: home_win, Length: 22965, dtype: int32

In [5]:
# Replace every missing value with 0 so that the estimators used below accept the data.
# NOTE(review): the rows shown earlier with NaN in the "recent_*" columns (the first
# games in the frame) become all-zero feature rows — confirm 0 is an acceptable placeholder.
all_games = all_games.fillna(0)
# Optional: restrict the analysis to a single season.
# all_games = all_games[all_games["season_name"] == 2017]
display(all_games)

Unnamed: 0,GAME_ID,play_count,home_team_id,visitor_team_id,home_record_wins,home_record_losses,season_name,visitor_team_city,visitor_team_nickname,home_final_score,...,visitor_recent_3PT_made,visitor_recent_3PT_missed,visitor_recent_ft_made,visitor_recent_ft_missed,visitor_recent_players_deployed,visitor_recent_rebound,visitor_recent_turnover,visitor_recent_foul,home_rank,visitor_rank
0,20000001,429,1610612752,1610612755,0,1,2000,Philadelphia,76ers,72,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
1,20000002,510,1610612751,1610612739,0,1,2000,Cleveland,Cavaliers,82,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
2,20000003,478,1610612753,1610612764,1,0,2000,Washington,Wizards,97,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
3,20000004,448,1610612737,1610612766,0,1,2000,Charlotte,Hornets,82,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
4,20000005,505,1610612761,1610612765,0,1,2000,Detroit,Pistons,95,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22960,21801226,474,1610612749,1610612760,60,22,2018,Oklahoma City,Thunder,116,...,12.466667,23.133333,15.333333,6.266667,10.466667,47.400000,13.266667,21.200000,11,8
22961,21801227,443,1610612759,1610612742,48,34,2018,Dallas,Mavericks,105,...,13.666667,26.266667,18.000000,5.866667,11.000000,46.333333,10.733333,21.133333,9,21
22962,21801228,434,1610612743,1610612750,54,28,2018,Minnesota,Timberwolves,99,...,10.133333,19.333333,17.800000,5.000000,10.866667,42.533333,12.666667,19.266667,10,9
22963,21801229,555,1610612746,1610612762,48,34,2018,Utah,Jazz,143,...,13.066667,20.533333,18.000000,6.200000,11.600000,47.733333,14.200000,21.266667,13,8


## Data preparation 

Selecting attributes from all data and the target variable.

In [6]:
# scikit-learn 0.22+ is needed 
# data_names = [name for name in data_subset.columns if name != filled_atr]
# print(data_names)

def generate_X_Y_df(data_frame, data_x, target_y, train_fraction=0.8):
    """Select the feature columns and the target, and compute the train/test boundary.

    Args:
        data_frame: pandas DataFrame holding both the features and the target.
        data_x: list of column names to use as features.
        target_y: name of the target column.
        train_fraction: fraction of the rows (taken from the top of the frame)
            that belongs to the training part. Defaults to 0.8, the value that
            was previously hard-coded.

    Returns:
        Tuple ``(X_df, Y_df, n_test)`` where ``X_df`` is ``data_frame[data_x]``,
        ``Y_df`` is ``data_frame[target_y]`` and ``n_test`` is the row index at
        which the test part starts, i.e. ``int(n_rows * train_fraction)``.
        Despite its name, ``n_test`` is the number of *training* rows; the name
        is kept because callers unpack it under that name.

    Raises:
        ValueError: if ``train_fraction`` is not within ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # data_matrix and target split
    X_df = data_frame[data_x]
    Y_df = data_frame[target_y]

    # basic dataset split: rows [0, n_test) are for training, rows [n_test, end) for testing
    n_sample = X_df.shape[0]
    n_test = int(n_sample * train_fraction)

    return X_df, Y_df, n_test

X_df, y_df, n_test = generate_X_Y_df(all_games, selected_columns, "home_win")

# Rows are split by position, without shuffling: the first n_test rows form the
# training part and the remaining rows form the test part.
labels_all = y_df.to_numpy().astype(float)
labels_train, labels_test = labels_all[:n_test], labels_all[n_test:]

features_all = X_df.to_numpy()
XX_train, XX_test = features_all[:n_test, :], features_all[n_test:, :]

## Evaluation function 

This function is copied from the FASTENER example, with the score changed to R² (`r2_score`).

In [7]:
def eval_fun(model: Any, genes: "Genes", shuffle_indices: Optional[List[int]] = None) -> "Result":
    """Score a fitted model on the test split restricted to the selected genes.

    Args:
        model: fitted estimator exposing ``predict``.
        genes: column selector applied to ``XX_test`` (the chosen features).
        shuffle_indices: optional positions, within the selected columns, whose
            values are shuffled before predicting; the shuffling is done on a
            copy of the selected test data.

    Returns:
        ``Result`` wrapping the R2 score of the predictions against ``labels_test``.
    """
    features = XX_test[:, genes]
    if shuffle_indices:
        # Work on a copy before shuffling the requested columns in place.
        features = features.copy()
        for column in shuffle_indices:
            shuffle(features[:, column])
    return Result(r2_score(labels_test, model.predict(features)))

## Setting configuration parameters 
Preparing variables to give as parameters to EntropyOptimizer. 
   
I had some problems with `ValueError: Unknown label type: 'continuous'`.   
|- This happens because classification labels cannot be floats; they have to be converted to discrete values first.    
|- Code for this:   
|---- lab_enc = preprocessing.LabelEncoder()   
|---- labels_train = lab_enc.fit_transform(labels_train)   

In [8]:
# Number of candidate features (genes): one per column of the training matrix.
number_of_genes = XX_train.shape[1]
# Estimator class (not an instance) that is handed to the optimizer.
# Alternatives noted by the author: RandomForestRegressor, LinearRegression.
general_model = DecisionTreeClassifier

# Select mating selection strategies (RandomEveryoneWithEveryone, NoMating) and mating strategy
# - the variant below works well
# NOTE(review): regression=True is passed although general_model is a classifier and
# the labels are 0/1 — confirm this is intended.
mating = RandomEveryoneWithEveryone(
    pool_size=5, 
    mating_strategy=IntersectionMatingWithWeightedRandomInformationGain(regression=True))
# The bare string below is a no-op left in as a note by the author. Its first line is
# Slovenian for "newer version that uses more RAM?"; the code inside it is the same
# configuration as the one assigned to `mating` above.
""" 
novejsa verzija ki spije več rama ?
mating = RandomEveryoneWithEveryone(pool_size=5, 
mating_strategy=IntersectionMatingWithWeightedRandomInformationGain(regression=True))
"""
# Random mutation (probability of gene mutating: 1 / number_of_genes)
mutation = RandomFlipMutationStrategy(1 / number_of_genes)

# To start the algorithm, initial_genes or initial_population must be provided.
# NOTE(review): [[0]] presumably means one starting individual made of gene index 0,
# i.e. the first entry of selected_columns — confirm against FASTENER.
initial_genes = [
    [0]
]

In [9]:
# The output folder name must be changed every time the algorithm is run.
# The same variable is used by the results cell to locate the generation logs
# (log/<output_folder_name>/...), so it is passed to Config instead of repeating
# the literal; otherwise renaming the folder here would leave the two out of sync.
output_folder_name = "output"
entropy_optimizer = fastener.EntropyOptimizer(
    general_model, XX_train, labels_train, eval_fun,
    number_of_genes, mating, mutation, initial_genes=initial_genes,
    config=fastener.Config(output_folder=output_folder_name, random_seed=2020, reset_to_pareto_rounds=5)
)

In [10]:
# Sanity check: show the training labels followed by the training feature matrix.
print(labels_train, XX_train, sep="\n")

[0. 0. 1. ... 1. 1. 1.]
[[2.00000000e+03 1.61061275e+09 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.00000000e+03 1.61061275e+09 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [2.00000000e+03 1.61061275e+09 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.01500000e+03 1.61061275e+09 5.69111755e-01 ... 1.96666667e+01
  0.00000000e+00 2.10000000e+01]
 [2.01500000e+03 1.61061276e+09 5.41860978e-01 ... 2.19333333e+01
  1.00000000e+00 2.00000000e+01]
 [2.01500000e+03 1.61061276e+09 5.42420464e-01 ... 2.22666667e+01
  1.00000000e+00 2.30000000e+01]]


## Running the algorithm

In [11]:
# Run the FASTENER optimisation loop; it prints "Round: N" as it progresses.
# NOTE(review): the results cell reads log/<output_folder_name>/generation_1000.pickle,
# so this call is presumably what writes the per-generation logs — confirm.
entropy_optimizer.mainloop()

Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Round: 48
Round: 49
Round: 50
Round: 51
Round: 52
Round: 53
Round: 54
Round: 55
Round: 56
Round: 57
Round: 58
Round: 59
Round: 60
Round: 61
Round: 62
Round: 63
Round: 64
Round: 65
Round: 66
Round: 67
Round: 68
Round: 69
Round: 70
Round: 71
Round: 72
Round: 73
Round: 74
Round: 75
Round: 76
Round: 77
Round: 78
Round: 79
Round: 80
Round: 81
Round: 82
Round: 83
Round: 84
Round: 85
Round: 86
Round: 87
Round: 88
Round: 89
Round: 90
Round: 91
Round: 92
Round: 93
Round: 94
Round: 95
Round: 96
Round: 97
Round: 98
Round: 99
Round: 100
Round: 1

Round: 756
Round: 757
Round: 758
Round: 759
Round: 760
Round: 761
Round: 762
Round: 763
Round: 764
Round: 765
Round: 766
Round: 767
Round: 768
Round: 769
Round: 770
Round: 771
Round: 772
Round: 773
Round: 774
Round: 775
Round: 776
Round: 777
Round: 778
Round: 779
Round: 780
Round: 781
Round: 782
Round: 783
Round: 784
Round: 785
Round: 786
Round: 787
Round: 788
Round: 789
Round: 790
Round: 791
Round: 792
Round: 793
Round: 794
Round: 795
Round: 796
Round: 797
Round: 798
Round: 799
Round: 800
Round: 801
Round: 802
Round: 803
Round: 804
Round: 805
Round: 806
Round: 807
Round: 808
Round: 809
Round: 810
Round: 811
Round: 812
Round: 813
Round: 814
Round: 815
Round: 816
Round: 817
Round: 818
Round: 819
Round: 820
Round: 821
Round: 822
Round: 823
Round: 824
Round: 825
Round: 826
Round: 827
Round: 828
Round: 829
Round: 830
Round: 831
Round: 832
Round: 833
Round: 834
Round: 835
Round: 836
Round: 837
Round: 838
Round: 839
Round: 840
Round: 841
Round: 842
Round: 843
Round: 844
Round: 845
Round: 846

## Reading results
Best-correlated features for a given time frame; the strongest correlation is usually between dates within a one-month range.    
**On attributes ['hour', 'minute', 'tot1 change'] or just ['hour', 'tot1 change']**

<pre>
Features: ['1_day_rolling', '3_day_rolling', 'analog2_fill-1_day_shifted', 'analog2_fill-2_day_shifted', 'analog2_fill-4_day_shifted', 'analog2_fill-5_day_shifted', 'analog2_fill-7_day_shifted']
Accuracy:  0.839029305148926  stdev:  0.13488830453845554 </pre>

In [12]:
# Read the log written for the last generation.
# NOTE(review): the generation number (1000) is hard-coded; presumably it equals the
# number of rounds the optimizer ran — confirm against the fastener Config used above.
# NOTE: unpickling executes arbitrary code; only load log files produced by this notebook.
data_dump = pd.read_pickle(f'log/{output_folder_name}/generation_1000.pickle')
# print(data_dump.front)

# list of best-scoring EvalItem objects for each number of features
best = list(data_dump.front.values())

# The target vector is the same for every candidate feature set, so convert it once.
y = y_df.values.astype(float)

# best_arr = [best mean cross-validation score so far, feature names that achieved it]
best_arr = [0, []]
for item in best:
    # names of best features
    selected_features = X_df.iloc[:, item.genes].columns.tolist()
    X = X_df[selected_features].values.astype(float)

    # Evaluate each set of features with 10-fold cross validation. The tree is seeded
    # (same seed as the optimizer config) so that repeated runs give the same scores.
    model = DecisionTreeClassifier(random_state=2020)  # alternative: LinearRegression()
    cvs = cross_val_score(model, X, y, cv=10)
    mean_score, std_score = cvs.mean(), cvs.std()

    print("Features:")
    for feature in selected_features:
        print(f"- {feature}")
    print("Accuracy: ", mean_score, " stdev: ", std_score, "\n")

    # Keep the feature set with the highest mean score.
    if mean_score > best_arr[0]:
        best_arr[0] = mean_score
        best_arr[1] = selected_features

Features:
- home_recent_win_ratio
Accuracy:  0.6284348506383877  stdev:  0.009256849133799209 

Features:
- home_recent_win_ratio
- visitor_recent_win_ratio
Accuracy:  0.6535604120812027  stdev:  0.010808832841248636 



In [13]:
# Show the best mean cross-validation score and the feature names that achieved it.
print(best_arr)

[0.6535604120812027, ['home_recent_win_ratio', 'visitor_recent_win_ratio']]
