In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Classification
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Local imports - these functions are located in ./local_definitions.py
from local_definitions import plot_confusion_matrix
from local_definitions import plot_accuracy_with_errorbar
from local_definitions import plot_accuracy
from local_definitions import print_max_accuracy

In [2]:
# Import data from main_notebook.ipynb

import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only ever point this at a file written by this project.
# The `with` block guarantees the handle is closed even if unpickling raises.
with open("./data/games_dfs.pickle", "rb") as pickle_in:
    games = pickle.load(pickle_in)

# Per-dataset game DataFrames stored in the pickle, plus the combined frame.
games_10_df = games["games_10_df"]
games_14_df = games["games_14_df"]
games_18_df = games["games_18_df"]
games_comb_df = games["games_comb_df"]

features_list = games["features_list"] # sorted alphabetically

avg_importance_scores_df = games["avg_importance_scores_df"] # most important to least important
features_array = games["features_array"] # sorted from most important to least important

# Named feature groups used to select column subsets.
features_dict = {
    "out_of_game" : ["value","top100","rank"],
    "in_game" : ["passes completed", "total passes", "passes acc", "total shots", "on-target", "possession",
                  "distance", "distance poss", "distance not poss", "offsides", "fouls committed", "fouls suffered",
                  "yellow", "red", "high intensity", "low intensity", "sprints", "time opp half", "time opp third"],
    "all_features" : features_list
}

In [26]:
# Colour scale shared by the decision-region heat map and the training markers.
_WDL_COLORSCALE = 'Spectral'

# Labels handled by this plot are -1 (lose), 0 (draw) and 1 (win). Pinning the
# colour range to these bounds makes every label map to the same colour in every
# trace, even when the classifier never predicts one of the classes.
_WDL_LABEL_MIN = -1
_WDL_LABEL_MAX = 1

# (label, legend name, marker colour) for the test-set traces, in plotting order.
# The colours are the low / middle / high ends of the colour scale above.
_WDL_TEST_STYLES = (
    (1, "Win", 'rgb(94, 79, 162)'),   # purple
    (0, "Draw", 'rgb(255,255,191)'),  # yellow
    (-1, "Lose", 'rgb(158,1,66)'),    # red
)


def _as_feature_matrix(data, name):
    '''Return data as an ndarray, raising ValueError unless it is 2-D with at least two columns.'''
    arr = np.asarray(data)
    if arr.ndim != 2 or arr.shape[1] < 2:
        raise ValueError(
            "%s must be a 2-D array with two feature columns, got shape %s" % (name, arr.shape)
        )
    return arr


def _decision_grid(clf, X, h):
    '''Return (x1_axis, x2_axis, Z) where Z[i, j] is clf's prediction at (x1_axis[j], x2_axis[i]).'''
    # Pad the training range by 1 on every side so no point sits on the plot edge.
    x1_axis = np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, h)
    x2_axis = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, h)
    xx_1, xx_2 = np.meshgrid(x1_axis, x2_axis)
    Z = clf.predict(np.c_[xx_1.ravel(), xx_2.ravel()])
    return x1_axis, x2_axis, np.asarray(Z).reshape(xx_1.shape)


def visualize_decision_boundary_plotly(clf, X, y, X_test, y_test, title="", h=0.02):
    '''
    Pass in a trained classifier and this function will produce two plots showing the classifier's decision boundaries.
    The first plot (left) overlays the training points colored by their true labels over the decision boundary plot.
    The second plot (right) overlays the testing points colored by their true labels over the decision boundary plot.
    Uses Plotly. The figure is displayed with fig.show(); nothing is returned.

    Inputs
    -------------
        clf:    sklearn classifier. Should already have been trained using .fit(X_train, y_train)
                on exactly two features, because it is asked to predict on a two-column grid
        X:      X_train. Pass in the training X data (array-like; columns 0 and 1 are plotted)
        y:      y_train. Pass in the training y data (labels -1 / 0 / 1)
        X_test: testing X data (array-like; columns 0 and 1 are plotted)
        y_test: testing y data (labels -1 = Lose, 0 = Draw, 1 = Win)
        title:  the title for the entire figure
        h:      step size to produce the decision boundary, which is implemented using a heat map

    Raises
    -------------
        ValueError: if X or X_test is not 2-D with at least two columns, if X has no rows,
                    or if h is not positive

    '''
    # Kept local (mirroring the notebook-level imports) so the function is self-contained.
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    if h <= 0:
        raise ValueError("h must be a positive step size, got %r" % (h,))
    X = _as_feature_matrix(X, "X")
    X_test = _as_feature_matrix(X_test, "X_test")
    if X.shape[0] == 0:
        raise ValueError("X must contain at least one training point")
    # Arrays (not lists) are needed for the boolean masks built from y_test below.
    y = np.asarray(y)
    y_test = np.asarray(y_test)

    x1_axis, x2_axis, Z = _decision_grid(clf, X, h)

    def _boundary_heatmap():
        # A fresh trace per subplot; both panels show the same decision regions.
        return go.Heatmap(x=x1_axis, y=x2_axis, z=Z,
                          colorscale=_WDL_COLORSCALE,
                          zmin=_WDL_LABEL_MIN,
                          zmax=_WDL_LABEL_MAX,
                          opacity=0.8,
                          showscale=False)

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Training Points",
                                                        "Testing Points"))

    # Left panel: decision regions with the training points coloured by true label.
    fig.add_trace(_boundary_heatmap(), row=1, col=1)
    fig.add_trace(go.Scatter(x=X[:, 0], y=X[:, 1],
                             mode='markers',
                             showlegend=False,
                             marker=dict(size=7,
                                         color=y,
                                         colorscale=_WDL_COLORSCALE,
                                         cmin=_WDL_LABEL_MIN,
                                         cmax=_WDL_LABEL_MAX,
                                         line=dict(color='black', width=1))),
                  row=1, col=1)

    # Right panel: the same regions with one legend entry per test-set outcome.
    fig.add_trace(_boundary_heatmap(), row=1, col=2)
    for label, legend_name, marker_color in _WDL_TEST_STYLES:
        selected = y_test == label
        fig.add_trace(go.Scatter(x=X_test[selected, 0], y=X_test[selected, 1],
                                 mode='markers',
                                 showlegend=True,
                                 name=legend_name,
                                 marker=dict(size=7,
                                             color=marker_color,
                                             line=dict(color='black', width=1))),
                      row=1, col=2)

    fig.update_layout(height=480,
                      width=960,
                      title_text=title,
                      legend=dict(itemsizing='constant',
                                  traceorder='reversed'))
    # The axes carry no meaningful units here, so ticks are hidden on both panels.
    fig.update_xaxes(title_text='feature 0', showticklabels=False, zeroline=False)
    fig.update_yaxes(title_text='feature 1', showticklabels=False, zeroline=False)

    fig.show()

# a1: SVM decision boundaries

In [28]:
# Feature matrix (all 22 features) and the W/D/L target
X = np.array(games_comb_df[features_array])
y = np.array(games_comb_df['WDL'])

# Hold out 30% of the games for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train, y_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Project onto the discriminant axes, again fitted on the training rows only
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

# RBF-kernel SVM trained on the LDA-projected training data
clf = SVC(kernel='rbf', C=0.55).fit(X_train, y_train)

# Note: the decision boundaries are drawn from the classifier fitted on the
# training data; both the training and the testing points are overlaid on them.
visualize_decision_boundary_plotly(
    clf, X_train, y_train, X_test, y_test,
    title="Decision Boundaries for SVC with LDA:",
)
