### Dependencies

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection, svm
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

### Feature analysis class

In [None]:
# print('accuracy', result[-1])

class InferringFeature:
    """Probe how a model's mean output changes as one feature is varied.

    The instance keeps the raw feature matrix ``X``, the labels ``y`` and a
    standard-scaled copy of ``X``.  ``infer`` overwrites a single column of
    the scaled matrix with each candidate value in turn and records the mean
    of the model output.

    Attributes:
        X: Raw feature matrix, indexed as ``X[:, feature_index]``.
        y: Labels; stored but not used by the methods of this class.
        scaled_X: ``StandardScaler().fit_transform(X)``.
        x_coordinate_next: Optional one-shot override of the x grid.  The
            empty string (or ``None``) means "derive the grid from the
            data"; ``infer`` and ``plot`` reset it to ``''`` when they finish.
    """

    # Class-level default so the attribute exists even on instances that
    # never set an override.
    x_coordinate_next = ''

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.scaled_X = StandardScaler().fit_transform(X)

    def _has_x_override(self):
        """Return True when a caller supplied an explicit x grid."""
        override = self.x_coordinate_next
        if override is None:
            return False
        # Only compare against '' when the override really is a string:
        # ``ndarray != ''`` is elementwise on current NumPy and cannot be
        # used as a truth value.
        return not (isinstance(override, str) and override == '')

    def get_x_coordinate(self, feature_index=1, x_type='continous'):
        """Return the grid of raw feature values to evaluate.

        Args:
            feature_index: Column of ``X`` being varied.
            x_type: ``'continous'`` (historical spelling, also accepted as
                ``'continuous'``) yields steps of 0.01 over ``[min, max)``;
                any other value yields unit steps over ``[min, max]``.

        Returns:
            The pending ``x_coordinate_next`` override if one is set,
            otherwise a ``numpy`` range built from the feature column.
        """
        if self._has_x_override():
            return self.x_coordinate_next
        feature = self.X[:, feature_index]
        if x_type in ('continous', 'continuous'):
            return np.arange(feature.min(), feature.max(), .01)
        return np.arange(feature.min(), feature.max() + 1)

    def infer(
        self, model, need_plot=True, predict_func='predict_proba',
        feature_index=1, result_index=1, x_type='continous',
        plot_args=None
    ):
        """Evaluate the mean model output for each value of one feature.

        Args:
            model: Fitted estimator exposing the method named by
                ``predict_func``.
            need_plot: When true, draw the curve via ``plot``.
            predict_func: Name of the model method to call (a dotted
                attribute path is accepted).
            feature_index: Column of ``X`` being varied.
            result_index: Output column averaged when the model returns a
                2-D result; ignored for 1-D results.
            x_type: Forwarded to ``get_x_coordinate``.
            plot_args: Extra keyword arguments for ``plot``.

        Returns:
            A list with one mean output per grid value.
        """
        plot_args = {} if plot_args is None else plot_args

        # Resolve the prediction callable by attribute lookup instead of
        # evaluating a formatted source string.
        predict = model
        for attribute in predict_func.split('.'):
            predict = getattr(predict, attribute)

        feature = self.X[:, feature_index]
        feature_mean = feature.mean()
        feature_std = feature.std()
        if feature_std == 0:
            # A constant column would otherwise divide by zero; use a unit
            # scale, as StandardScaler does for zero-variance features.
            feature_std = 1.0

        try:
            x_inferring = self.get_x_coordinate(feature_index, x_type)
            y_inferring = []
            for v in x_inferring:
                modified_X = self.scaled_X.copy()
                # Apply the same standardisation the column received.
                modified_X[:, feature_index] = (v - feature_mean) / feature_std
                output = np.asarray(predict(modified_X))
                if output.ndim > 1:
                    y_inferring.append(output[:, result_index].mean())
                else:
                    y_inferring.append(output.mean())
            if need_plot:
                self.plot(
                    y_inferring, feature_index=feature_index,
                    result_index=result_index, x_type=x_type, **plot_args
                )
        finally:
            # The override is one-shot, even when prediction fails.
            self.x_coordinate_next = ''
        return y_inferring

    def plot(self, y, title=None, result_index=1, feature_index=1, x_type='continous', xlabel='Home Attendance Percentage'):
        """Plot ``y`` against the x grid and return the ``plt`` module.

        Args:
            y: Values to draw, one per grid point.
            title: Optional plot title.
            result_index: Used only in the y-axis label.
            feature_index: Column of ``X`` the grid is derived from.
            x_type: Forwarded to ``get_x_coordinate``.
            xlabel: Label for the x axis.
        """
        x_inferring = self.get_x_coordinate(feature_index, x_type)
        plt.plot(x_inferring, y, label='modified')
        if title:
            plt.title(title)
        plt.ylabel('Mean of modeling result P(y = {})'.format(result_index))
        plt.xlabel(xlabel)
        sns.despine()
        self.x_coordinate_next = ''
        return plt

### Changing result feature: 1 = win or tie, 0 = loss

In [None]:
def change_result_from_raw2data(
    path1 = '/datasets/rawdataversion2/raw_data3.csv', 
    path2 = '/root/work/demonstration/datasets/final.csv',
    logic = lambda raw: 1 if raw['Score'] - raw['Lost'] >= 0 else 0
):
    """Rebuild the ``Y`` column of the final table from the raw match table.

    Args:
        path1: CSV with the raw rows; ``logic`` is applied to each row, and
            its ``Home_Team`` / ``Away_Team`` columns are renamed to
            ``HomeTeam`` / ``AwayTeam`` before merging.
        path2: CSV with the final table, which must contain ``HomeTeam``,
            ``AwayTeam``, ``Date`` and ``Y``.
        logic: Row-wise function producing the new label.  The default
            returns 1 when ``Score - Lost >= 0`` and 0 otherwise.

    Returns:
        A new DataFrame: the final table left-joined on
        ``HomeTeam``/``AwayTeam``/``Date`` with the original ``Y`` replaced
        by the computed label (placed as the last column).
    """
    join_keys = ['HomeTeam', 'AwayTeam', 'Date']

    raw_table = pd.read_csv(path1)
    final_table = pd.read_csv(path2)

    # Label every raw row first, then align column names with the final table.
    raw_table['result'] = raw_table.apply(logic, axis=1)
    raw_table = raw_table.rename(columns={
        'Home_Team': 'HomeTeam',
        'Away_Team': 'AwayTeam',
    })

    merged = final_table.merge(
        raw_table[join_keys + ['result']], on=join_keys, how='left'
    )
    # Swap the stored label for the recomputed one.
    return merged.drop(columns=['Y']).rename(columns={'result': 'Y'})

### Changing result feature: 0 = loss, 1 = tie, 2 = win

In [None]:
def change_result2_from_raw2data(
    path1 = '/datasets/rawdataversion2/raw_data3.csv', 
    path2 = '/root/work/demonstration/datasets/final.csv'
):
    """Rebuild ``Y`` as a three-way label via ``change_result_from_raw2data``.

    Each raw row is labelled from ``Score - Lost``: 2 when positive, 1 when
    exactly zero, and 0 otherwise.

    Args:
        path1: Raw CSV path, forwarded unchanged.
        path2: Final CSV path, forwarded unchanged.

    Returns:
        Whatever ``change_result_from_raw2data`` returns for this labelling.
    """
    def logic(raw):
        margin = raw['Score'] - raw['Lost']
        if margin > 0:
            return 2
        if margin == 0:
            return 1
        return 0

    return change_result_from_raw2data(path1=path1, path2=path2, logic=logic)

### Data

In [None]:
def split_Xy(data):
    """Split a table into a feature array and a label array.

    Features are the columns from position 4 up to (but excluding) the last
    column; labels are taken from the column named ``Y``.

    Returns:
        ``(X, y)`` as the underlying ``.values`` arrays.
    """
    feature_columns = data.iloc[:, 4:-1]
    label_column = data['Y']
    return feature_columns.values, label_column.values

def get_train_validation_test(X, y, test_ratio=.2, validation_ratio=.1, random_state=None):
    """Split ``X``/``y`` into train, validation and test partitions.

    The test partition is carved out first with ``train_test_split``; the
    validation partition is then taken from the remaining rows.

    Args:
        X: Feature rows.
        y: Labels aligned with ``X``.
        test_ratio: Fraction of all rows placed in the test partition.
        validation_ratio: Fraction of the *non-test* rows placed in the
            validation partition.
        random_state: Optional seed forwarded to ``train_test_split`` for a
            reproducible shuffle.

    Returns:
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    X_trainfull, X_test, y_trainfull, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state
    )
    # Count rows, not elements: ``.size`` on a 2-D array is rows * columns,
    # which pushed the split point past the end of the data.
    n_valid = int(len(X_trainfull) * validation_ratio)
    # The leading ``n_valid`` rows form the validation set and the remainder
    # is used for training (previously the two were swapped, so training
    # received only the ``validation_ratio`` share).
    X_valid, X_train = X_trainfull[:n_valid], X_trainfull[n_valid:]
    y_valid, y_train = y_trainfull[:n_valid], y_trainfull[n_valid:]
    return X_train, X_valid, X_test, y_train, y_valid, y_test

def data_loseandtie():
    """Load ``final.csv`` and return it as ``(X, y)`` using its stored ``Y``."""
    final_table = pd.read_csv('/root/work/demonstration/datasets/final.csv')
    features, labels = split_Xy(final_table)
    return features, labels

def data_tieandwin():
    """Return ``(X, y)`` with ``Y`` rebuilt by ``change_result_from_raw2data``.

    Uses that function's default paths and default labelling rule.
    """
    relabelled = change_result_from_raw2data()
    features, labels = split_Xy(relabelled)
    return features, labels

def data_splited():
    """Return ``(X, y)`` with ``Y`` rebuilt by ``change_result2_from_raw2data``.

    Uses that function's default paths.
    """
    relabelled = change_result2_from_raw2data()
    features, labels = split_Xy(relabelled)
    return features, labels

In [None]:
def get_configs(path='/root/work/configs.json'):
    """Load and return the parsed JSON configuration.

    Args:
        path: Location of the JSON file; defaults to the project's
            ``configs.json``.

    Returns:
        The deserialised JSON content.
    """
    # The context manager closes the handle even if parsing fails; the
    # previous version left the file open.
    with open(path, encoding='utf-8') as input_file:
        return json.load(input_file)

In [None]:
def get_features_name(path='/root/work/demonstration/datasets/Features_Metadata.json'):
    """Return the ``Name`` column of the features metadata file.

    Args:
        path: Location of the metadata JSON; defaults to the project's
            ``Features_Metadata.json``.

    Returns:
        The ``.values`` array of the ``Name`` column as read by
        ``pandas.read_json``.
    """
    features = pd.read_json(path)
    return features['Name'].values

all.csv  pipelines.ipynb  test.ipynb


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b150ef30-52fe-41e2-b3cd-643278d2147a' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>