In [None]:
import sklearn as sk
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
def stats(pred, test):
    """Print a classification report plus the accuracy, and return the accuracy.

    Parameters
    ----------
    pred : predicted labels.
    test : true labels the predictions are scored against.

    Returns
    -------
    The value of ``accuracy_score(test, pred)``.
    """
    banner = "\n-----------------------------------Report------------------------------------"
    print(banner)
    print(classification_report(test,pred))
    # Score once and reuse the value for both the printout and the return.
    accuracy = accuracy_score(test, pred)
    print('Accuracy: ', accuracy)
    print("\n")
    return accuracy

In [None]:
def executeNeuralNetwork(X, Y, testSize = 0.2, rand = 0, layers = (100,), solverType = 'adam', a = 0.0001, learningRate = 'constant', learningRateVal = 0.001, iterations = 200, earlyStopping = False, validationSet = 0.1, act = 'relu'):
    """Train an MLPClassifier on a train/test split of (X, Y) and report its errors.

    Parameters (forwarded to ``MLPClassifier`` unless noted)
    ----------
    X, Y : features and labels; split with ``train_test_split``.
    testSize : ``test_size`` of the split.
    rand : ``random_state`` for both the split and the network's weight and
        bias initialization.
    layers : ``hidden_layer_sizes``; the ith element is the number of neurons
        in the ith hidden layer.
    solverType : ``solver``; 'lbfgs' is a quasi-Newton optimizer, 'sgd' is
        stochastic gradient descent.
    a : ``alpha``, the L2 penalty (regularization term).
    learningRate : ``learning_rate`` schedule for weight updates; only used
        when solver='sgd'.
    learningRateVal : ``learning_rate_init``, the initial step size; only used
        when solver='sgd' or 'adam'.
    iterations : ``max_iter``; for sgd this is the number of epochs, not the
        number of gradient steps.
    earlyStopping : ``early_stopping``; stop training when the validation
        score is not improving. Only effective when solver='sgd' or 'adam'.
    validationSet : ``validation_fraction``, the share of training data held
        out for early stopping (between 0 and 1); only used if early stopping
        is on.
    act : ``activation`` function of the hidden layers.

    Returns
    -------
    ``[[trainError, testError], fittedClassifier]`` where each error is
    ``1 - accuracy`` as reported by ``stats``.
    """
    network = MLPClassifier(
        hidden_layer_sizes = layers,
        solver = solverType,
        alpha = a,
        learning_rate = learningRate,
        learning_rate_init = learningRateVal,
        max_iter = iterations,
        random_state = rand,
        early_stopping = earlyStopping,
        validation_fraction = validationSet,
        activation = act,
    )
    print("Parameters:")
    print(network.get_params())

    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = testSize, random_state = rand)
    fitted = network.fit(xTrain, yTrain)

    # Score the training split first, then the held-out split (each call prints a report).
    trainError = 1 - stats(fitted.predict(xTrain), yTrain)
    testError = 1 - stats(fitted.predict(xTest), yTest)
    return [[trainError, testError], fitted]

In [None]:
def executeSVM(X, Y, testSize = 0.2, rand = 0):
    """Train a linear-kernel SVC on a train/test split of (X, Y) and report its errors.

    Parameters
    ----------
    X, Y : features and labels; split with ``train_test_split``.
    testSize : ``test_size`` of the split.
    rand : ``random_state`` of the split.

    Returns
    -------
    ``[[trainError, testError], fittedClassifier]`` where each error is
    ``1 - accuracy`` as reported by ``stats``.
    """
    # NOTE(review): scikit-learn documents gamma as the kernel coefficient for
    # 'rbf', 'poly' and 'sigmoid'; with kernel='linear' it presumably has no
    # effect. It is still passed so the printed parameters stay the same.
    svm = SVC(
        C = 511.5344811797876,
        class_weight = 'balanced',
        gamma = 0.06435475773883788,
        kernel = 'linear',
    )
    print("Parameters:")
    print(svm.get_params())

    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = testSize, random_state = rand)
    fitted = svm.fit(xTrain, yTrain)

    # Score the training split first, then the held-out split (each call prints a report).
    trainError = 1 - stats(fitted.predict(xTrain), yTrain)
    testError = 1 - stats(fitted.predict(xTest), yTest)
    return [[trainError, testError], fitted]

In [None]:
def executeLogisticRegression(X, Y, testSize = 0.2, rand = 0):
    """Train an L2-penalised LogisticRegression on a train/test split and report its errors.

    Parameters
    ----------
    X, Y : features and labels; split with ``train_test_split``.
    testSize : ``test_size`` of the split.
    rand : ``random_state`` of the split.

    Returns
    -------
    ``[[trainError, testError], fittedClassifier]`` where each error is
    ``1 - accuracy`` as reported by ``stats``.
    """
    model = LogisticRegression(
        penalty = 'l2',
        C = 5.76650390625e-05,
        max_iter = 500,
    )
    print("Parameters:")
    print(model.get_params())

    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = testSize, random_state = rand)
    fitted = model.fit(xTrain, yTrain)

    # Score the training split first, then the held-out split (each call prints a report).
    trainError = 1 - stats(fitted.predict(xTrain), yTrain)
    testError = 1 - stats(fitted.predict(xTest), yTest)
    return [[trainError, testError], fitted]

In [None]:
import numpy as np
import pandas as pd
import random
def apply(dataRow):
    """Return the first eleven fields of a row, randomly swapping its two four-column blocks.

    A single ``random.choice([True, False])`` decides the outcome:

    * False: columns 0-10 are returned unchanged.
    * True: columns 6-9 are placed ahead of columns 2-5 and the eleventh
      value is replaced by the string ``'0'``.

    The result is always a new list; ``dataRow`` is not modified.
    """
    swapBlocks = random.choice([True, False])
    if not swapBlocks:
        return list(dataRow[0:11])
    return [*dataRow[0:2], *dataRow[6:10], *dataRow[2:6], '0']

def addStats(dataRow, playerData):
    """Replace the two key columns of a row with the matching entries of ``playerData``.

    Columns 2 and 6 of ``dataRow`` are converted with ``float`` and used as
    keys into ``playerData``; each key column is replaced by the values stored
    under that key. All other columns keep their relative order.

    Raises ``KeyError`` if either key is missing from ``playerData``.
    """
    firstStats = playerData[float(dataRow[2])]
    secondStats = playerData[float(dataRow[6])]
    return [*dataRow[0:2], *firstStats, *dataRow[3:6], *secondStats, *dataRow[7:]]

In [None]:
# Load data
fGameData = pd.read_csv("ATP_data_cleaned.csv", delimiter=',')
gameData = np.array([list(row) for row in fGameData.values])
# Separate features and target columns.
# apply() swaps the two players' column blocks at random, so seed the
# generator first: the same rows are then swapped on every fresh run.
random.seed(0)
gameData = np.array([apply(item) for item in gameData])
X = gameData[:5000, 0:10]
Y = gameData[:5000, 10]
# Add in player data: map the key in column 1 of the player file to the
# remaining columns of that row.
fGameData = pd.read_csv("ATP_player_data.csv", delimiter=',')
playerData = np.array([list(row) for row in fGameData.values])
playerData = dict((float(row[1]),list(row[2:])) for row in playerData)
# dtype='float64' already produces a float array, so the former
# `X.astype(np.float)` cast is dropped: the `np.float` alias was removed in
# NumPy 1.24 and raises AttributeError there.
X = np.array([addStats(item, playerData) for item in X], dtype='float64')

In [None]:
# Best parameters.
# Each execute* helper returns [result, clf]; index 1 is the fitted classifier.
# Exactly one of the assignments below should be active.
#clf = executeNeuralNetwork(X, Y, layers = tuple([60]*2), learningRate = 'adaptive', learningRateVal = 0.0001, act = 'logistic', earlyStopping = True)[1]
clf = executeSVM(X, Y)[1]
#clf = executeLogisticRegression(X, Y)[1]

In [None]:
import numpy as np
import pandas as pd
import random
def apply(dataRow):
    """Return the first eleven fields of a row, randomly swapping its two four-column blocks.

    A single ``random.choice([True, False])`` decides the outcome:

    * False: columns 0-10 are returned unchanged.
    * True: columns 6-9 are placed ahead of columns 2-5 and the eleventh
      value is replaced by the string ``'0'``.

    The result is always a new list; ``dataRow`` is not modified.
    """
    swapBlocks = random.choice([True, False])
    if not swapBlocks:
        return list(dataRow[0:11])
    return [*dataRow[0:2], *dataRow[6:10], *dataRow[2:6], '0']

def addStats(dataRow, playerData):
    """Build a row that starts with both key columns, followed by the expanded row.

    The result begins with columns 2 and 6 of ``dataRow`` as they are. After
    that comes the row with each of those two columns replaced by the values
    stored in ``playerData`` under ``float(column)``; the other columns keep
    their relative order.

    Raises ``KeyError`` if either key is missing from ``playerData``.
    """
    combined = [dataRow[2], dataRow[6]]
    pieces = (
        dataRow[0:2],
        playerData[float(dataRow[2])],
        dataRow[3:6],
        playerData[float(dataRow[6])],
        dataRow[7:],
    )
    for piece in pieces:
        combined.extend(piece)
    return combined

In [None]:
# Load data
fGameData = pd.read_csv("ATP_data_cleaned.csv", delimiter=',')
gameData = np.array([list(row) for row in fGameData.values])
# Separate features and target columns.
# apply() swaps the two players' column blocks at random, so seed the
# generator first: the same rows are then swapped on every fresh run.
random.seed(0)
gameData = np.array([apply(item) for item in gameData])
# NOTE(review): presumably these 64 rows are the first-round matches of the
# tournament whose results are in AusOpen2019.csv — confirm the offset.
firstMatchRow = 81508
firstRoundMatches = 64
X = gameData[firstMatchRow:firstMatchRow + firstRoundMatches, 0:10]
Y = gameData[firstMatchRow:firstMatchRow + firstRoundMatches, 10]
# Recorded results: column 2 of the headerless results file.
fGameData = pd.read_csv("AusOpen2019.csv", delimiter=',', header = None)
winners = [row[2] for row in fGameData.values]
# Add in player data. The player file is read once and used for both lookups:
# column 1 is the key, column 0 goes to playerName, columns 2+ to playerData.
fGameData = pd.read_csv("ATP_player_data.csv", delimiter=',')
playerName = dict((float(row[1]), row[0]) for row in fGameData.values)
playerData = np.array([list(row) for row in fGameData.values])
playerData = dict((float(row[1]),list(row[2:])) for row in playerData)
# dtype='float64' already produces a float array, so the former
# `X.astype(np.float)` cast is dropped: the `np.float` alias was removed in
# NumPy 1.24 and raises AttributeError there.
X = np.array([addStats(item, playerData) for item in X], dtype='float64')

In [None]:
def tournament(X, Y, ActualWinners, clf, playerName):
    """Simulate a 128-player single-elimination draw with ``clf`` and print per-round accuracy.

    The first 64 rows of ``X`` are the opening-round matches. In every match
    row, slots 0 and 1 hold the two player ids, everything from index 2 onward
    is the feature vector passed to ``clf.predict``, and the values from index
    4 onward are split into two equal halves: the first half belongs to the
    player in slot 0, the second half to the player in slot 1.

    For each round the predicted winners are printed by name, followed by the
    share of predicted winners that appear among the recorded winners of that
    round. Predicted winners are then paired in order (1st vs 2nd, 3rd vs
    4th, ...) to form the next round.

    Fixes relative to the earlier copy-pasted version:
    * next-round rows now take ids AND stats from the same two players
      (stats were previously read from entries ``i`` and ``i+1`` while ids
      came from ``2*i`` and ``2*i+1``);
    * the round-of-32 accuracy is divided by 16 matches, not 18.

    Parameters
    ----------
    X : sequence of match rows; only the first 64 are used.
    Y : unused; kept so existing calls keep working.
    ActualWinners : sequence of 127 recorded winner ids, ordered by round:
        64 first-round winners, then 32, 16, 8, 4, 2 and the champion.
    clf : object with a ``predict`` method returning one string label per
        row. The label ``'0'`` advances the player in slot 0; any other
        label advances the player in slot 1.
    playerName : mapping from player id to the name that is printed.

    Raises
    ------
    ValueError : if ``X`` has fewer than 64 rows.
    """
    # Values written into the two match-level feature slots of every match
    # that is built from predicted winners.
    # NOTE(review): the meaning of 0 and 5 is not visible here — confirm
    # against the first two feature columns of the training data.
    matchFeatures = [0, 5]

    # (heading, number of matches in the round, text printed after the accuracy)
    schedule = [
        ("Round of 128 predicted winners:", 64, "\n\n\n"),
        ("Round of 64 predicted winners:", 32, "\n\n\n"),
        ("Round of 32 predicted winners:", 16, "\n\n\n"),
        ("Round of 16 predicted winners:", 8, "\n\n\n"),
        ("Quarterfinal predicted winners:", 4, "\n\n\n"),
        ("Semifinal predicted winners:", 2, "\n\n\n"),
        ("Predicted tournament winner:", 1, "\n\n"),
    ]

    matches = [list(item) for item in X[0:64]]
    if len(matches) != 64:
        raise ValueError("tournament needs 64 first-round rows in X, got " + str(len(matches)))

    firstResult = 0      # index in ActualWinners where the current round starts
    totalCorrect = 0
    totalMatches = 0
    for heading, matchCount, trailer in schedule:
        recorded = ActualWinners[firstResult:firstResult + matchCount]
        predictions = clf.predict([match[2:] for match in matches])

        advancing = []   # one [id, stats...] entry per predicted winner
        correct = 0
        for prediction, match in zip(predictions, matches):
            winnerSlot = 0 if prediction == '0' else 1
            if int(match[winnerSlot]) in recorded:
                correct += 1
            # Carry the winner's id together with that player's half of the stats.
            playerWidth = (len(match) - 4) // 2
            start = 4 + winnerSlot * playerWidth
            advancing.append([match[winnerSlot]] + match[start:start + playerWidth])

        print(heading)
        for player in advancing:
            print(playerName[player[0]])
        print("Accuracy: " + str(correct / matchCount) + trailer)

        totalCorrect += correct
        totalMatches += matchCount
        firstResult += matchCount

        # Pair consecutive winners; ids and stats come from the same two entries.
        matches = [
            [first[0], second[0]] + matchFeatures + first[1:] + second[1:]
            for first, second in zip(advancing[0::2], advancing[1::2])
        ]

    print("Tournament Accuracy: " + str(totalCorrect / totalMatches))

In [None]:
# Simulate the draw with the trained classifier and print the per-round accuracy.
tournament(X, Y, winners, clf, playerName)