In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 30

In [23]:
TeamCount = {} # Counting the number of matches that the team played
TeamWCount = {} # Number of winning 
smooth_value = 5 # Moving average value

df = pd.read_csv('MatchData.csv', index_col=0) # Read the data scrapped from NBA website
df = df.iloc[::-1].reset_index(drop=True) # Reverse the order of dataset so that the date will arrnage in ascending order
df.drop(['Min'],axis=1,inplace=True) 

gameStat = pd.DataFrame(columns=list(df.columns) + ['WinRate'])

In [24]:
# Function that return a row with nan values
# Helps on getting the moving average
def nan_row(): 
    return pd.DataFrame(np.array([np.repeat(np.nan, len(df.columns))]), columns=df.columns)

# Function that return logistic model and it's accuracy
def fit_logistic(train_valid, train_features, train_target, return_model=False):
    # Splitting the dataset to training and testing data
    train_data, test_data = train_test_split(train_valid, test_size=0.2)

    # Define the features and target of training data
    train_X, train_Y = train_data[train_features], train_data[train_target]
    test_X, test_Y = test_data[train_features], test_data[train_target]

    # Confirm that the target is ok for fitting into the model
    train_Y = train_Y.astype('int')
    test_Y = test_Y.astype('int')

    # Fit the logistic regression model to the training dataset
    model = linear_model.LogisticRegression()
    model.fit(train_X, train_Y)

    # Estimate the accuracy of the model
    model_accuracy = model.score(test_X, test_Y)

    if return_model:
        return model_accuracy, model
    else:
        return model_accuracy

# Function that helps on getting the best features for the prediction of winning team
# Here I don't train with different combinations of features is because the combinations
# of 45 features is a very large nunmbers and is time consuming
# Greedy Algorithm
def greedy_algo(train_valid, features, target):
    # initialize a list to save features
    greedy_select = []
    
    accuracy_greedy_algo = np.array([])

    for i in range(len(features)):
        accuracy = np.array([])
        # Define which features is not picked by the greedy algorithm
        features_left = list(set(features) - set(greedy_select))

        # This loop helps on looping through the features that are not selected yet
        for new in features_left:
            features_new = greedy_select + [new]
            train_valid_sub = train_valid[features_new + target]

            # CrossValidation, compute the accuracy and save it into accuracy_sub
            accuracy_sub = fit_logistic(train_valid_sub, features_new, target)
            # Storing accuracy of the new features. After ending the loop
            # I will extract the new feature with the greatest accuracy
            accuracy = np.append(accuracy, accuracy_sub)

        # pick the features that gives the greatest accuracy
        # and add it into our features list
        # meanwhile, save the corresponding accuracy
        greedy_select += [features_left[accuracy.argmax()]]
        accuracy_greedy_algo = np.append(accuracy_greedy_algo, accuracy.max())
        
    return greedy_select[:(accuracy_greedy_algo.argmax()+1)]

In [25]:
# Restructuring dataset and calculate the moving average of each team
# Here I will use a 5 games moving average
for i in range(len(df)):
    TeamName = df['Team'][i]

    if TeamName not in TeamCount:
        TeamCount[TeamName] = 1
        TeamWCount[TeamName] = 1 if df.iloc[i,3] == 'W' else 0
        gameStat = pd.concat([gameStat,nan_row()]).reset_index(drop=True)
    else: 
        win_rate = TeamWCount[TeamName] / TeamCount[TeamName]
        TeamCount[TeamName] += 1
        TeamWCount[TeamName] = TeamWCount[TeamName] + (1 if df.iloc[i,3] == 'W' else 0)
        n = TeamCount[TeamName]

        if n > smooth_value:
            tempRow = df[df['Team']==TeamName][n-smooth_value-1 : n-1].mean()
            tempRow = pd.DataFrame(tempRow).transpose()
        else:
            tempRow = nan_row()
        
        tempRow['Team'] = [TeamName]
        tempRow['MatchUp'] = [df.iloc[i,1]]
        tempRow['Date'] = [df.iloc[i,2]]
        tempRow['W/L'] = [df.iloc[i,3]]
        tempRow['WinRate'] = [win_rate]
        gameStat = pd.concat([gameStat,tempRow])

gameStat.reset_index(drop=True,inplace=True)
gameStat['Date'] = pd.to_datetime(gameStat['Date'])
gameStat['FGP'] = gameStat['FGM']/gameStat['FGA']
gameStat['3PP'] = gameStat['3PM']/gameStat['3PA']
gameStat['FTP'] = gameStat['FTM']/gameStat['FTA']

gameStat.rename(columns={'W/L':'Winning'},inplace=True)
gameStat['Winning'] = np.where(gameStat['Winning'] == 'W', 1, 0)

gameStat.dropna(inplace=True)
gameStat.reset_index(drop=True, inplace=True)

In [26]:
dropList = []
jump = False

# Sicne after removing some rows, it is possible that the opponent team is missing when calculating the moving average
# It is needed to confirm there is no missing line, and in each two line, they should be the same match
for i in range(len(gameStat)):
    if jump:
        jump = False
        continue

    TeamName = gameStat['Team'][i]

    if i+1 < len(gameStat):
        if TeamName in gameStat.iloc[i+1].MatchUp:
            jump = True
        else:
            dropList.append(i)

gameStat.drop(dropList, axis=0, inplace=True)
gameStat.reset_index(drop=True, inplace=True)

In [27]:
# H means Home team ; A means Away team
column_list = [ 'H_Team','A_Team','Date','H_Win','A_Win',
                'H_Points','H_FGM','H_FGA','H_FGP','H_3PM','H_3PA','H_3PP','H_FTM','H_FTA','H_FTP','H_OREB','H_DREB','H_REB','H_AST','H_STL','H_BLK','H_TOV','H_PF','H_PN','H_WinRate',
                'A_Points','A_FGM','A_FGA','A_FGP','A_3PM','A_3PA','A_3PP','A_FTM','A_FTA','A_FTP','A_OREB','A_DREB','A_REB','A_AST','A_STL','A_BLK','A_TOV','A_PF','A_PN','A_WinRate']

matchDF = pd.DataFrame(columns=column_list)

# Putting two rows with same game into one row
for i in range(0, len(gameStat), 2):
    for j in range(2):
        if 'vs.' in gameStat.MatchUp[i+j]:
            home = list(gameStat.iloc[i+j])
        else:
            away = list(gameStat.iloc[i+j])
    
    match_detail = [home[0]] + [away[0]] + [home[2]] + [home[3]] + [away[3]] + home[4:] + away[4:]
    matchDF.loc[len(matchDF)] = match_detail

In [28]:
gameStat.head()

Unnamed: 0,Team,MatchUp,Date,Winning,Points,FGM,FGA,FGP,3PM,3PA,3PP,FTM,FTA,FTP,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PN,WinRate
0,TOR,TOR vs. ORL,2021-10-29,1,103.8,38.4,93.0,0.412903,10.8,33.2,0.325301,16.2,21.0,0.771429,16.0,34.2,50.2,20.4,11.8,3.8,15.2,19.0,4.8,0.4
1,ORL,ORL @ TOR,2021-10-29,0,100.8,36.0,85.2,0.422535,13.0,39.8,0.326633,15.8,20.8,0.759615,9.2,34.6,43.8,21.2,6.2,4.2,17.0,21.2,-14.2,0.2
2,IND,IND @ BKN,2021-10-29,0,113.4,42.0,90.6,0.463576,13.2,37.0,0.356757,16.2,19.8,0.818182,10.0,37.0,47.0,23.4,5.8,5.8,17.4,21.4,-3.8,0.2
3,BKN,BKN vs. IND,2021-10-29,1,102.0,37.8,87.6,0.431507,12.4,36.6,0.338798,14.0,18.0,0.777778,7.2,39.0,46.2,21.4,6.4,6.8,13.6,19.0,-6.6,0.4
4,LAL,LAL vs. CLE,2021-10-29,1,116.0,44.0,92.4,0.47619,13.0,34.2,0.380117,15.0,21.2,0.707547,9.0,35.6,44.6,24.8,8.0,6.2,16.2,21.6,-3.6,0.4


In [29]:
matchDF.head()

Unnamed: 0,H_Team,A_Team,Date,H_Win,A_Win,H_Points,H_FGM,H_FGA,H_FGP,H_3PM,H_3PA,H_3PP,H_FTM,H_FTA,H_FTP,...,A_3PA,A_3PP,A_FTM,A_FTA,A_FTP,A_OREB,A_DREB,A_REB,A_AST,A_STL,A_BLK,A_TOV,A_PF,A_PN,A_WinRate
0,TOR,ORL,2021-10-29,1,0,103.8,38.4,93.0,0.412903,10.8,33.2,0.325301,16.2,21.0,0.771429,...,39.8,0.326633,15.8,20.8,0.759615,9.2,34.6,43.8,21.2,6.2,4.2,17.0,21.2,-14.2,0.2
1,BKN,IND,2021-10-29,1,0,102.0,37.8,87.6,0.431507,12.4,36.6,0.338798,14.0,18.0,0.777778,...,37.0,0.356757,16.2,19.8,0.818182,10.0,37.0,47.0,23.4,5.8,5.8,17.4,21.4,-3.8,0.2
2,LAL,CLE,2021-10-29,1,0,116.0,44.0,92.4,0.47619,13.0,34.2,0.380117,15.0,21.2,0.707547,...,28.4,0.28169,17.8,22.4,0.794643,10.2,36.2,46.4,25.0,7.6,4.4,15.4,16.0,1.8,0.6
3,MEM,MIA,2021-10-30,0,1,114.0,42.8,96.6,0.443064,14.2,40.4,0.351485,14.2,17.6,0.806818,...,31.8,0.295597,18.4,21.8,0.844037,12.6,42.8,55.4,23.6,6.8,2.2,15.4,21.8,15.2,0.8
4,WAS,BOS,2021-10-30,1,0,112.2,41.4,90.6,0.456954,11.0,32.6,0.337423,18.4,23.0,0.8,...,43.4,0.35023,17.0,21.4,0.794393,10.4,35.8,46.2,25.2,9.2,8.2,16.0,21.8,-4.8,0.4


In [37]:
# Features that will use to construct the logistic regression model
avaliable_feature = list(matchDF.columns[5:])
target = ['H_Win']
df = matchDF[target + avaliable_feature]

# Splitting data into training and testing data
train, test = train_test_split(df, test_size=0.2)

# Picking the best features by greedy algorithm
final_feature = greedy_algo(train, avaliable_feature, target)

# By using the best features, fit into the logistic model and getting it's accuracy
accuracy, model = fit_logistic(df, final_feature, target, return_model=True)
print(f'Features used by model: {final_feature}')
print(f'Accuracy: {accuracy}')

Features used by model: ['H_WinRate', 'H_REB', 'A_PN', 'A_FGM', 'A_REB', 'H_FTP', 'A_FTP', 'A_BLK', 'A_WinRate', 'H_STL', 'H_3PA']
Accuracy: 0.6582914572864321
