In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
import gradio as gr
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the five CSV tables (looked up relative to the working directory) into DataFrames.
games_df, games_details_df, players_df, ranking_df, teams_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'games.csv',
        'games_details.csv',
        'players.csv',
        'ranking.csv',
        'teams.csv',
    )
)

### Preparing the Data

In [None]:
# Keep only the rows whose COMMENT is missing (presumably players who actually
# took the floor — TODO confirm against the dataset description).
# .copy() makes the filtered frame independent of the one it was sliced from, so
# the column assignments that follow do not raise pandas' SettingWithCopyWarning.
games_details_df = games_details_df[games_details_df['COMMENT'].isna()].copy()

# A missing PLUS_MINUS is treated as 0.
games_details_df["PLUS_MINUS"] = games_details_df['PLUS_MINUS'].fillna(0)

# Discard every game row that has at least one missing value.
games_df = games_df.dropna()

# Columns removed from games_df before it is used as model input.
game_dropped_columns = ["GAME_DATE_EST", "GAME_ID" , "GAME_STATUS_TEXT", "HOME_TEAM_ID", "VISITOR_TEAM_ID"]
# Columns removed from games_details_df before it is used as model input.
details_dropped_columns = ["GAME_ID","TEAM_ID","TEAM_ABBREVIATION","TEAM_CITY","PLAYER_ID","PLAYER_NAME","NICKNAME","START_POSITION", 'COMMENT']
# Per-player box-score columns that the model analysis iterates over as prediction targets.
stats = ['FGM','FGA','FG_PCT','FG3M','FG3A','FG3_PCT','FTM','FTA','FT_PCT','OREB','DREB','REB','AST','STL','BLK','TO','PTS','PLUS_MINUS']

In [None]:
# Distinct team identifiers, in order of first appearance in teams_df.
team_ids = teams_df.loc[:, 'TEAM_ID'].unique()

In [None]:
# Changing the minute values to floats
def min_to_float(min):
    """Convert a minutes-played value to a float number of minutes.

    Parameters
    ----------
    min : str or any
        Accepted string forms are "MM:SS" (e.g. "35:30"), "MM.ffffff:SS"
        (e.g. "12.000000:30") and a plain number such as "24".
        Non-string values (floats, NaN, None) are returned unchanged.

    Returns
    -------
    float
        Minutes with the seconds expressed as a fraction, e.g. "35:30" -> 35.5.
        For non-string input the value itself is returned.

    Raises
    ------
    ValueError
        If a string cannot be parsed as one of the accepted forms.
    """
    if not isinstance(min, str):
        return min
    if ':' not in min:
        return float(min)

    minutes_text, seconds_text = min.split(':')
    # The minutes part may carry a fractional suffix ("12.000000"); parse it as a
    # float first and keep the whole minutes only.
    whole_minutes = int(float(minutes_text))
    # Only the seconds are divided by 60; dividing the sum would turn 35:30
    # into roughly 1.08 instead of 35.5.
    return whole_minutes + int(seconds_text) / 60.0

# Replace every MIN entry with the value min_to_float returns for it.
games_details_df["MIN"] = games_details_df["MIN"].apply(min_to_float)

### Prediction Functionality

In [None]:
def get_team_id(name):
    """Return the TEAM_ID of the first teams_df row whose NICKNAME equals `name`.

    Raises IndexError when no row matches.
    """
    nickname_matches = teams_df['NICKNAME'] == name
    matching_ids = teams_df.loc[nickname_matches, 'TEAM_ID']
    return matching_ids.values[0]

In [None]:
# Creates predictions for classifier bets
def predict_classifer(X, y):
    """Fit a k-nearest-neighbours classifier on 80% of the data and predict the rest.

    Parameters
    ----------
    X : feature table, one row per sample.
    y : class labels aligned with the rows of X.

    Returns
    -------
    The labels predicted for the held-out 20% of the rows.

    Raises
    ------
    ValueError
        Propagated from scikit-learn when the data cannot be split at all
        (for example when it is empty).
    """
    split_options = dict(test_size=0.2, random_state=20)
    try:
        X_train, X_test, y_train, _ = train_test_split(X, y, stratify=y, **split_options)
    except ValueError:
        # Stratification is impossible for small or lopsided samples (e.g. a class
        # with a single member); fall back to a plain shuffle split instead of
        # failing the whole prediction.
        X_train, X_test, y_train, _ = train_test_split(X, y, **split_options)

    # KNeighborsClassifier uses 5 neighbours by default and refuses to predict when
    # fewer training rows exist, so cap the neighbour count at the training size.
    c = KNeighborsClassifier(n_neighbors=min(5, len(X_train)))
    c.fit(X_train, y_train)

    return c.predict(X_test)

In [None]:
# Creates predictions for regressor bets
def predict_regressor(X, y):
    """Fit a random-forest regressor on 80% of (X, y) and predict the held-out 20%.

    The split uses random_state=20 and the forest random_state=7, so repeated
    calls on the same data return the same predictions.
    """
    features_train, features_test, targets_train, _targets_test = train_test_split(
        X, y, test_size=0.2, random_state=20
    )
    forest = RandomForestRegressor(random_state=7).fit(features_train, targets_train)
    return forest.predict(features_test)

In [None]:
# Predicts the winner of a game given a home and an away team
def predict_winner(home, away):
    """Return whether the home team is predicted to win.

    `home` and `away` are team nicknames. The model is trained on the games_df
    rows where `home` hosted `away`; the result is True when 1 is the most
    frequent label among the classifier's held-out predictions.
    """
    home_id = get_team_id(home)
    away_id = get_team_id(away)
    is_matchup = (games_df["HOME_TEAM_ID"] == home_id) & (games_df['VISITOR_TEAM_ID'] == away_id)
    matchup_games = games_df[is_matchup].drop(columns=game_dropped_columns)

    labels = matchup_games['HOME_TEAM_WINS']
    features = matchup_games.drop(columns='HOME_TEAM_WINS')

    predicted_labels = predict_classifer(features, labels)

    # Majority vote over the predicted labels (ties resolve to the smaller label).
    most_common_label = np.bincount(predicted_labels).argmax()
    return most_common_label == 1

In [None]:
# Predicts the final score of a game given a home and away team
def predict_score(home, away):
    """Return (home_points, away_points) predicted for `home` hosting `away`.

    Both arguments are team nicknames. Each value is the floor of the mean of
    the regressor's held-out predictions for that column.
    """
    home_id = get_team_id(home)
    away_id = get_team_id(away)
    is_matchup = (games_df["HOME_TEAM_ID"] == home_id) & (games_df['VISITOR_TEAM_ID'] == away_id)
    matchup_games = games_df[is_matchup].drop(columns=game_dropped_columns)

    target_columns = ['PTS_home', 'PTS_away']
    targets = matchup_games[target_columns]
    features = matchup_games.drop(columns=target_columns)

    predictions = predict_regressor(features, targets)

    # Column 0 holds the home predictions, column 1 the away predictions.
    return tuple(math.floor(np.mean(predictions[:, column])) for column in (0, 1))

In [None]:
# Predicts a player's stat against a specific team
def predict_player_stat_against_team(player, team, stat):
    """Predict `stat` for `player` in games played against `team`.

    Parameters
    ----------
    player : str
        Value matched against games_details_df['PLAYER_NAME'].
    team : str
        Team nickname, resolved through get_team_id.
    stat : str
        Name of the games_details_df column to predict.

    Returns
    -------
    int
        Floor of the mean of the regressor's held-out predictions.

    Raises
    ------
    ValueError
        If no game of `player` against `team` is found.
    IndexError
        Propagated from get_team_id when the nickname is unknown.
    """
    team_id = get_team_id(team)  # resolved once rather than twice per game
    games_played_by_player = games_details_df[games_details_df['PLAYER_NAME'] == player]

    # IDs of all games the team took part in, home or away. Selecting them with a
    # mask avoids scanning games_df once per game, and game IDs that are absent
    # from games_df (rows removed by dropna) are skipped instead of raising
    # IndexError.
    team_involved = (games_df['HOME_TEAM_ID'] == team_id) | (games_df['VISITOR_TEAM_ID'] == team_id)
    team_game_ids = games_df.loc[team_involved, 'GAME_ID']

    games_against_team = games_played_by_player[
        games_played_by_player['GAME_ID'].isin(team_game_ids)
        # Rows where the player's own TEAM_ID is the team are games played *for*
        # it, not against it.
        & (games_played_by_player['TEAM_ID'] != team_id)
    ]
    if games_against_team.empty:
        raise ValueError(f"No games found for player {player!r} against team {team!r}")

    X = games_against_team.drop(columns=details_dropped_columns)
    y = X[stat]
    X = X.drop(columns=stat)

    predictions = predict_regressor(X, y)

    return math.floor(np.mean(predictions))

### Betting Functionality

In [None]:
# Returns if a bet involving a number is correct
def betting_number(pred, bet, is_betting_over):
    """Tell whether an over/under bet matches the prediction.

    An "over" bet is correct when `bet` is strictly below `pred`; an "under"
    bet is correct when `bet` is strictly above `pred`. A bet equal to the
    prediction is never correct.
    """
    if is_betting_over:
        return bet < pred
    return bet > pred

In [None]:
# Return if the correct winning team was bet
def make_win_bet(home, away, is_home_winning):
    """Return whether the bet on the home team winning agrees with predict_winner."""
    return is_home_winning == predict_winner(home, away)

In [None]:
# Returns if a bet on a final score is correct
def make_score_bet(home, away, bet, is_betting_home, is_betting_over):
    """Check an over/under bet on one team's final score.

    Returns a pair: whether the bet is correct, and the predicted score of the
    team that was bet on (home when `is_betting_home` is truthy, otherwise away).
    """
    home_pred, away_pred = predict_score(home, away)
    chosen_pred = home_pred if is_betting_home else away_pred
    return betting_number(chosen_pred, bet, is_betting_over), chosen_pred

In [None]:
# Returns if a bet on a player stat is correct
def make_player_stat_bet(player, stat, bet, opponent, is_beating_over):
    """Check an over/under bet on a player's stat against an opponent.

    Returns a pair: whether the bet is correct, and the predicted stat value.
    """
    predicted_stat = predict_player_stat_against_team(player, opponent, stat)
    bet_is_correct = betting_number(predicted_stat, bet, is_beating_over)
    return bet_is_correct, predicted_stat

In [None]:
# Displays interface for bets on winning team:
# two team-name text boxes and a checkbox are passed to make_win_bet.
win_iface = gr.Interface(
    fn=make_win_bet,
    inputs=[
        *(gr.Textbox("", label=side) for side in ("Home Team", "Away Team")),
        gr.Checkbox(label="Is Home Winning?"),
    ],
    outputs=gr.Textbox(label='Correct'),
    title="Winning Bet Predictor",
    theme="compact",
)

win_iface.launch()

In [1]:
# Displays interface for bets on final score:
# the five inputs map, in order, onto the parameters of make_score_bet.
score_iface = gr.Interface(
    fn=make_score_bet,
    inputs=[
        *(gr.Textbox("", label=side) for side in ("Home Team", "Away Team")),
        gr.Number(label='Bet'),
        gr.Checkbox(label="Betting home team?"),
        gr.Checkbox(label='Betting Over?'),
    ],
    outputs=[gr.Textbox(label=caption) for caption in ('Correct', 'Prediction')],
    title="Score Bet Predictor",
    theme="compact",
)

score_iface.launch()

NameError: name 'gr' is not defined

In [None]:
# Displays interface for making player stat bets:
# the five inputs map, in order, onto the parameters of make_player_stat_bet.
stat_iface = gr.Interface(
    fn=make_player_stat_bet,
    inputs=[
        *(gr.Textbox("", label=caption) for caption in ("Player", 'Stat')),
        gr.Number(label='Bet'),
        gr.Textbox("", label='Opponent'),
        gr.Checkbox(label='Betting Over?'),
    ],
    outputs=[gr.Textbox(label=caption) for caption in ('Correct', 'Prediction')],
    title="Stat Bet Predictor",
    theme="compact",
)

stat_iface.launch()

### Data Visualization

In [None]:
def team_id_to_name(id):
    """Return the NICKNAME of the first teams_df row whose TEAM_ID equals `id`.

    Raises IndexError when no row matches.
    """
    id_matches = teams_df['TEAM_ID'] == id
    return teams_df.loc[id_matches, 'NICKNAME'].values[0]

# Bar chart of how many games_df rows list each team as the home team.
home_team_counts = games_df['HOME_TEAM_ID'].value_counts()

# Relabel the index from team IDs to nicknames for readable tick labels.
home_team_names = [team_id_to_name(team_id) for team_id in home_team_counts.index]

home_team_counts = home_team_counts.set_axis(home_team_names)
home_team_counts.plot.bar()
plt.ylabel('Games')
plt.title('Home Games Played Per Team')

In [None]:
# Bar chart of how many games_df rows list each team as the visiting team.
away_team_counts = games_df['VISITOR_TEAM_ID'].value_counts()

# Relabel the index from team IDs to nicknames for readable tick labels.
away_team_names = [team_id_to_name(team_id) for team_id in away_team_counts.index]

away_team_counts = away_team_counts.set_axis(away_team_names)
away_team_counts.plot.bar()
plt.ylabel('Games')
plt.title('Away Games Played Per Team')

In [None]:
# Overlaid histograms of the home and away point totals.
labels = ['Home Scores', 'Away Scores']
scores = [games_df[column] for column in ('PTS_home', 'PTS_away')]
plt.hist(scores, label=labels, bins=20)
plt.xlabel('Points')
plt.ylabel('Frequency')
plt.title('Final Score in Games')
plt.legend()

In [None]:
# Bar chart of the ten player names with the most rows in games_details_df
# (value_counts sorts by count, largest first).
player_counts = games_details_df['PLAYER_NAME'].value_counts()

player_counts.iloc[:10].plot.bar()
plt.ylabel('Games')
plt.title('Top 10 Most Played Games by Players')

In [None]:
# Histogram of the per-row PTS values in games_details_df.
points = games_details_df.loc[:, 'PTS']
plt.hist(points, bins=20)
plt.xlabel('Points')
plt.ylabel('Frequency')
plt.title('Points Scored in Games')

In [None]:
# Histogram of the per-row AST values in games_details_df.
assits = games_details_df.loc[:, 'AST']
plt.hist(assits, bins=20)
plt.xlabel('Assists')
plt.ylabel('Frequency')
plt.title('Assists in Games')

In [None]:
# Histogram of the per-row REB values in games_details_df.
rebs = games_details_df['REB']
plt.hist(rebs, bins=20)
plt.ylabel('Frequency')
plt.xlabel('Rebounds')  # axis label previously misspelled as 'Reboounds'
plt.title('Rebounds in Games')

In [None]:
# Bar chart of total wins per team, largest first.

# Start every team listed in teams_df at zero so teams without a win still appear.
wins_per_team = {team_id_to_name(team_id): 0 for team_id in teams_df['TEAM_ID']}

# The winner of a game is the home team when HOME_TEAM_WINS == 1, otherwise the
# visitor. Counting winner IDs in one vectorized pass replaces the row-by-row
# loop and needs one name lookup per team instead of one per game.
home_won = games_df['HOME_TEAM_WINS'] == 1
winner_ids = games_df['HOME_TEAM_ID'].where(home_won, games_df['VISITOR_TEAM_ID'])
for winner_id, win_count in winner_ids.value_counts().items():
    wins_per_team[team_id_to_name(winner_id)] += int(win_count)

# sorted() is stable, so teams with equal wins keep their teams_df order.
wins_per_team = dict(sorted(wins_per_team.items(), key=lambda item: item[1], reverse=True))
plt.bar(x=list(wins_per_team.keys()), height=list(wins_per_team.values()))
plt.title('Wins Per Team')
plt.ylabel('Wins')
plt.xticks(rotation=90)

### Model Analysis

In [None]:
# Compare Random Forest, Decision Tree and K-Nearest Neighbor classifiers on
# predicting HOME_TEAM_WINS, one model fit per ordered (home, away) pairing.
# NOTE(review): the feature table still contains PTS_home / PTS_away, which
# presumably determine the win flag directly — confirm this is intended.

rfc_scores = []
dtc_scores = []
knnc_scores = []

for home_team in team_ids:
    for away_team in team_ids:
        if home_team == away_team:
            continue  # skip pairing a team with itself

        home_vs_away = games_df[(games_df['HOME_TEAM_ID'] == home_team) & (games_df['VISITOR_TEAM_ID'] == away_team)]
        X = home_vs_away.drop(columns=game_dropped_columns + ['HOME_TEAM_WINS'])
        y = home_vs_away['HOME_TEAM_WINS']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=20, stratify=y
        )

        rfc = RandomForestClassifier(random_state=20).fit(X_train, y_train)
        rfc_scores.append(accuracy_score(y_true=y_test, y_pred=rfc.predict(X_test)))

        dtc = DecisionTreeClassifier(random_state=20).fit(X_train, y_train)
        dtc_scores.append(accuracy_score(y_true=y_test, y_pred=dtc.predict(X_test)))

        knnc = KNeighborsClassifier().fit(X_train, y_train)
        knnc_scores.append(accuracy_score(y_true=y_test, y_pred=knnc.predict(X_test)))

# Mean accuracy of each model over all pairings.
print({'RFC Score' : np.mean(rfc_scores), 'DTC Score' : np.mean(dtc_scores), 'KNNC Score' : np.mean(knnc_scores)})

In [None]:
# Compare Random Forest, Decision Tree and K-Nearest Neighbor regressors on
# predicting PTS_home and PTS_away, one model fit per ordered (home, away) pairing.

rfr_scores = []
dtr_scores = []
knnr_scores = []

for home_team in team_ids:
    for away_team in team_ids:
        if home_team == away_team:
            continue  # skip pairing a team with itself

        home_vs_away = games_df[(games_df['HOME_TEAM_ID'] == home_team) & (games_df['VISITOR_TEAM_ID'] == away_team)]
        X = home_vs_away.drop(columns=game_dropped_columns + ['PTS_home', 'PTS_away'])
        y = home_vs_away[['PTS_home', 'PTS_away']]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=20
        )

        rfr = RandomForestRegressor(random_state=20).fit(X_train, y_train)
        rfr_scores.append(r2_score(y_true=y_test, y_pred=rfr.predict(X_test)))

        dtr = DecisionTreeRegressor(random_state=20).fit(X_train, y_train)
        dtr_scores.append(r2_score(y_true=y_test, y_pred=dtr.predict(X_test)))

        knnr = KNeighborsRegressor().fit(X_train, y_train)
        knnr_scores.append(r2_score(y_true=y_test, y_pred=knnr.predict(X_test)))

# Mean R^2 of each model over all pairings.
print({'RFR Score' : np.mean(rfr_scores), 'DTR Score' : np.mean(dtr_scores), 'KNNR Score' : np.mean(knnr_scores)})

In [None]:
# Histogram of the R^2 values currently held in rfr_scores.
plt.hist(rfr_scores, bins=30)
# One tick per integer across the observed range of scores.
plt.xticks(np.arange(math.ceil(min(rfr_scores)), math.ceil(max(rfr_scores)+1), 1), rotation=90)
plt.title('Distribution of R^2 Scores Predicting Final Score')  # title previously misspelled 'Distrubution'
plt.ylabel('Frequency')
plt.xlabel('R^2')

In [None]:
# Compare Random Forest, Decision Tree and K-Nearest Neighbor regressors on
# predicting each column in `stats` from the remaining columns, for the ten
# PLAYER_IDs with the most rows in games_details_df.

rfr_scores = []
dtr_scores = []
knnr_scores = []

player_ids = games_details_df['PLAYER_ID'].value_counts().head(10).index
for player in player_ids:
    games_played_by_player = games_details_df[games_details_df['PLAYER_ID'] == player]
    for stat in stats:
        # Target is the current stat; features are every other kept column.
        X = games_played_by_player.drop(columns=details_dropped_columns + [stat])
        y = games_played_by_player[stat]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=20
        )

        rfr = RandomForestRegressor(random_state=20).fit(X_train, y_train)
        rfr_scores.append(r2_score(y_true=y_test, y_pred=rfr.predict(X_test)))

        dtr = DecisionTreeRegressor(random_state=20).fit(X_train, y_train)
        dtr_scores.append(r2_score(y_true=y_test, y_pred=dtr.predict(X_test)))

        knnr = KNeighborsRegressor().fit(X_train, y_train)
        knnr_scores.append(r2_score(y_true=y_test, y_pred=knnr.predict(X_test)))

# Mean R^2 of each model over all (player, stat) combinations.
print({'RFR Score' : np.mean(rfr_scores), 'DTR Score' : np.mean(dtr_scores), 'KNNR Score' : np.mean(knnr_scores)})