In [127]:
# Importation des bibliothèques
import numpy as np
import pandas as pd

# Importation de l'API
from nba_api.stats.endpoints import leaguegamefinder

In [128]:
# Récupération du jeu de données
path = 'nba odds 2021-22.xlsx'
odds_df = pd.read_excel(path, usecols = ['Date', 'Team', 'ML'])
odds_df.tail()

Unnamed: 0,Date,Team,ML
2641,610,Boston,-165
2642,613,Boston,145
2643,613,Golden State,-165
2644,616,Golden State,155
2645,616,Boston,-175


In [129]:
# Suppression des espaces dans les noms des équipes
odds_df.Team = odds_df.Team.str.replace(' ', '')
odds_df.tail()

Unnamed: 0,Date,Team,ML
2641,610,Boston,-165
2642,613,Boston,145
2643,613,GoldenState,-165
2644,616,GoldenState,155
2645,616,Boston,-175


In [130]:
# Création d'un dictionnaire pour remplacer les équipes par leurs noms officiels
team_name = {'Detroit': 'DetroitPistons', 'Washington': 'WashingtonWizards',
            'Dallas': 'DallasMavericks', 'Phoenix': 'PhoenixSuns',
            'NewOrleans': 'NewOrleansPelicans', 'LAClippers': 'LAClippers',
            'OklahomaCity': 'OklahomaCityThunder', 'GoldenState': 'GoldenStateWarriors',
            'Philadelphia': 'Philadelphia76ers', 'Indiana': 'IndianaPacers',
            'Miami': 'MiamiHeat', 'Toronto': 'TorontoRaptors',
            'Orlando': 'OrlandoMagic', 'NewYork': 'NewYorkKnicks',
            'Boston': 'BostonCeltics', 'Chicago': 'ChicagoBulls',
            'SanAntonio': 'SanAntonioSpurs', 'Portland': 'PortlandTrailBlazers',
            'Denver': 'DenverNuggets', 'Memphis': 'MemphisGrizzlies',
            'Brooklyn': 'BrooklynNets', 'Houston': 'HoustonRockets',
            'Utah': 'UtahJazz', 'Minnesota':'MinnesotaTimberwolves',
            'LALakers': 'LosAngelesLakers', 'Atlanta': 'AtlantaHawks',
            'Charlotte': 'CharlotteHornets', 'Cleveland': 'ClevelandCavaliers',
            'Sacramento': 'SacramentoKings', 'Milwaukee': 'MilwaukeeBucks'}
odds_df.replace({'Team': team_name}, inplace = True)

In [131]:
def format(date):
    'Fonction pour changer le format de l\'heure de notre dataframe'
    'Elle prend notre dataframe en entrée'
    'Elle nous retourne une date qui respecte le format AAAAMMJJ'
    
    year = path.split(' ')[2].split('.')[0].split('-')[0]
    
    if odds_df['Date'][0] <= date:
        return year + str(date)
    else:
        return str(int(year) + 1) + '0' + str(date)

In [132]:
# Formattage des données pour la création de notre identifiant
odds_df['Date'] = list(map(format, odds_df['Date']))
odds_df['ID'] = list(map(str, odds_df['Date'])) + odds_df['Team']
odds_df.tail()

Unnamed: 0,Date,Team,ML,ID
2641,20220610,BostonCeltics,-165,20220610BostonCeltics
2642,20220613,BostonCeltics,145,20220613BostonCeltics
2643,20220613,GoldenStateWarriors,-165,20220613GoldenStateWarriors
2644,20220616,GoldenStateWarriors,155,20220616GoldenStateWarriors
2645,20220616,BostonCeltics,-175,20220616BostonCeltics


In [133]:
def change_odds(x):
    'Fonction pour changer le format des cotes de notre dataframe'
    'Elle prend une cote américaine en entrée'
    'Elle nous retourne une cote qui correspond à la cote française'

    return round(1 + x / 100 , 2) if x > 0 else round(1 + 100 / np.abs(x),2)

In [134]:
# Modification du format des cotes
odds_df['ML'] = odds_df['ML'].astype(int)
odds_df['ODDS'] = odds_df['ML'].apply(lambda x : change_odds(x))
odds_df.head()

Unnamed: 0,Date,Team,ML,ID,ODDS
0,20211019,BrooklynNets,105,20211019BrooklynNets,2.05
1,20211019,MilwaukeeBucks,-125,20211019MilwaukeeBucks,1.8
2,20211019,GoldenStateWarriors,140,20211019GoldenStateWarriors,2.4
3,20211019,LosAngelesLakers,-160,20211019LosAngelesLakers,1.62
4,20211020,IndianaPacers,-125,20211020IndianaPacers,1.8


In [135]:
# Récupération du deuxième jeu de données
score_df = leaguegamefinder.LeagueGameFinder().get_data_frames()[0]
score_df.drop(['SEASON_ID', 'TEAM_ABBREVIATION', 'MIN'], axis = 1, inplace = True)
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PTS,FGM,FGA,FG_PCT,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,1612709902,Santa Cruz Warriors,2022200241,2023-02-07,SCW @ WIS,W,122,43,93,0.462,...,0.917,18,42,60,27,2,8,22,15,18.0
1,1612709921,Long Island Nets,2022200240,2023-02-07,LIN vs. SXF,W,112,44,92,0.478,...,0.833,12,36,48,25,11,5,14,22,17.6
2,1612709931,Mexico City Capitanes,2022200239,2023-02-07,MXC @ IWA,W,114,45,94,0.479,...,0.545,16,34,50,29,13,4,21,22,0.0
3,1610612737,Atlanta Hawks,22200818,2023-02-07,ATL @ NOP,L,107,38,85,0.447,...,0.76,10,31,41,30,4,6,11,21,-9.0
4,1610612760,Oklahoma City Thunder,22200821,2023-02-07,OKC @ LAL,W,133,49,94,0.521,...,0.9,7,30,37,32,12,2,12,18,3.0


In [136]:
# Suppression des tirets dans les dates et des espaces dans les noms des équipes
score_df.GAME_DATE = score_df.GAME_DATE.str.replace('-', '')
score_df.TEAM_NAME = score_df.TEAM_NAME.str.replace(' ', '')
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PTS,FGM,FGA,FG_PCT,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,1612709902,SantaCruzWarriors,2022200241,20230207,SCW @ WIS,W,122,43,93,0.462,...,0.917,18,42,60,27,2,8,22,15,18.0
1,1612709921,LongIslandNets,2022200240,20230207,LIN vs. SXF,W,112,44,92,0.478,...,0.833,12,36,48,25,11,5,14,22,17.6
2,1612709931,MexicoCityCapitanes,2022200239,20230207,MXC @ IWA,W,114,45,94,0.479,...,0.545,16,34,50,29,13,4,21,22,0.0
3,1610612737,AtlantaHawks,22200818,20230207,ATL @ NOP,L,107,38,85,0.447,...,0.76,10,31,41,30,4,6,11,21,-9.0
4,1610612760,OklahomaCityThunder,22200821,20230207,OKC @ LAL,W,133,49,94,0.521,...,0.9,7,30,37,32,12,2,12,18,3.0


In [137]:
# Suppression de la colonne MATCHUP et création des colonnes HOME et AWAY
score_df['HOME'] = [1 if '@' in score_df.MATCHUP[index] else 0 for index, row in score_df.iterrows()]
score_df.drop(['MATCHUP'], axis = 1, inplace = True)
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,WL,PTS,FGM,FGA,FG_PCT,FG3M,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,HOME
0,1612709902,SantaCruzWarriors,2022200241,20230207,W,122,43,93,0.462,18,...,18,42,60,27,2,8,22,15,18.0,1
1,1612709921,LongIslandNets,2022200240,20230207,W,112,44,92,0.478,16,...,12,36,48,25,11,5,14,22,17.6,0
2,1612709931,MexicoCityCapitanes,2022200239,20230207,W,114,45,94,0.479,14,...,16,34,50,29,13,4,21,22,0.0,1
3,1610612737,AtlantaHawks,22200818,20230207,L,107,38,85,0.447,12,...,10,31,41,30,4,6,11,21,-9.0,1
4,1610612760,OklahomaCityThunder,22200821,20230207,W,133,49,94,0.521,17,...,7,30,37,32,12,2,12,18,3.0,1


In [138]:
# Suppression de la colonne WL et création des colonnes W 
score_df['WL'] = score_df['WL'].astype(str)
score_df['WIN'] = [1 if 'W' in score_df.WL[index] else 0 for index, row in score_df.iterrows()]
score_df.drop(['WL'], axis = 1, inplace = True)
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,HOME,WIN
0,1612709902,SantaCruzWarriors,2022200241,20230207,122,43,93,0.462,18,45,...,42,60,27,2,8,22,15,18.0,1,1
1,1612709921,LongIslandNets,2022200240,20230207,112,44,92,0.478,16,42,...,36,48,25,11,5,14,22,17.6,0,1
2,1612709931,MexicoCityCapitanes,2022200239,20230207,114,45,94,0.479,14,35,...,34,50,29,13,4,21,22,0.0,1,1
3,1610612737,AtlantaHawks,22200818,20230207,107,38,85,0.447,12,36,...,31,41,30,4,6,11,21,-9.0,1,0
4,1610612760,OklahomaCityThunder,22200821,20230207,133,49,94,0.521,17,34,...,30,37,32,12,2,12,18,3.0,1,1


In [139]:
# On supprime toutes les lignes contenant une équipe qui ne fait pas partie des 30 équipes de la NBA 
teams = ['AtlantaHawks','BrooklynNets','BostonCeltics', 'CharlotteHornets', 'ChicagoBulls', 'ClevelandCavaliers', 
'DallasMavericks', 'DenverNuggets', 'DetroitPistons', 'GoldenStateWarriors', 'HoustonRockets', 'IndianaPacers', 
'LAClippers', 'LosAngelesLakers', 'MemphisGrizzlies', 'MiamiHeat', 'MilwaukeeBucks', 'MinnesotaTimberwolves',
'NewOrleansPelicans', 'NewYorkKnicks', 'OklahomaCityThunder', 'OrlandoMagic', 'Philadelphia76ers', 'PhoenixSuns', 
'PortlandTrailBlazers', 'SacramentoKings', 'SanAntonioSpurs', 'TorontoRaptors', 'UtahJazz', 'WashingtonWizards']

team_serie = pd.Series(score_df["TEAM_NAME"])
score_df = score_df[team_serie.str.contains('|'.join(teams))]

In [140]:
# On supprime toutes les lignes qui possède un GAME_ID unique
game_id_counts = score_df["GAME_ID"].value_counts()
single_occurrence_game_ids = game_id_counts[game_id_counts == 1].index

score_df = score_df[~score_df["GAME_ID"].isin(single_occurrence_game_ids)]
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,HOME,WIN
3,1610612737,AtlantaHawks,22200818,20230207,107,38,85,0.447,12,36,...,31,41,30,4,6,11,21,-9.0,1,0
4,1610612760,OklahomaCityThunder,22200821,20230207,133,49,94,0.521,17,34,...,30,37,32,12,2,12,18,3.0,1,1
5,1610612763,MemphisGrizzlies,22200819,20230207,104,39,81,0.481,11,33,...,34,39,22,10,7,13,21,15.0,0,1
7,1610612747,LosAngelesLakers,22200821,20230207,130,49,85,0.576,14,31,...,35,41,27,8,3,20,19,-3.0,0,0
8,1610612743,DenverNuggets,22200820,20230207,146,58,93,0.624,13,27,...,38,44,44,12,5,10,21,34.0,0,1


In [141]:
# Création d'un indice d'efficaicité lors des matchs et suppression des colonnes inutiles
score_df['EFF'] = (score_df['PTS'] + score_df['REB'] + score_df['AST'] + score_df['STL'] + score_df['BLK'] - ((score_df['FGA'] - score_df['FGM'])+(score_df['FTA']-score_df['FTM'])+score_df['TOV']))
score_df.drop(['PTS', 'REB', 'AST', 'STL', 'BLK', 'FGA', 'FGM', 'FTA', 'FTM', 'TOV', 'FG3M', 'FG3A', 'OREB', 'DREB'], axis = 1, inplace = True)
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,FG_PCT,FG3_PCT,FT_PCT,PF,PLUS_MINUS,HOME,WIN,EFF
3,1610612737,AtlantaHawks,22200818,20230207,0.447,0.333,0.76,21,-9.0,1,0,124
4,1610612760,OklahomaCityThunder,22200821,20230207,0.521,0.5,0.9,18,3.0,1,1,157
5,1610612763,MemphisGrizzlies,22200819,20230207,0.481,0.333,0.714,21,15.0,0,1,121
7,1610612747,LosAngelesLakers,22200821,20230207,0.576,0.452,0.692,19,-3.0,0,0,145
8,1610612743,DenverNuggets,22200820,20230207,0.624,0.481,0.773,21,34.0,0,1,201


In [142]:
# Création de notre identifiant pour le merge de nos deux dataframes
score_df['ID'] = score_df['GAME_DATE'] + score_df['TEAM_NAME']
score_df.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,FG_PCT,FG3_PCT,FT_PCT,PF,PLUS_MINUS,HOME,WIN,EFF,ID
3,1610612737,AtlantaHawks,22200818,20230207,0.447,0.333,0.76,21,-9.0,1,0,124,20230207AtlantaHawks
4,1610612760,OklahomaCityThunder,22200821,20230207,0.521,0.5,0.9,18,3.0,1,1,157,20230207OklahomaCityThunder
5,1610612763,MemphisGrizzlies,22200819,20230207,0.481,0.333,0.714,21,15.0,0,1,121,20230207MemphisGrizzlies
7,1610612747,LosAngelesLakers,22200821,20230207,0.576,0.452,0.692,19,-3.0,0,0,145,20230207LosAngelesLakers
8,1610612743,DenverNuggets,22200820,20230207,0.624,0.481,0.773,21,34.0,0,1,201,20230207DenverNuggets


In [143]:
# Merge des deux dataframes
df_merge = score_df.merge(odds_df, left_on = 'ID', right_on='ID')
df_merge.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,FG_PCT,FG3_PCT,FT_PCT,PF,PLUS_MINUS,HOME,WIN,EFF,ID,Date,Team,ML,ODDS
0,1610612738,BostonCeltics,42100406,20220616,0.425,0.393,0.917,16,-13.0,0,0,105,20220616BostonCeltics,20220616,BostonCeltics,-175,1.57
1,1610612744,GoldenStateWarriors,42100406,20220616,0.413,0.413,1.0,20,13.0,1,1,125,20220616GoldenStateWarriors,20220616,GoldenStateWarriors,155,2.55
2,1610612744,GoldenStateWarriors,42100405,20220613,0.466,0.225,0.867,28,10.0,0,1,122,20220613GoldenStateWarriors,20220613,GoldenStateWarriors,-165,1.61
3,1610612738,BostonCeltics,42100405,20220613,0.413,0.344,0.677,16,-10.0,1,0,91,20220613BostonCeltics,20220613,BostonCeltics,145,2.45
4,1610612744,GoldenStateWarriors,42100404,20220610,0.44,0.349,0.8,21,10.0,1,1,127,20220610GoldenStateWarriors,20220610,GoldenStateWarriors,145,2.45


In [144]:
# Suppression des dernières colonnes inutiles
df_merge.drop(['PF', 'ID', 'Date', 'Team', 'ML'], axis = 1, inplace = True)
df_merge.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,FG_PCT,FG3_PCT,FT_PCT,PLUS_MINUS,HOME,WIN,EFF,ODDS
0,1610612738,BostonCeltics,42100406,20220616,0.425,0.393,0.917,-13.0,0,0,105,1.57
1,1610612744,GoldenStateWarriors,42100406,20220616,0.413,0.413,1.0,13.0,1,1,125,2.55
2,1610612744,GoldenStateWarriors,42100405,20220613,0.466,0.225,0.867,10.0,0,1,122,1.61
3,1610612738,BostonCeltics,42100405,20220613,0.413,0.344,0.677,-10.0,1,0,91,2.45
4,1610612744,GoldenStateWarriors,42100404,20220610,0.44,0.349,0.8,10.0,1,1,127,2.45


In [190]:
def mean_stat(team_name, data, numberofgame):
    dataperteam = data.loc[data['TEAM_NAME'] == team_name].copy()
    dataperteam = dataperteam.iloc[1:numberofgame+1]
    columns_to_average = ["FG_PCT", "FG3_PCT", "FT_PCT", "PLUS_MINUS", "EFF"]
    mean = dataperteam[columns_to_average].mean()
    dataperteam.iloc[0, dataperteam.columns.isin(columns_to_average)] = mean
    return dataperteam



In [191]:
mean_stat("GoldenStateWarriors",df_merge,4)


Unnamed: 0,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,FG_PCT,FG3_PCT,FT_PCT,PLUS_MINUS,HOME,WIN,EFF,ODDS
2,1610612744,GoldenStateWarriors,42100405,20220613,0.45275,0.34275,0.81675,-2.0,0,1,118,1.61
4,1610612744,GoldenStateWarriors,42100404,20220610,0.44,0.349,0.8,10.0,1,1,127,2.45
6,1610612744,GoldenStateWarriors,42100403,20220608,0.462,0.375,0.867,-16.0,1,0,105,2.35
8,1610612744,GoldenStateWarriors,42100401,20220602,0.443,0.422,0.733,-12.0,0,0,118,1.61


In [147]:
# Sauvegarde de notre dataframe
df_merge.to_csv('preprocessed_data.csv', index = False)