# ML project: prediction

In this machine learning project, our goal is to predict who is going to be the MVP, and the winner, of the NBA season.

## Part 1: Web scraping NBA stats
### 1-Downloading MVP votes with requests 

In [9]:
import os
from io import StringIO

import requests
# Seasons to scrape: 1991 through 2021 (the upper bound of range() is exclusive).
years = [*range(1991, 2022)]

# URL template for one season's awards page; the year is filled in with str.format.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [10]:
# Download the awards page of every season and cache the raw HTML as
# mvp/<year>.html so that the parsing step can work from local files.
os.makedirs("mvp", exist_ok=True)  # the target directory may not exist yet
for year in years:
    url = url_start.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of caching an error page that would
    # only surface later as a confusing parsing failure.
    data.raise_for_status()
    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding (player names may contain non-ASCII characters).
    with open("mvp/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

### 2- Parsing the votes table with BeautifulSoup

In [11]:
from bs4 import BeautifulSoup
import pandas as pd

# Parse every cached awards page into a DataFrame of MVP voting results.
dfs = []
for year in years:
    # Decode as UTF-8 explicitly so reading does not depend on the platform locale.
    with open("mvp/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove the first "over_header" row of the page so that the table is
    # read with a single header row.
    over_header = soup.find("tr", class_="over_header")
    if over_header is not None:
        over_header.decompose()

    # find() yields the element itself; find_all() would yield a list whose
    # string form is wrapped in brackets.
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        raise ValueError("no element with id 'mvp' found in mvp/{}.html".format(year))

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year
    dfs.append(mvp)

# dfs is now a list with one DataFrame per season; show the first one.
dfs[0]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991
5,6,Clyde Drexler,28,POR,1.0,75.0,960,0.078,82,34.8,...,6.7,6.0,1.8,0.7,0.482,0.319,0.794,12.4,0.209,1991
6,7,Kevin Johnson,24,PHO,0.0,32.0,960,0.033,77,36.0,...,3.5,10.1,2.1,0.1,0.516,0.205,0.843,12.7,0.22,1991
7,8,Dominique Wilkins,31,ATL,0.0,29.0,960,0.03,81,38.0,...,9.0,3.3,1.5,0.8,0.47,0.341,0.829,11.4,0.177,1991
8,9T,Larry Bird,34,BOS,0.0,25.0,960,0.026,60,38.0,...,8.5,7.2,1.8,1.0,0.454,0.389,0.891,6.6,0.14,1991
9,9T,Terry Porter,27,POR,0.0,25.0,960,0.026,81,32.9,...,3.5,8.0,2.0,0.1,0.515,0.415,0.823,13.0,0.235,1991


### 3- Combining MVP votes with pandas

In [12]:
# Stack the per-season voting tables into one frame; each season keeps its
# own row labels, so the index is not unique across seasons.
mvps = pd.concat(objs=dfs)
# Preview the last five rows of the combined table.
mvps.tail(n=5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.3,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021
14,13T,Kawhi Leonard,29,LAC,0.0,1.0,1010,0.001,52,34.1,...,6.5,5.2,1.6,0.4,0.512,0.398,0.885,8.8,0.238,2021


In [13]:
# Persist the combined MVP voting table as CSV (row index included).
mvps.to_csv(path_or_buf="mvps.csv")

Now, to better train our model, we should use the stats of all the players for each of these years.

### 4- Downloading players stats

In [40]:
# Fetch one season of per-game player stats with plain requests and cache it.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
year = 1991
url = player_stats_url.format(year)
# A timeout keeps a stalled connection from hanging the notebook forever.
data = requests.get(url, timeout=30)
# Stop on an HTTP error rather than saving an error page to disk.
data.raise_for_status()
os.makedirs("player", exist_ok=True)  # the target directory may not exist yet
# The file name is derived from `year` (same path as before for 1991), and the
# encoding is explicit so the output does not depend on the platform default.
with open("player/{}.html".format(year), "w", encoding="utf-8") as f:
    f.write(data.text)

We are not getting all the data, because the real page uses JavaScript to fetch the remaining data.

### 5- Using Selenium to scrape a JavaScript page


In [41]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this on another machine; the previously hard-coded path is
# kept as the fallback so the existing setup keeps working unchanged.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "/Users/biad/opt/chromedriver/chromedriver"
)
service = Service(executable_path=chromedriver_path)
# Start a Chrome session that is reused by the scraping cells below.
driver = webdriver.Chrome(service=service)

In [42]:
import time 
# Open the per-game stats page in the Selenium-driven browser.
driver.get(url)
# Scroll down the page, then wait two seconds before reading the document.
scroll_script = "window.scrollTo(1,10000)"
driver.execute_script(scroll_script)
time.sleep(2)
# Page source as currently held by the browser: all the stats for one year.
html = driver.page_source

In [43]:
# Cache the browser-rendered page. The ".html" extension was missing here,
# which produced "player/<year>" instead of the "player/<year>.html" file
# that the other cells write and read.
os.makedirs("player", exist_ok=True)  # the target directory may not exist yet
# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open("player/{}.html".format(year), "w", encoding="utf-8") as f:
    f.write(html)

In [45]:
# Repeat the browser-based download for every season and cache each page as
# player/<year>.html.
os.makedirs("player", exist_ok=True)  # the target directory may not exist yet
for year in years:
    url = player_stats_url.format(year)
    driver.get(url)
    # Scroll down and pause for two seconds before grabbing the page source.
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)
    # All the stats for one year, as rendered by the browser.
    html = driver.page_source
    # Explicit UTF-8: with the platform default encoding, non-ASCII player
    # names can fail to encode on some systems.
    with open("player/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(html)
    

### 6- Parsing the stats with BeautifulSoup

In [48]:

# Parse every cached per-game page into a DataFrame of player stats.
stats = []
for year in years:
    # Read the file first and close it before parsing; decode as UTF-8
    # explicitly so reading does not depend on the platform locale.
    with open("player/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Only the first row with class "thead" in the page is removed, exactly
    # as before. Any further rows with that class stay in the table and have
    # to be filtered out downstream.
    header_row = soup.find("tr", class_="thead")
    if header_row is not None:
        header_row.decompose()

    player_stats = soup.find(id="per_game_stats")
    if player_stats is None:
        raise ValueError(
            "no element with id 'per_game_stats' found in player/{}.html".format(year)
        )

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    players = pd.read_html(StringIO(str(player_stats)))[0]
    players["Year"] = year
    stats.append(players)

# Preview the first rows of the first season.
stats[0].head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


### 7- Combining player stats with Pandas



In [50]:
# Stack the per-season player tables into one frame, then report its
# (rows, columns) shape.
players = pd.concat(objs=stats)
players.shape

(18044, 31)

In [51]:
# Persist the combined player stats as CSV (row index included).
players.to_csv(path_or_buf="players.csv")

### 8- Download team data

In [53]:
# Download the standings page of every season and cache the raw HTML as
# team/<year>.html.
team_stat_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"
os.makedirs("team", exist_ok=True)  # the target directory may not exist yet
for year in years:
    url = team_stat_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Stop on an HTTP error rather than caching an error page.
    data.raise_for_status()
    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open("team/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

### 9- Parsing the team data with BeautifulSoup and combining with pandas

In [58]:
# Parse the cached standings pages: two tables per season (Eastern and
# Western), each appended to dfs with a "Year" and a unified "Team" column.
dfs = []
for year in years:
    # Decode as UTF-8 explicitly so reading does not depend on the platform locale.
    with open("team/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # One parse serves both tables. The previous version re-parsed the page
    # for the second table and removed the first "thead" row again, which on
    # a fresh parse is the very same row, so the resulting tables are identical.
    soup = BeautifulSoup(page, "html.parser")
    header_row = soup.find("tr", class_="thead")
    if header_row is not None:
        header_row.decompose()

    # (table id, name of the column that holds the team names)
    for table_id, name_column in (
        ("divs_standings_E", "Eastern Conference"),
        ("divs_standings_W", "Western Conference"),
    ):
        team_table = soup.find(id=table_id)
        if team_table is None:
            raise ValueError(
                "no element with id '{}' found in team/{}.html".format(table_id, year)
            )
        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team["Year"] = year
        # Rename the conference-specific column to a common "Team" column
        # (added last, after "Year", to keep the column order).
        team["Team"] = team[name_column]
        del team[name_column]
        dfs.append(team)


# Combine all conference tables of all seasons and preview the result.
teams = pd.concat(dfs)
teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [None]:
# Persist the combined team standings as CSV (row index included).
teams.to_csv(path_or_buf="teams.csv")