# <b> <font color='red'>  Project in Data Science </font> </b>

## Analysing NBA historical data to discover patterns that should be adopted by franchises to succeed

#### <b>Andreas Neocleous
a.neocleous12@hotmail.com
</b>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

### Scraping as a test from hoopshype site

In [2]:
url="https://hoopshype.com/salaries/players/"

r = requests.get(url,timeout=2.5)
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')

salary_table = soup.find('table')

In [3]:
length=len(salary_table.find_all("td"))

player_names=[salary_table.find_all("td")[i].text.strip() for i in range(9,length,8)]

column1=[salary_table.find_all("td")[i].text.strip() for i in range(10,length,8)]
column2=[salary_table.find_all("td")[i].text.strip() for i in range(11,length,8)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(12,length,8)]
column4=[salary_table.find_all("td")[i].text.strip() for i in range(13,length,8)]
column5=[salary_table.find_all("td")[i].text.strip() for i in range(14,length,8)]
column6=[salary_table.find_all("td")[i].text.strip() for i in range(15,length,8)]

In [4]:
df_dict={'player_names':player_names,
        'salary ($)':column1}
        
salary_df=pd.DataFrame(df_dict)
salary_df[:5]

Unnamed: 0,player_names,salary ($)
0,Stephen Curry,"$45,780,966"
1,John Wall,"$44,310,840"
2,James Harden,"$44,310,840"
3,Russell Westbrook,"$44,211,146"
4,Kevin Durant,"$42,018,900"


### Transform salaries to numeric values

In [5]:
salary_df.replace({'\$':''}, regex = True,inplace=True)
salary_df.replace(',','', regex=True, inplace=True)

for item in salary_df.columns[1:]:
    
    salary_df[item]=pd.to_numeric(salary_df[item])
    
salary_df[:5]

Unnamed: 0,player_names,salary ($)
0,Stephen Curry,45780966
1,John Wall,44310840
2,James Harden,44310840
3,Russell Westbrook,44211146
4,Kevin Durant,42018900


### Creating arrays of years and team names in order to feed them as f string to for loops which is going to create for us all the DataFrames we need

In [6]:
years=[]
for i in range(2020,2000,-1):
    years.append(f"{i}-{i+1}")

team_names = ["portland_trail_blazers","oklahoma_city_thunder","los_angeles_clippers","cleveland_cavaliers","philadelphia_76ers",
              "miami_heat","golden_state_warriors","denver_nuggets","houston_rockets","orlando_magic",
              "los_angeles_lakers","milwaukee_bucks","toronto_raptors","washington_wizards","dallas_mavericks",
              "san_antonio_spurs","utah_jazz","brooklyn_nets","new_orleans_pelicans","boston_celtics",
              "minnesota_timberwolves","sacramento_kings","indiana_pacers","chicago_bulls","atlanta_hawks",
              "detroit_pistons","new_york_knicks","phoenix_suns","memphis_grizzlies","charlotte_hornets"]            
              
print(years)
print(team_names)
print(len(years))
print(len(team_names)) 


['2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016', '2014-2015', '2013-2014', '2012-2013', '2011-2012', '2010-2011', '2009-2010', '2008-2009', '2007-2008', '2006-2007', '2005-2006', '2004-2005', '2003-2004', '2002-2003', '2001-2002']
['portland_trail_blazers', 'oklahoma_city_thunder', 'los_angeles_clippers', 'cleveland_cavaliers', 'philadelphia_76ers', 'miami_heat', 'golden_state_warriors', 'denver_nuggets', 'houston_rockets', 'orlando_magic', 'los_angeles_lakers', 'milwaukee_bucks', 'toronto_raptors', 'washington_wizards', 'dallas_mavericks', 'san_antonio_spurs', 'utah_jazz', 'brooklyn_nets', 'new_orleans_pelicans', 'boston_celtics', 'minnesota_timberwolves', 'sacramento_kings', 'indiana_pacers', 'chicago_bulls', 'atlanta_hawks', 'detroit_pistons', 'new_york_knicks', 'phoenix_suns', 'memphis_grizzlies', 'charlotte_hornets']
20
30


### Create a testing DataFrame by using f-strings

In [7]:
url=f"https://hoopshype.com/salaries/{team_names[0]}/{years[0]}/"
print(url)

r = requests.get(url,timeout=2.5)
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')

salary_table = soup.find('table')



length=len(salary_table.find_all("td"))
player_names=[salary_table.find_all("td")[i].text.strip() for i in range(3,length,3)]
column1=[salary_table.find_all("td")[i].text.strip() for i in range(4,length,3)]

df_dict={'player_names':player_names,
        'salary ($)':column1}
        
salary_portland20_21=pd.DataFrame(df_dict)

https://hoopshype.com/salaries/portland_trail_blazers/2020-2021/


In [8]:
salary_portland20_21.replace({'\$':''}, regex = True,inplace=True)
salary_portland20_21.replace(',','', regex=True, inplace=True)

for item in salary_portland20_21.columns[1:]:
    
    salary_portland20_21[item]=pd.to_numeric(salary_df[item])
    
salary_portland20_21[:5]

Unnamed: 0,player_names,salary ($)
0,Damian Lillard,45780966
1,CJ McCollum,44310840
2,Jusuf Nurkic,44310840
3,Robert Covington,44211146
4,Norman Powell,42018900


### We are going to create a dictionary with keys being name of teams plus year using f-strings and their values will be their payroll as a DataFrame

In [65]:
payrolls=dict()
for i in range(0,len(team_names)):
    for j in range(0,len(years)):
        url = f"https://hoopshype.com/salaries/{team_names[i]}/{years[j]}/"
        
        salary_table = []
        try:
            r = requests.get(url,timeout=3)
            r_html = r.text
            soup = BeautifulSoup(r_html, 'html.parser')
            salary_table = soup.find('table')
        except:
            print("Something wrong")

        length=len(salary_table.find_all("td"))
        player_names=[salary_table.find_all("td")[i].text.strip() for i in range(3,length,3)]
        column1=[salary_table.find_all("td")[i].text.strip() for i in range(4,length,3)]
        
        column2 = []
        for k in range(0,len(player_names)):
            
            if k == len(player_names)-1:
                column2.append("NaN")
                break
            
            player_name_trans = player_names[k].replace(' ','-')
            
            url_player = f"https://hoopshype.com/player/{player_name_trans}/salary/"
            
            r = requests.get(url_player,timeout=5)
            r_html = r.text

            soup = BeautifulSoup(r_html, 'html.parser')
            htmlText = soup.find_all("span", class_="player-bio-text-line-value")
            positionString = str(htmlText[0])

            position = positionString.find(">")

            column2.append(positionString[position+1])
            
            
        
        salary_table = []
        try:
            r = requests.get(url,timeout=3)
            r_html = r.text
            soup = BeautifulSoup(r_html, 'html.parser')
            salary_table = soup.find('table')
        except:
            print("Something wrong")

        df_dict={'player_names':player_names,
                'salary ($)':column1,
                    'position':column2}
        
        
        
        tempDataFrame=pd.DataFrame(df_dict)
        
        tempDataFrame.replace({'\$':''}, regex = True,inplace=True)
        tempDataFrame.replace(',','', regex=True, inplace=True)
        for item in tempDataFrame.columns[1:2]:
            tempDataFrame[item]=pd.to_numeric(tempDataFrame[item])
            
        
        payrolls[f"{team_names[i]}{years[j]}"] = tempDataFrame
        

               player_names  salary ($) position
0            Damian Lillard    31626953        G
1               CJ McCollum    29354152        G
2              Jusuf Nurkic    12888889        C
3          Robert Covington    12138345        F
4             Norman Powell    10865952        G
5             Derrick Jones     9258000        F
6              Zach Collins     5406255        C
7               Enes Kanter     5005350        -
8          Andrew Nicholson     2844429        -
9           Carmelo Anthony     2564753        F
10          Anfernee Simons     2252040        G
11            Nassir Little     2210640        F
12         Anderson Varejao     1913345        -
13              Harry Giles     1620564        -
14                CJ Elleby      898310        F
15  Rondae Hollis-Jefferson      502957        -
16           Keljin Blevins      449115        G
17                  TJ Leaf      104598        -
18                   Totals   131904647      NaN
        player_names

ConnectTimeout: HTTPSConnectionPool(host='hoopshype.com', port=443): Max retries exceeded with url: /player/Mfiondu-Kabengele/salary/ (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x0000028810EB0288>, 'Connection to hoopshype.com timed out. (connect timeout=5)'))

### We will print first team's payroll of last year and last team's of first year

In [40]:
payrolls["portland_trail_blazers2020-2021"]

Unnamed: 0,player_names,salary ($)
0,Damian Lillard,31626953
1,CJ McCollum,29354152
2,Jusuf Nurkic,12888889
3,Robert Covington,12138345
4,Norman Powell,10865952
5,Derrick Jones,9258000
6,Zach Collins,5406255
7,Enes Kanter,5005350
8,Andrew Nicholson,2844429
9,Carmelo Anthony,2564753


In [41]:
payrolls[f"denver_nuggets2013-2014"]

KeyError: 'denver_nuggets2013-2014'

In [35]:
url=f"https://hoopshype.com/player/stephen-curry/salary"

r = requests.get(url,timeout=2.5)
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')
htmlText = soup.find_all("span", class_="player-bio-text-line-value")
positionString = str(htmlText[0])

position = positionString.find(">")

print(positionString[position+1])

G
