# Team Data Scraper

## Imports and Setup

In [77]:
from pathlib import Path

import pandas as pd

In [78]:
# Column layout of the team-level table. Every name appears exactly once:
# selecting a DataFrame with a repeated label duplicates that column in the
# result (and in any CSV written from it).
team_cols = [
    "League",
    "TeamName",
    "TeamCity",
    "TeamState",
    "StadiumCapacity",
    "HomeGames",
    "TeamSalary",
    "TeamRevenue",
    "TeamPlayers",
    "TeamLocationLat",
    "TeamLocationLong",
    "IGFollowers",
    "XFollowers",
]

# Column layout of the league-level table (one row per league season once
# duplicates are dropped).
league_cols = [
    "League",
    "LeagueSeason",
    "SeasonStart",
    "SeasonEnd",
    "PlayoffStart",
    "PlayoffEnd",
    "SalaryCap",
]

## NBA

In [79]:
# Read every HTML table from the NBA Wikipedia article.
url = 'https://en.wikipedia.org/wiki/National_Basketball_Association#Teams'
tables = pd.read_html(url)

# The teams table is selected by position, which shifts silently whenever the
# article gains or loses a table. Verify that the columns the following steps
# rely on are present so a wrong table fails here with a clear message.
nba_teams = tables[3]
nba_required_cols = {
    'Team', 'Location', 'Capacity', 'Coordinates',
    'Conference', 'Division', 'Arena', 'Founded', 'Joined',
}
nba_missing_cols = nba_required_cols - set(nba_teams.columns)
assert not nba_missing_cols, (
    f"tables[3] does not look like the NBA teams table; missing columns: {nba_missing_cols}"
)

# Discard the scraped columns that are not carried into the team dataset.
nba_teams = nba_teams.drop(columns=['Conference', 'Division', 'Arena', 'Founded', 'Joined'])

# Take the part of each coordinate string that follows the slash and cut it at
# the degree signs: piece 0 becomes the latitude, piece 1 (with its 'N '
# removed) becomes the longitude. Both are whitespace-trimmed strings here.
nba_coord_pieces = nba_teams['Coordinates'].str.split('/').str[1].str.split('°')
nba_teams['Latitude'] = nba_coord_pieces.str[0].str.strip()
nba_teams['Longitude'] = nba_coord_pieces.str[1].str.replace('N ', '').str.strip()
nba_teams = nba_teams.drop(columns=['Coordinates'])

# Split "Location" at commas into a trimmed city (first piece) and state
# (second piece); the combined column is not needed afterwards.
nba_location_pieces = nba_teams['Location'].str.split(',')
nba_teams['TeamCity'] = nba_location_pieces.str[0].str.strip()
nba_teams['TeamState'] = nba_location_pieces.str[1].str.strip()
nba_teams = nba_teams.drop(columns=['Location'])

# Switch to the project's column names.
nba_teams = nba_teams.rename(columns={
    'Team': 'TeamName',
    'Capacity': 'StadiumCapacity',
    'Latitude': 'TeamLocationLat',
    'Longitude': 'TeamLocationLong'
})

# Remove any \ufeff characters from the latitude text before casting to float.
nba_teams['TeamLocationLat'] = (
    nba_teams['TeamLocationLat'].str.replace('\ufeff', '').astype(float)
)

# Every parsed longitude is cast to float and then multiplied by -1.
nba_teams['TeamLocationLong'] = nba_teams['TeamLocationLong'].astype(float) * -1

# Constant per-team and per-season values for this league.
nba_teams = nba_teams.assign(
    HomeGames=41,
    LeagueSeason="2023-2024",
    SeasonStart="2023-10-24",
    SeasonEnd="2024-04-15",
    PlayoffStart="2024-04-20",
    PlayoffEnd="2024-06-23",
    SalaryCap=136000000,
    League="NBA",
)

# Read the first table on Spotrac's NBA cap page.
nba_salary = pd.read_html('https://www.spotrac.com/nba/cap/')[0]

# Keep only the team name, signed-player count and total cap, renamed to the
# project's column names.
nba_salary = nba_salary[['Team', 'PlayersSigned', 'Total Cap']].rename(columns={
    'Team': 'TeamName',
    'PlayersSigned': 'TeamPlayers',
    'Total Cap': 'TeamSalary'
})

# Use "Los Angeles Clippers" (the name the rest of this notebook uses) in
# place of "LA Clippers".
nba_salary['TeamName'] = nba_salary['TeamName'].replace('LA Clippers', 'Los Angeles Clippers')

# An inner merge silently drops any team whose name has no match in the salary
# table, so check for unmatched names before merging.
nba_unmatched_teams = set(nba_teams['TeamName']) - set(nba_salary['TeamName'])
assert not nba_unmatched_teams, (
    f"teams missing from the salary table: {nba_unmatched_teams}"
)

# validate='one_to_one' raises if either side repeats a TeamName, which would
# otherwise duplicate team rows in the merged frame.
nba_teams = nba_teams.merge(nba_salary, on='TeamName', validate='one_to_one')

# X (Twitter) follower counts, keyed by full team name.
XFollowers = {
    "Los Angeles Lakers": 11_880_000,
    "Golden State Warriors": 8_450_000,
    "Miami Heat": 5_060_000,
    "Chicago Bulls": 4_730_000,
    "Boston Celtics": 4_160_000,
    "San Antonio Spurs": 3_520_000,
    "Cleveland Cavaliers": 3_300_000,
    "Houston Rockets": 3_240_000,
    "Oklahoma City Thunder": 2_800_000,
    "Toronto Raptors": 2_580_000,
    "New York Knicks": 2_460_000,
    "Philadelphia 76ers": 2_340_000,
    "Dallas Mavericks": 2_250_000,
    "Brooklyn Nets": 2_040_000,
    "Los Angeles Clippers": 2_020_000,
    "Milwaukee Bucks": 2_020_000,
    "Phoenix Suns": 1_750_000,
    "Orlando Magic": 1_630_000,
    "Memphis Grizzlies": 1_610_000,
    "Atlanta Hawks": 1_580_000,
    "Portland Trail Blazers": 1_550_000,
    "Indiana Pacers": 1_440_000,
    "Denver Nuggets": 1_430_000,
    "New Orleans Pelicans": 1_370_000,
    "Utah Jazz": 1_280_000,
    "Washington Wizards": 1_260_000,
    "Minnesota Timberwolves": 1_250_000,
    "Sacramento Kings": 1_250_000,
    "Charlotte Hornets": 1_210_000,
    "Detroit Pistons": 1_120_000,
}

# One entry per team; a repeated key would collapse and shrink the dict.
assert len(XFollowers) == 30

# Instagram follower counts, keyed by full team name, listed in descending
# order of followers.
IGFollowers = {
    "Golden State Warriors": 28300000,
    "Los Angeles Lakers": 21500000,
    "Cleveland Cavaliers": 14700000,
    "Chicago Bulls": 9300000,
    "Boston Celtics": 7100000,
    "Miami Heat": 6000000,
    "Houston Rockets": 5900000,
    "Oklahoma City Thunder": 5500000,
    # NOTE(review): this was 500000, the only entry that broke the descending
    # order of the list (it sits between 5,500,000 and 4,700,000). Treated as
    # a dropped zero -- confirm against the follower source.
    "Milwaukee Bucks": 5000000,
    "Brooklyn Nets": 4700000,
    "Los Angeles Clippers": 4600000,
    "San Antonio Spurs": 4400000,
    "Toronto Raptors": 3900000,
    "Philadelphia 76ers": 3800000,
    "New York Knicks": 3600000,
    "Dallas Mavericks": 3400000,
    "Phoenix Suns": 2900000,
    "Indiana Pacers": 2700000,
    "Minnesota Timberwolves": 2700000,
    "Portland Trail Blazers": 2700000,
    "New Orleans Pelicans": 2600000,
    "Atlanta Hawks": 2300000,
    "Charlotte Hornets": 2200000,
    "Sacramento Kings": 1900000,
    "Denver Nuggets": 1900000,
    "Memphis Grizzlies": 1600000,
    "Orlando Magic": 1600000,
    "Washington Wizards": 1400000,
    "Detroit Pistons": 1400000,
    "Utah Jazz": 1200000,
}

# One entry per team; a repeated key would collapse and shrink the dict.
assert len(IGFollowers) == 30

# Look up each team's follower counts by TeamName; a name missing from a
# dictionary yields NaN in that column.
nba_teams = nba_teams.assign(
    IGFollowers=nba_teams['TeamName'].map(IGFollowers),
    XFollowers=nba_teams['TeamName'].map(XFollowers),
)

# Team revenue figures, keyed by full team name.
TeamRevenue = {
    "Golden State Warriors": 765_000_000,
    "Los Angeles Lakers": 516_000_000,
    "New York Knicks": 504_000_000,
    "Boston Celtics": 443_000_000,
    "Dallas Mavericks": 429_000_000,
    "Los Angeles Clippers": 425_000_000,
    "Houston Rockets": 381_000_000,
    "Chicago Bulls": 372_000_000,
    "Philadelphia 76ers": 371_000_000,
    "Miami Heat": 371_000_000,
    "Brooklyn Nets": 367_000_000,
    "Phoenix Suns": 366_000_000,
    "Denver Nuggets": 348_000_000,
    "Cleveland Cavaliers": 348_000_000,
    "Milwaukee Bucks": 329_000_000,
    "Atlanta Hawks": 326_000_000,
    "Washington Wizards": 323_000_000,
    "San Antonio Spurs": 319_000_000,
    "Toronto Raptors": 305_000_000,
    "Portland Trail Blazers": 300_000_000,
    "Sacramento Kings": 289_000_000,
    "Utah Jazz": 274_000_000,
    "Detroit Pistons": 274_000_000,
    "Charlotte Hornets": 269_000_000,
    "Oklahoma City Thunder": 267_000_000,
    "Indiana Pacers": 263_000_000,
    "New Orleans Pelicans": 262_000_000,
    "Orlando Magic": 261_000_000,
    "Minnesota Timberwolves": 259_000_000,
    "Memphis Grizzlies": 258_000_000,
}

# One entry per team; a repeated key would collapse and shrink the dict.
assert len(TeamRevenue) == 30

# Look up each team's revenue by TeamName (a missing name yields NaN, which
# the integer cast below rejects).
nba_teams['TeamRevenue'] = nba_teams['TeamName'].map(TeamRevenue)

# Parse the season and playoff boundary strings into datetimes.
nba_date_cols = ['SeasonStart', 'SeasonEnd', 'PlayoffStart', 'PlayoffEnd']
nba_teams[nba_date_cols] = nba_teams[nba_date_cols].apply(pd.to_datetime)

# Strip the currency formatting from the salary text. regex=False is explicit
# because '$' is a regex anchor and the default of `regex` differs between
# pandas versions; both patterns are meant literally.
nba_teams['TeamSalary'] = (
    nba_teams['TeamSalary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Cast every count / money column to int in one pass.
nba_int_cols = [
    'HomeGames',
    'StadiumCapacity',
    'SalaryCap',
    'TeamSalary',
    'TeamRevenue',
    'TeamPlayers',
    'IGFollowers',
    'XFollowers',
]
nba_teams = nba_teams.astype({int_col: int for int_col in nba_int_cols})

***Add the NBA data to the team and league DataFrames***

In [80]:
# Build the team-level frame from the columns named in team_cols.
# dict.fromkeys drops repeated names while keeping their order, because
# selecting with a repeated label would duplicate that column in team_df.
team_df = nba_teams[list(dict.fromkeys(team_cols))].copy()

# Combined audience across both social platforms.
team_df['TotalFollowers'] = team_df['IGFollowers'] + team_df['XFollowers']

# Derived ratios: revenue per follower, per home game and per player, and
# salary per player.
team_df["RevPerFollower"] = team_df["TeamRevenue"] / team_df["TotalFollowers"]
team_df["RevPerGame"] = team_df["TeamRevenue"] / team_df["HomeGames"]
team_df["RevPerPlayer"] = team_df["TeamRevenue"] / team_df["TeamPlayers"]
team_df["SalaryPerPlayer"] = team_df["TeamSalary"] / team_df["TeamPlayers"]

# Collapse the per-team rows to the distinct league-level rows, then attach
# the league-wide totals and the dates recorded for the follower counts.
league_df = (
    nba_teams[league_cols]
    .copy()
    .drop_duplicates()
    .assign(
        LeagueXFollowers=44570000,
        LeagueIGFollowers=86300000,
        LeagueRevenue=10580000000,
        LeaguePlayers=450,
        LeagueSalary=6120000000,
        # sum of the two follower columns assigned just above
        LeagueTotalFollowers=lambda frame: frame["LeagueXFollowers"] + frame["LeagueIGFollowers"],
        IGDate="2023-03-01",
        XDate="2023-10-01",
    )
)
    


## Save Data

In [81]:
# Set to False to skip writing the CSV files. The cell then does nothing
# instead of raising, so a full re-run still completes.
save_switch = True

if save_switch:
    # Create the output folder if needed; to_csv does not create missing
    # directories.
    output_dir = Path('data')
    output_dir.mkdir(parents=True, exist_ok=True)

    # save the team and league dataframes
    team_df.to_csv(output_dir / 'team_data.csv', index=False)
    league_df.to_csv(output_dir / 'league_data.csv', index=False)