In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sportsreference.ncaab.roster import Roster
from sportsreference.ncaab.roster import Player
from sportsipy.ncaab.teams import Team, Teams

pd.set_option('display.max_columns', None)

# Select teams and years you want
> I chose to include the years 2010 to 2020 (inclusive) and to only get players from Duke and Kentucky. In the function below, we'll pull all the player ids into a set (if there are duplicate items, it simply doesn't add them).

In [2]:
# years from 2010 to 2020
years = np.arange(2010, 2021, 1).astype(str)
teams = ['Duke', 'Kentucky']
player_ids = set()
# iterate through teams
for team in teams:
    # iterate through years for an individual team
    for year in years:
        data = Roster(team, year = year)
        for player in data.players:
            # add the player id to a set
            # if they already exist, the set won't add them
            player_ids.add(player.player_id)

## Initialize an empty dataframe
> We'll use this to concatenate data to.

### use a get_player_df function to grab some player information (useful stuff would be names and season years, including career which looks like an aggregate)
[Sportsreference Medium Article I used](https://towardsdatascience.com/sports-reference-api-intro-dbce09e89e52?gi=ce7f0fc79a3e)

In [3]:
df = pd.DataFrame()

In [4]:
# Function to get player info from Player class object.
def get_player_df(player):
    
    # helper function to get player age during each season.
    def get_age(year, bd):
        if year[0] == "Career":
            return None
        else:
            year_dt = datetime(int(year[0][0:4]) + 1, 1, 1)
            age_years = relativedelta(year_dt, bd).years + relativedelta(year_dt, bd).months/12
            return age_years
        
    # helper function to get year for each row and denote
    # rows that contain career totals.
    def get_year(ix):
        if ix[0] == "Career":
            return "Career"
        elif ix[0] == "1999-00":
            return "2000"
        else:
            return ix[0][0:2] + ix[0][-2:]
    
    # get player df and add some extra info
    player_df = player.dataframe
    player_df['player_id'] = player.player_id
    player_df['name'] = player.name
    player_df['year'] = [get_year(ix) for ix in player_df.index]
    player_df['id'] = [player_id + ' ' + year for player_id,
                       year in zip(player_df['player_id'],
                       player_df['year'])]
    player_df.set_index('id', drop = True, inplace = True)
    
    return player_df

## Loop through player ids and get all the players into a single dataframe

In [5]:
for player_id in player_ids:
    player = Player(player_id)
    player_frame = get_player_df(player)
    df = pd.concat([df, player_frame])
df

Unnamed: 0_level_0,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,defensive_rebounds,defensive_win_shares,...,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,name,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dominique-hawkins-1 2014,7.2,12,0.4,1,-0.4,sec,3.0,6.2,16,0.3,...,6,15,0.467,7,7.6,195,0.3,0.035,Dominique Hawkins,2014
dominique-hawkins-1 2015,14.9,14,0.7,1,3.5,sec,5.6,8.6,13,0.4,...,5,14,0.357,5,14.4,195,0.5,0.113,Dominique Hawkins,2015
dominique-hawkins-1 2016,8.2,13,1.2,3,4.1,sec,2.4,3.9,9,0.3,...,6,17,0.529,9,12.8,195,0.7,0.12,Dominique Hawkins,2016
dominique-hawkins-1 2017,13.1,64,0.0,0,5.9,sec,4.7,5.7,39,1.2,...,18,75,0.453,34,11.5,195,2.5,0.137,Dominique Hawkins,2017
dominique-hawkins-1 Career,11.3,103,0.4,5,4.1,,4.0,5.8,77,2.1,...,35,121,0.455,55,11.3,195,4.0,0.111,Dominique Hawkins,Career
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
justise-winslow-1 Career,13.2,82,2.9,34,10.0,,4.6,19.8,203,2.5,...,71,246,0.516,127,22.9,225,5.5,0.196,Justise Winslow,Career
nick-richards-2 2018,3.0,9,7.0,33,5.7,sec,3.1,20.7,108,1.0,...,26,112,0.616,69,15.5,247,2.4,0.173,Nick Richards,2018
nick-richards-2 2019,2.9,7,12.0,47,6.6,sec,4.8,16.5,66,1.0,...,26,82,0.598,49,16.2,247,2.2,0.193,Nick Richards,2019
nick-richards-2 2020,1.7,7,8.0,66,7.0,sec,2.4,18.6,160,1.6,...,49,254,0.642,163,20.7,247,5.1,0.222,Nick Richards,2020


## Export all your work for later use.

In [6]:
df.to_csv('data/duke_kentucky_2010_to_2020.csv', index_label = 'id')