# In [251]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import re

# Root URL of the college-basketball section of sports-reference.com; the
# postseason and school page URLs in this file are built by appending to it.
base_url = 'https://www.sports-reference.com/cbb/'

   
"""
URL of the page listing the champion and Final Four teams for a given year, e.g.
https://www.sports-reference.com/cbb/postseason/2018-ncaa.html
"""
def getChamps(year):
    """Return the NCAA tournament champion listed for ``year``.

    Downloads the postseason page for the given year and returns the text of
    the second link whose ``href`` contains ``/cbb/schools``.
    NOTE(review): this assumes the second school link on the page is the
    champion -- confirm against the current page layout.

    Args:
        year: Tournament year, e.g. ``'2018'``. An int is accepted as well.

    Returns:
        The first child of the matching anchor (normally the school name).

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
        IndexError: If the page contains fewer than two school links.
    """
    champs_url = base_url + 'postseason/' + str(year) + '-ncaa.html'
    # A timeout keeps a stalled connection from hanging the caller forever;
    # raise_for_status surfaces 404/429 responses instead of letting them
    # show up later as a confusing parsing error.
    response = requests.get(champs_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    school_links = soup.find_all(href=re.compile("/cbb/schools"))
    if len(school_links) < 2:
        raise IndexError(
            'expected at least two school links on ' + champs_url
            + ', found ' + str(len(school_links))
        )

    return school_links[1].contents[0]
    
def getFinalFour(year):
    """Return the distinct teams named by school links 1-5 of the postseason page.

    Downloads the postseason page for the given year, takes the links whose
    ``href`` contains ``/cbb/schools`` at positions 1 through 5, and returns
    their names without duplicates, in page order.
    NOTE(review): this assumes those five links are the champion followed by
    the Final Four teams -- confirm against the current page layout.

    Args:
        year: Tournament year, e.g. ``'2018'``. An int is accepted as well.

    Returns:
        A list of team names. It is shorter than expected (possibly empty)
        when the page has fewer than six school links.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
    """
    champs_url = base_url + 'postseason/' + str(year) + '-ncaa.html'
    response = requests.get(champs_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    school_links = soup.find_all(href=re.compile("/cbb/schools"))

    final_four = []
    # Slicing (rather than indexing 1..5) tolerates pages with fewer links.
    for link in school_links[1:6]:
        if not link.contents:
            continue
        team = link.contents[0]
        # The same school can be linked more than once; keep the first only.
        if team not in final_four:
            final_four.append(team)

    return final_four


"""
URL of a team's roster page for a given year, e.g.
https://www.sports-reference.com/cbb/schools/duke/2014.html
"""
def getRosters(team, year):
    """Return a mapping of upper-cased player name to player page URL.

    Downloads the school page for ``team`` and ``year`` and reads the first
    link in every row of the first ``<tbody>`` on the page.
    NOTE(review): this assumes the first table body on the page is the
    roster table -- confirm against the current page layout.

    Args:
        team: School slug as used in the site's URLs, e.g. ``'duke'``.
        year: Season year, e.g. ``'2014'``. An int is accepted as well.

    Returns:
        A dict such as ``{'JABARI PARKER': 'https://www.sports-reference.com/...'}``.
        Rows without a link are skipped; the dict is empty when the page has
        no table body.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
    """
    roster_url = base_url + 'schools/' + team + "/" + str(year) + '.html'
    response = requests.get(roster_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    roster_body = soup.find("tbody")
    if roster_body is None:
        return {}

    roster = {}
    for row in roster_body.find_all('tr'):
        player_link = row.find('a')
        # Spacer/header rows carry no anchor; skip them instead of crashing.
        if player_link is None or not player_link.get('href'):
            continue
        name = player_link.get_text().upper()
        roster[name] = 'https://www.sports-reference.com' + player_link.get('href')

    return roster

"""
Each player's page link is looked up in the dict returned by getRosters.
Pass all arguments as strings, e.g. getStats('jabari parker', 'duke', '2014').
Example player page:
https://www.sports-reference.com/cbb/players/jabari-parker-1.html
"""
def getStats(player, team, year):
    """Return one season's stat line for a player as a ``{label: value}`` dict.

    The player's page URL is looked up in the roster returned by
    ``getRosters(team, year)``. On that page, the row of the first
    ``<tbody>`` whose season label starts with ``year - 1`` (e.g. ``2014-15``
    for ``'2015'``) is paired with the column labels taken from the page's
    ``<th>`` cells.

    Args:
        player: Player name as shown on the roster; matching ignores case.
        team: School slug as used in the site's URLs, e.g. ``'duke'``.
        year: Season year, e.g. ``'2015'``. An int is accepted as well.

    Returns:
        A dict of plain strings such as ``{'Season': '2015', 'School': 'DUKE',
        'G': '39', ...}``. It is empty when the page has no row for ``year``.

    Raises:
        ValueError: If ``player`` is not on the roster.
        requests.RequestException: If a request times out or the server
            answers with an error status.
    """
    # Column position skipped when reading a season row.
    # NOTE(review): presumably the blank separator column whose header is the
    # '\xa0' label dropped below -- confirm against the current table layout.
    skipped_column = 26

    year = str(year)
    roster = getRosters(team, year)

    player_link = roster.get(player.upper())
    if player_link is None:
        # Without this guard requests.get(None) fails with an unrelated
        # "invalid URL" error that hides the real problem.
        raise ValueError(
            'player ' + repr(player) + ' not found on the ' + year
            + ' roster of ' + repr(team)
        )

    response = requests.get(player_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Column labels: every header cell except the blank separator and 'Conf',
    # so that they line up with the values collected further down.
    stat_labels = []
    for header in soup.find_all('th'):
        if not header.contents:
            continue
        label = header.contents[0]
        if label == '\xa0' or label == 'Conf':
            continue
        stat_labels.append(label.string)

    stats_body = soup.find('tbody')
    if stats_body is None:
        return {}

    # Locate the row for the requested season. Season labels look like
    # '2014-15', so the first four characters plus one give the season year.
    season_cells = []
    for row in stats_body.find_all('tr'):
        season_link = row.a
        if season_link is None:
            continue
        try:
            season_year = int(season_link.get_text()[:4]) + 1
        except ValueError:
            continue
        if str(season_year) == year:
            season_cells = row.find_all('td')
            break

    actual_stats = []
    for index, cell in enumerate(season_cells):
        if index == 0:
            # First cell is replaced by the requested season year.
            actual_stats.append(year)
        elif index == 1:
            # Second cell is replaced by the upper-cased team slug.
            actual_stats.append(team.upper())
        elif index == skipped_column:
            continue
        else:
            # get_text() copes with empty cells and nested markup and yields
            # a plain str instead of a node tied to the parse tree.
            actual_stats.append(cell.get_text())

    # zip stops at the shorter sequence, so surplus labels are ignored.
    return dict(zip(stat_labels, actual_stats))
 
# Example call: look up Quinn Cook's stat line for Duke's 2015 season. As the
# last expression of the notebook cell, its return value is shown as output.
getStats('quinn cook', 'duke', '2015')    






{'Season': '2015',
 'School': 'DUKE',
 'G': '39',
 'GS': '39',
 'MP': '35.8',
 'FG': '5.1',
 'FGA': '11.3',
 'FG%': '.453',
 '2P': '2.5',
 '2PA': '4.6',
 '2P%': '.536',
 '3P': '2.6',
 '3PA': '6.6',
 '3P%': '.395',
 'FT': '2.5',
 'FTA': '2.8',
 'FT%': '.891',
 'ORB': '0.4',
 'DRB': '3.0',
 'TRB': '3.4',
 'AST': '2.6',
 'STL': '1.0',
 'BLK': '0.0',
 'TOV': '1.2',
 'PF': '1.9',
 'PTS': '15.3',
 'SOS': '9.87'}