<a href="https://colab.research.google.com/github/cam-d/cam-d.github.io/blob/master/teamRankings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Get Rankings from [TeamRankings.com](https://www.teamrankings.com/ncaa-basketball/ranking/predictive-by-other)

In [0]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [0]:
# TeamRankings page address; each request appends a YYYY-MM-DD date to it
base_url = 'https://www.teamrankings.com/ncaa-basketball/ranking/predictive-by-other/?date='
# only have it going back to 06-07 szn, so cover 2007 through 2020 inclusive
years = range(2007, 2020 + 1)

In [0]:
# Empty frame declaring the column layout used for the TeamRankings data
data = pd.DataFrame({
    column: []
    for column in ('Rank', 'Team', 'Rating', 'v1-25', 'v26-50', 'v51-100',
                   'Hi', 'Low', 'Last', 'Year')
})

In [0]:
# Scrape one rankings snapshot per year and combine them with a single concat.
# `data` is rebuilt from the freshly scraped frames instead of being appended
# to, so re-running this cell does not duplicate rows (DataFrame.append was
# also removed in pandas 2.0).
season_frames = []
for year in years:
  url = base_url + str(year) + '-03-10'  # snapshot dated March 10 of each year
  response = requests.get(url, timeout=30)
  response.raise_for_status()  # fail loudly instead of parsing an error page
  soup = BeautifulSoup(response.text, 'html.parser')
  tables = soup.find_all('table', {'class': 'tr-table datatable scrollable'})
  if not tables:
    raise ValueError('No rankings table found at ' + url)

  # Remove the <thead> sections so read_html only sees data rows; the column
  # names are assigned explicitly below.
  table = tables[0]
  for header in table.find_all('thead'):
    header.decompose()

  # read_html needs text (not a bs4 Tag); wrapping it in StringIO avoids the
  # literal-HTML-string deprecation in newer pandas.
  df = pd.read_html(StringIO(str(table)))[0]
  df.columns = ['Rank', 'Team', 'Rating', 'v1-25', 'v26-50', 'v51-100','Hi', 'Low', 'Last']
  df['Year'] = year-1 #rankings shown at season end
  season_frames.append(df)

# ignore_index gives the combined frame a unique 0..n-1 index rather than
# repeating each year's own row numbers.
data = pd.concat(season_frames, ignore_index=True)

In [0]:
# Peek at the first three combined rows to check the scrape and the column names
data.head(3)

Unnamed: 0,Rank,Team,Rating,v1-25,v26-50,v51-100,Hi,Low,Last,Year
0,1.0,N Carolina (26-6),33.3,7-4,5-1,3-1,1.0,18.0,1.0,2006.0
1,2.0,Kansas (28-4),29.6,2-1,4-0,7-3,2.0,28.0,2.0,2006.0
2,3.0,Florida (27-5),29.1,4-1,5-2,5-2,1.0,12.0,3.0,2006.0


In [0]:
# Snapshot of the scraped frame, taken before the following cells rewrite
# the 'Team' and 'v1-25' columns of `data`.
d1 = data.copy()

In [0]:
from string import digits

# Translation table for str.translate that deletes every ASCII digit
remove_digits = {ord(char): None for char in digits}

In [0]:
# Strip the trailing "(W-L)" record from each team label,
# e.g. "N Carolina (26-6)" -> "N Carolina".
# Matching the parenthesised record directly (instead of deleting every digit
# and then chopping three characters) leaves no trailing space, keeps digits
# that belong to a name, and makes the cell safe to re-run.
data['Team'] = data['Team'].str.replace(r'\s*\(\d+-\d+\)\s*$', '', regex=True)

In [0]:
# Keep only the part before the dash of the "v1-25" record (e.g. "7-4" -> "7").
# split(expand=True) produces a two-column frame, which current pandas refuses
# to assign to a single column; selecting the first piece states the intent
# explicitly and is a no-op if the cell is run again. The part after the dash
# is discarded, as before.
data['v1-25'] = data['v1-25'].str.split('-').str[0]

In [0]:
# Show the first two rows after the 'Team' and 'v1-25' cleanup.
# NOTE(review): the stored output for this cell lists ten rows, so it looks
# stale relative to head(2) — re-run the cell to refresh it.
data.head(2)

Unnamed: 0,Rank,Team,Rating,v1-25,v26-50,v51-100,Hi,Low,Last,Year
0,1.0,N Carolina,33.3,7,5-1,3-1,1.0,18.0,1.0,2006.0
1,2.0,Kansas,29.6,2,4-0,7-3,2.0,28.0,2.0,2006.0
2,3.0,Florida,29.1,4,5-2,5-2,1.0,12.0,3.0,2006.0
3,4.0,Texas A&M,27.7,2,3-0,6-4,3.0,37.0,4.0,2006.0
4,5.0,Memphis,26.7,1,2-1,3-0,2.0,30.0,5.0,2006.0
5,6.0,Ohio State,26.4,4,4-0,5-0,1.0,10.0,6.0,2006.0
6,7.0,UCLA,25.8,5,6-2,5-3,3.0,35.0,7.0,2006.0
7,8.0,Wisconsin,25.6,4,6-0,4-0,6.0,72.0,8.0,2006.0
8,9.0,Maryland,25.1,7,3-2,2-2,4.0,21.0,9.0,2006.0
9,10.0,Duke,25.0,6,4-2,4-2,3.0,14.0,10.0,2006.0


## Get Stats from  [Ken Pomeroy's Basketball Stats](https://kenpom.com/)
##### Credit to Kaggle user  [Walterhan](https://www.kaggle.com/walterhan/scrape-kenpom-data/comments)



In [0]:
# Base url, and a helper that returns the ratings url for a given year.
# NOTE: `base_url` and `years` reuse the names set in the TeamRankings section
# above; re-run that section's config cell before re-running its scrape.
base_url = 'http://kenpom.com/index.php'


def url_year(x):
    """Return the ratings page URL for year ``x``, e.g. ``.../index.php?y=2015``.

    The year is always passed explicitly. The earlier lambda placed its
    conditional inside the format arguments, so for 2020 it produced
    ``index.php?y=http://kenpom.com/index.php`` instead of a usable URL.
    """
    return '%s?y=%s' % (base_url, x)


years = range(2002, 2021)

# Create a method that parses a given year and spits out a raw dataframe
def import_raw_year(year):
    """
    Download one year of ratings and return it as a raw dataframe.

    Fetches ``url_year(year)``, locates the table whose id is
    ``ratings-table``, removes its ``<thead>`` sections and parses the
    remaining rows with ``pd.read_html``. Column labels are whatever
    ``read_html`` infers without the header; the notebook renames them later.

    Parameters
    ----------
    year : int
        Year passed to ``url_year`` and stored in the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed table plus a ``year`` column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``ratings-table`` element.
    """
    url = url_year(year)
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    ratings_tables = soup.find_all('table', {'id': 'ratings-table'})
    if not ratings_tables:
        raise ValueError('No ratings-table found at %s' % url)

    # Let's find all the thead contents and just remove them
    # This allows us to easily put the table row data into a dataframe using panda
    table = ratings_tables[0]
    for header in table.find_all('thead'):
        header.decompose()

    # read_html needs text (not a bs4 Tag); wrapping it in StringIO avoids the
    # literal-HTML-string deprecation in newer pandas.
    df = pd.read_html(StringIO(str(table)))[0]
    df['year'] = year
    return df
    

# Import all the years into a singular dataframe.
# One concat over every entry of `years`: the first year is no longer
# hard-coded as 2002, and the frame is not re-copied on every iteration.
df = pd.concat([import_raw_year(season) for season in years], axis=0)

# Column rename based off of original website.
# After the five leading fields, each metric is followed by its own rank
# column; the final label renames the 'year' column added during import.
df.columns = [
    'Rank', 'Team', 'Conference', 'W-L', 'Pyth',
    *(label
      for metric in ('AdjustO', 'AdjustD', 'AdjustT', 'Luck',
                     'SOS Pyth', 'SOS OppO', 'SOS OppD', 'NCSOS Pyth')
      for label in (metric, metric + ' Rank')),
    'Year',
]
             
# Returns True if the given string is a number and a valid seed number (1-16)
def valid_seed(x):
    """Tell whether ``x`` reads as a seed between 1 and 16 (spaces ignored)."""
    if not str(x).replace(' ', '').isdigit():
        return False
    return 0 < int(x) <= 16

# Parse the trailing seed out of the team label.
# The last two characters hold the seed when valid_seed accepts them
# (" 1" for single digits, "16" for two digits). The seed is stored as a
# string with spaces removed, or NaN when the label carries no seed.
df['Seed'] = df['Team'].apply(lambda name: name[-2:].replace(' ', '') \
                              if valid_seed(name[-2:]) else np.nan )

# Drop the seed from the label. rstrip() removes the separating space that
# is left behind when the seed has two digits ("Duke 16" -> "Duke").
df['Team'] = df['Team'].apply(lambda name: name[:-2].rstrip() \
                              if valid_seed(name[-2:]) else name)

# Break the 'W-L' record into integer win and loss counts:
# the text before the first dash is wins, the text after the last dash is losses.
df['Wins'] = df['W-L'].str.split('-').str[0].astype('int64')
df['Losses'] = df['W-L'].str.split('-').str[-1].astype('int64')
# The combined record is redundant once both counts exist
df = df.drop(columns=['W-L'])



# Final column order: year, rank, team details and record first,
# then every metric next to its rank column.
df = df.loc[:, [
    'Year', 'Rank', 'Team', 'Conference', 'Wins', 'Losses', 'Seed', 'Pyth',
    'AdjustO', 'AdjustO Rank', 'AdjustD', 'AdjustD Rank',
    'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank',
    'SOS Pyth', 'SOS Pyth Rank', 'SOS OppO', 'SOS OppO Rank',
    'SOS OppD', 'SOS OppD Rank', 'NCSOS Pyth', 'NCSOS Pyth Rank',
]]
             