In [1]:
import pandas as pd
import time
import requests
import re
from bs4 import BeautifulSoup


In [2]:
SPORTSIPY_KENPOM = {'Alabama Birmingham': 'UAB',
                    'Albany NY': 'Albany',
                    'Bowling Green St.': 'Bowling Green',
                    'Brigham Young': 'BYU',
                    'Cal State Bakersfield': 'Cal St. Bakersfield',
                    'Cal State Fullerton': 'Cal St. Fullerton',
                    'Cal State Northridge': 'Cal St. Northridge',
                    'California Baptist': 'Cal Baptist',
                    'University of California': 'UCLA',
                    'Central Connecticut St.': 'Central Connecticut',
                    'Central Florida': 'UCF',
                    'Citadel': 'The Citadel',
                    'College of Charleston': 'Charleston',
                    'Detroit Mercy': 'Detroit',
                    'Florida International': 'FIU', 
                    'Grambling': 'Grambling St.',
                    'Cal State Long Beach': 'Long Beach St.',
                    'Long Island University': 'LIU',
                    'Louisiana St.': 'LSU',
                    'Loyola IL': 'Loyola Chicago',
                    'Maryland Baltimore County': 'UMBC',
                    'Massachusetts Lowell': 'UMass Lowell',
                    'Missouri Kansas City': 'UMKC',
                    'Omaha': 'Nebraska Omaha',
                    'Nevada Las Vegas': 'UNLV',
                    'North Carolina Asheville': 'UNC Asheville',
                    'North Carolina Greensboro': 'UNC Greensboro',
                    'North Carolina St.': 'N.C. State',
                    'North Carolina Wilmington': 'UNC Wilmington',
                    'Pennsylvania': 'Penn',
                    'Prairie View': 'Prairie View A&M',
                    'Saint Francis PA': 'St. Francis PA',
                    "Saint Mary's CA": "Saint Mary's",
                    'South Carolina Upstate': 'USC Upstate',
                    'Southern California': 'USC',
                    'Southern Methodist': 'SMU',
                    'Southern Mississippi': 'Southern Miss',
                    "St. John's NY": "St. John's",
                    'Texas A&M Corpus Christi': 'Texas A&M Corpus Chris',
                    'Texas Arlington': 'UT Arlington',
                    'Texas Christian': 'TCU',
                    'Texas El Paso': 'UTEP',
                    'Texas Rio Grande Valley': 'UT Rio Grande Valley',
                    'Texas San Antonio': 'UTSA',
                    'Virginia Commonwealth': 'VCU'}

In [3]:
def scrape_ken_pom():
    
    url="https://kenpom.com/index.php"
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'html.parser')

    #find the table and header tags
    table = soup.find_all('table', {'id':'ratings-table'})[0]
    headers = table.find('thead').find('tr', {'class':'thead2'})
    #store the header tags titles into a list
    cols = [th.text for th in headers.find_all('th')]

    #initialize the dataframe
    
    kenpom_df = pd.DataFrame(columns=cols)

    #get the data elements from the table
    body = table.find_all('tbody')[0]
    rows = body.find_all('tr')

    #scrape the data from each row
    for r in rows:
        data = r.find_all('td')
        info = []

        for d in data:
            try:
                if(d["class"] != ['td-right']):
                    info.append(d.text)
            except:
                info.append(d.text)

        #if the list isnt empty
        if(info != []):
            #create a series with the list and add it to the dataframe
            info_df = pd.Series(info, index=cols)
            kenpom_df = kenpom_df.append(info_df, ignore_index=True)

    #clean and convert columns
    kenpom_df["AdjO"] = pd.to_numeric(kenpom_df["AdjO"])
    kenpom_df["AdjD"] = pd.to_numeric(kenpom_df["AdjD"])
    kenpom_df["AdjT"] = pd.to_numeric(kenpom_df["AdjT"])

    return kenpom_df

In [4]:
kenpom_df = scrape_ken_pom()

In [12]:
kenpom_df


Unnamed: 0,Rk,Team,Conf,W-L,AdjEM,AdjO,AdjD,AdjT,Luck,AdjEM.1,OppO,OppD,AdjEM.2
0,1,Gonzaga,WCC,0-0,+31.52,120.1,88.5,75.9,+.000,+0.00,0.0,0.0,+0.00
1,2,Michigan,B10,0-0,+28.30,114.8,86.5,71.5,+.000,+0.00,0.0,0.0,+0.00
2,3,Kansas,B12,0-0,+26.30,114.4,88.1,72.5,+.000,+0.00,0.0,0.0,+0.00
3,4,Illinois,B10,0-0,+25.00,112.2,87.2,73.8,+.000,+0.00,0.0,0.0,+0.00
4,5,Purdue,B10,0-0,+24.93,115.0,90.0,70.7,+.000,+0.00,0.0,0.0,+0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,354,Bethune Cookman,SWAC,0-0,-25.79,87.2,113.0,72.5,+.000,+0.00,0.0,0.0,+0.00
354,355,Maryland Eastern Shore,MEAC,0-0,-27.75,81.3,109.0,71.9,+.000,+0.00,0.0,0.0,+0.00
355,356,Arkansas Pine Bluff,SWAC,0-0,-29.09,81.7,110.8,72.0,+.000,+0.00,0.0,0.0,+0.00
356,357,Chicago St.,WAC,0-0,-30.99,83.4,114.4,72.2,+.000,+0.00,0.0,0.0,+0.00


In [5]:
from sportsipy.ncaab.teams import Teams
teams = Teams()

In [6]:
sportsipy_team_names = [t.name for t in teams]
kenpom_names = list(kenpom_df.Team.values)

In [7]:
def map_spy_to_kenpom(name_spy):
    name_kp = name_spy
    name_kp = name_kp.replace('(', '')
    name_kp = name_kp.replace(')', '')
    name_kp = name_kp.replace('-', ' ')
    
    if(name_kp.endswith("State")):
        name_kp = name_kp[:-5] + "St."
    
    try:
        name_kp = SPORTSIPY_KENPOM[name_kp]
    except KeyError:
        pass
    
    print(name_spy, name_kp, sep=" > ")
    
    return name_kp
    

In [8]:
mapped_names = [map_spy_to_kenpom(x) for x in sportsipy_team_names]

matches = [name for name in mapped_names if name in kenpom_names]
non_matches = [name for name in mapped_names if name not in kenpom_names]

Abilene Christian > Abilene Christian
Air Force > Air Force
Akron > Akron
Alabama A&M > Alabama A&M
Alabama-Birmingham > UAB
Alabama State > Alabama St.
Alabama > Alabama
Albany (NY) > Albany
Alcorn State > Alcorn St.
American > American
Appalachian State > Appalachian St.
Arizona State > Arizona St.
Arizona > Arizona
Little Rock > Little Rock
Arkansas-Pine Bluff > Arkansas Pine Bluff
Arkansas State > Arkansas St.
Arkansas > Arkansas
Army > Army
Auburn > Auburn
Austin Peay > Austin Peay
Ball State > Ball St.
Baylor > Baylor
Bellarmine > Bellarmine
Belmont > Belmont
Bethune-Cookman > Bethune Cookman
Binghamton > Binghamton
Boise State > Boise St.
Boston College > Boston College
Boston University > Boston University
Bowling Green State > Bowling Green
Bradley > Bradley
Brigham Young > BYU
Brown > Brown
Bryant > Bryant
Bucknell > Bucknell
Buffalo > Buffalo
Butler > Butler
Cal Poly > Cal Poly
Cal State Bakersfield > Cal St. Bakersfield
Cal State Fullerton > Cal St. Fullerton
Cal State Nort

In [9]:
d = {}
for name in mapped_names:
    if(name not in kenpom_names):
        d[name] = ""

print(d)

{'Detroit': '', 'St. Thomas MN': ''}


In [10]:
print(len(matches))
print(len(non_matches))


356
2


In [11]:
string = "Loyola (MD)"
string = string.replace('(', '')
string = string.replace(')', '')
print(string)

Loyola MD
