In [1]:
import io
import json
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
LOGIN_URL = 'https://boxrec.com/en/login'

# The login form fields are read from a local JSON file rather than being
# written into the notebook.
with open('boxrec_creds.json') as f:
    payload = json.load(f)

# One Session object is created here and handed to every Boxer below, so all
# scraping requests go through the same object that performed the login POST.
session = requests.Session()
post_response = session.post(LOGIN_URL, data=payload)

# Fail here on an HTTP-level error (4xx/5xx) instead of on the first scrape.
# NOTE(review): a successful status does not by itself prove the credentials
# were accepted -- confirm how BoxRec reports a rejected login.
post_response.raise_for_status()

In [3]:
class Boxer:
    """Collect one boxer's bouts from BoxRec into a DataFrame.

    Typical use is ``scrape_fight_uris()`` followed by ``scrape_fights()``.
    The result is stored on ``df_fights``: one row per bout, describing the
    opponent, indexed by ``(name, fight_id)``.
    """

    base_url = 'https://boxrec.com'
    fight_url_base = 'https://boxrec.com/en/proboxer/{id}'

    # Splits a name cell of the form "<name> won <ruling>" or
    # "<name> drawn <ruling>".  "won" must be preceded by whitespace so that a
    # name which merely contains those letters (e.g. "Kwon") is not mistaken
    # for the outcome marker; "drawn" does not require a leading space, which
    # matches the lenient split the code has always used for draws.
    _OUTCOME_RE = re.compile(
        r'^(?P<name>.*?)\s*(?P<outcome>(?<=\s)won|drawn)(?:\s+(?P<ruling>.*))?$',
        re.DOTALL,
    )

    def __init__(self, name, id, session, fight_uris=None, df_fights=None):
        """Create a boxer record.

        Args:
            name: Boxer name; ``pivot_fight_df`` compares it with the cleaned
                name cells to tell the boxer apart from the opponent.
            id: Identifier substituted into ``fight_url_base``.
            session: Object with a ``get(url)`` method used for every request.
            fight_uris: Optional iterable of already known bout URIs.
            df_fights: Optional previously scraped fights table.
        """
        self.name = name
        self.id = id
        self.session = session
        # Copy so that later appends never mutate the caller's own list.
        self.fight_uris = list(fight_uris) if fight_uris else list()
        # Previously this argument was accepted but silently dropped.
        self.df_fights = df_fights

    @staticmethod
    def _fetch_soup(http_session, url):
        """GET ``url`` through ``http_session`` and return the parsed page.

        Raises:
            AssertionError: If the reply status is not 200.
        """
        response = http_session.get(url)
        assert response.status_code == 200, (
            f"GET {url} returned HTTP {response.status_code}"
        )
        return BeautifulSoup(response.text, 'html.parser')

    def scrape_fight_uris(self):
        """Append the bout URIs found on the boxer's page to ``fight_uris``.

        For each ``div`` with class ``boutP`` the ``href`` of its parent
        element is collected.  Missing hrefs and URIs that are already in the
        list are skipped, so calling this twice does not duplicate bouts.

        Raises:
            AssertionError: If the page request does not return status 200.
        """
        boxer_url = Boxer.fight_url_base.format(id=self.id)
        soup = Boxer._fetch_soup(self.session, boxer_url)
        seen = set(self.fight_uris)
        for node in soup.find_all('div', class_='boutP'):
            href = node.parent.get('href')
            if not href or href in seen:
                continue
            seen.add(href)
            self.fight_uris.append(href)

    @staticmethod
    def scrape_fight_url(url, http_session=None):
        """Download one bout page and return ``(raw_table, fight_date)``.

        Args:
            url: Absolute URL of the bout page.
            http_session: Object with a ``get(url)`` method.  When omitted the
                notebook-level ``session`` variable is used, which keeps the
                original one-argument call working.

        Returns:
            A tuple of the first table parsed from the element with class
            ``responseLessDataTable`` and the text of the first link whose
            ``href`` matches ``/en/date?``.

        Raises:
            AssertionError: If the page request does not return status 200.
            ValueError: If the table or the date link is missing from the page.
        """
        client = http_session if http_session is not None else session
        soup = Boxer._fetch_soup(client, url)

        table = soup.find('table', class_='responseLessDataTable')
        if table is None:
            raise ValueError(f"No bout table found at {url}")
        date_link = soup.find('a', href=re.compile(r'/en/date\?.*'))
        if date_link is None:
            raise ValueError(f"No fight date link found at {url}")

        # read_html expects a path or file-like object; passing the markup as a
        # bare string is deprecated, so wrap it in StringIO.
        df = pd.read_html(io.StringIO(str(table)))[0]
        return df, date_link.text

    @staticmethod
    def _strip_cells(df):
        """Return ``df`` with surrounding whitespace removed from string cells."""
        def strip(value):
            return value.strip() if isinstance(value, str) else value

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
        # whichever one the installed pandas provides.
        if hasattr(pd.DataFrame, 'map'):
            return df.map(strip)
        return df.applymap(strip)

    @staticmethod
    def clean_fight_df(input_df):
        """Turn the raw bout table into a label-indexed frame with two columns.

        The label column (column ``1``) becomes the index, the ``details`` and
        ``record`` rows are removed, and ``ruling``, ``result`` and ``winner``
        rows are added.  ``input_df`` itself is not modified.

        Raises:
            ValueError: If neither name cell carries a 'won'/'drawn' marker, so
                no ruling can be determined.
        """
        df = input_df.copy()
        # NOTE(review): 23 and 16 are tied to the layout of the scraped table;
        # re-check them if the page markup changes.
        start_row = 1 if len(df.index) == 23 else 0  # check if the referee score row is included
        df = (df.iloc[start_row:16]
              .drop(2)  # drops the ROW labelled 2 (DataFrame.drop defaults to axis=0)
              .pipe(Boxer.fill_fight_index_cols)
              .set_index(1)
              .drop(['details', 'record'])
              .pipe(Boxer._strip_cells)
             )

        # Only a cell with a 'won'/'drawn' marker yields a ruling; copy the
        # first one found across both columns.
        rulings = [Boxer.ruling_from_name(x) for x in df.loc['name', ].tolist()]
        if all(ruling is None for ruling in rulings):
            raise ValueError(
                "Neither name cell carries a 'won'/'drawn' marker; "
                "cannot determine the ruling"
            )
        df.loc['ruling', ] = rulings
        df.loc['ruling', ] = df.loc['ruling', ~df.loc['ruling', ].isna()].iat[0]
        # Per-column result; must be derived before the names are cleaned,
        # because cleaning removes the marker.
        df.loc['result', ] = [Boxer.result_from_name(x) for x in df.loc['name', ].tolist()]
        # Clean name row
        df.loc['name', ] = [Boxer.clean_name(x) for x in df.loc['name', ].tolist()]
        # Fill winner across both columns
        df.loc['winner', ] = Boxer.winner_row(df)
        return df

    @staticmethod
    def fill_fight_index_cols(input_df):
        """Return a copy with the label cells of the first two rows filled in.

        The cell in the second column is set to ``'name'`` for the first row
        and ``'ranking'`` for the second row.
        """
        df = input_df.copy()
        df.iloc[0, 1] = 'name'
        df.iloc[1, 1] = 'ranking'
        return df

    @staticmethod
    def pivot_fight_df(input_df, name, fight_date, url):
        """Reshape a cleaned bout frame into rows describing the opponent.

        Columns whose ``name`` entry differs from ``name`` are kept and
        transposed.  Every resulting column except ``winner`` is prefixed with
        ``opponent_``; ``fight_id`` (last path segment of ``url``), ``name``,
        ``date`` and ``url`` are then added and the frame is indexed by
        ``(name, fight_id)``.
        """
        df = input_df.copy()
        df = (df.loc[:, df.loc['name', ] != name]
              .T
              .rename(columns=lambda x: 'opponent_' + x if x not in ['winner'] else x)
              .assign(fight_id=url.split('/')[-1],
                      name=name,
                      date=fight_date,
                      url=url
                     )
              .set_index(['name', 'fight_id'])
             )
        return df

    @staticmethod
    def _split_outcome(cell):
        """Split a name cell into ``(name, outcome, ruling)``.

        ``outcome`` is ``'won'``, ``'drawn'`` or ``None`` when the cell has no
        marker (the cell is then returned unchanged as ``name``).  ``ruling``
        is the stripped text after the marker, or ``None`` if there is none.
        """
        match = Boxer._OUTCOME_RE.match(cell)
        if match is None:
            return cell, None, None
        ruling = match.group('ruling')
        if ruling is not None:
            ruling = ruling.strip()
        return match.group('name').strip(), match.group('outcome'), ruling

    @staticmethod
    def ruling_from_name(name):
        """Return the text after the 'won'/'drawn' marker, or ``None``."""
        return Boxer._split_outcome(name)[2]

    @staticmethod
    def result_from_name(name):
        """Return ``'win'``, ``'draw'`` or, when no marker is present, ``'loss'``."""
        outcome = Boxer._split_outcome(name)[1]
        return {'won': 'win', 'drawn': 'draw'}.get(outcome, 'loss')

    @staticmethod
    def clean_name(name):
        """Return the name cell with any 'won'/'drawn' suffix removed."""
        return Boxer._split_outcome(name)[0]

    @staticmethod
    def winner_row(df):
        """Return the winner's name, or ``'N/A'`` when every result is a draw."""
        if (df.loc['result', ] == 'draw').all():
            return 'N/A'
        return df.loc['name', df.loc['result', ] == 'win'].iat[0]

    def scrape_fights(self, delay=30):
        """Scrape every URI in ``fight_uris`` and store the table on ``df_fights``.

        Args:
            delay: Seconds to sleep after each bout request (default 30, the
                value that used to be hard-coded) -- presumably to avoid
                hammering the site.

        A failure on any bout prints the offending URL and re-raises the
        original exception.  With no URIs, ``df_fights`` becomes an empty
        DataFrame instead of ``pd.concat`` raising on an empty list.
        """
        frames = list()
        for uri in self.fight_uris:
            url = Boxer.base_url + uri
            try:
                # Use this boxer's own session rather than a notebook global.
                raw, fight_date = Boxer.scrape_fight_url(url, self.session)
                frame = (raw.pipe(Boxer.clean_fight_df)
                         .pipe(Boxer.pivot_fight_df, self.name, fight_date, url)
                        )
            except Exception:
                print(f"Error on url: {url}")
                raise
            frames.append(frame)
            time.sleep(delay)
        self.df_fights = pd.concat(frames) if frames else pd.DataFrame()

In [4]:
# Both boxers share the logged-in session created above.
fury = Boxer('Tyson Fury', 479205, session)
wilder = Boxer('Deontay Wilder', 468841, session)

# Collect the bout URIs first, then scrape each bout; Fury is fully processed
# before Wilder.
for boxer in (fury, wilder):
    boxer.scrape_fight_uris()
    boxer.scrape_fights()

In [6]:
# Stack the two per-boxer fight tables (Fury's rows first) and preview the top.
frames = [boxer.df_fights for boxer in (fury, wilder)]
df = pd.concat(frames)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,opponent_name,opponent_ranking,opponent_before fight,opponent_after fight,opponent_age,opponent_stance,opponent_height,opponent_reach,opponent_won,opponent_lost,opponent_drawn,opponent_KOs,opponent_ruling,opponent_result,winner,date,url,opponent_last 6
name,fight_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Tyson Fury,2424982,Deontay Wilder,5.0,756.3 points,481.3 points,34,orthodox,6′ 7″ / 201cm,83″ / 211cm,42,0,1,41,TKO round 7,loss,Tyson Fury,"Saturday 22, February 2020",https://boxrec.com/en/event/798948/2424982,
Tyson Fury,2380608,Otto Wallin,59.0,42.80 points,42.80 points,28,southpaw,6′ 5½″ / 197cm,78″ / 198cm,20,0,0,13,UD round 12,loss,Tyson Fury,"Saturday 14, September 2019",https://boxrec.com/en/event/793512/2380608,
Tyson Fury,2335845,Tom Schwarz,60.0,27.80 points,27.80 points,25,orthodox,6′ 5½″ / 197cm,,24,0,0,16,TKO round 2,loss,Tyson Fury,"Saturday 15, June 2019",https://boxrec.com/en/event/784993/2335845,
Tyson Fury,2281110,Deontay Wilder,5.0,635.7 points,557.7 points,33,orthodox,6′ 7″ / 201cm,83″ / 211cm,40,0,0,39,SD,draw,,"Saturday 1, December 2018",https://boxrec.com/en/event/775415/2281110,
Tyson Fury,2256926,Francesco Pianeta,,23.44 points,23.44 points,33,southpaw,6′ 5″ / 196cm,80″ / 203cm,35,4,1,21,PTS round 10,loss,Tyson Fury,"Saturday 18, August 2018",https://boxrec.com/en/event/769039/2256926,
Tyson Fury,2243639,Sefer Seferi,33.0,3.844 points,3.844 points,39,orthodox,,,23,1,0,21,RTD round 4,loss,Tyson Fury,"Saturday 9, June 2018",https://boxrec.com/en/event/768104/2243639,
Tyson Fury,2012278,Wladimir Klitschko,,1454 points,929.2 points,39,orthodox,6′ 6″ / 198cm,81″ / 206cm,64,3,0,53,UD round 12,loss,Tyson Fury,"Saturday 28, November 2015",https://boxrec.com/en/event/721333/2012278,
Tyson Fury,1942244,Christian Hammer,18.0,201.2 points,158.5 points,27,orthodox,6′ 2½″ / 189cm,78″ / 198cm,17,3,0,10,RTD round 8,loss,Tyson Fury,"Saturday 28, February 2015",https://boxrec.com/en/event/705496/1942244,
Tyson Fury,1921854,Dereck Chisora,10.0,310.8 points,202.8 points,30,orthodox,6′ 1½″ / 187cm,74″ / 188cm,20,4,0,13,RTD round 10,loss,Tyson Fury,"Saturday 29, November 2014",https://boxrec.com/en/event/699689/1921854,
Tyson Fury,1859820,Joey Abell,47.0,6.614 points,6.614 points,32,southpaw,6′ 4″ / 193cm,76″ / 193cm,29,7,0,28,TKO round 4,loss,Tyson Fury,"Saturday 15, February 2014",https://boxrec.com/en/event/683497/1859820,


In [8]:
# to_excel does not create missing folders, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
df.to_excel(r'data/FuryWilderFightHistory.xlsx')