In [50]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas_profiling

# Data Collection: Scraping from Baseball Reference (BR)
Baseball Reference is the only resource I found with historical data for the KBO preceding 2014. For thoroughness, I pulled all of the available data. There are two steps to the scraping because indivdual player season-to-season performance data is nested away on different individual pages of the website.

In [51]:
# scraping BR's KBO encyclopedia page for individual team/season page IDs
url = "https://www.baseball-reference.com/register/league.cgi?code=KBO&class=Fgn"
r = requests.get(url)
html_doc = r.text
regex = "\/register\/team\.cgi\?id=\w{8}"
matches = re.findall(regex, html_doc)
team_season_ids = [match[-8:] for match in matches]

In [52]:
# scraping each team/season page ID's table, and concatenating into one DataFrame
raw = pd.DataFrame()
base_url = "https://www.baseball-reference.com/register/team.cgi?id="
with tqdm(total=len(team_season_ids)) as pbar:    
    for id in team_season_ids:
        team_season_url = base_url + id
        html_doc = requests.get(team_season_url).text
        raw_new = pd.DataFrame(pd.read_html(html_doc)[0])
        raw_new['Team'] = str(BeautifulSoup(html_doc).title.string.split()[1]) + " " + str(BeautifulSoup(html_doc).title.string.split()[2])
        raw_new['Season'] = BeautifulSoup(html_doc).title.string.split()[0]
        raw = pd.concat([raw, raw_new], ignore_index=True)
        pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████████| 313/313 [14:52<00:00,  2.85s/it]


In [87]:
# saving to local directory, reading from saved csv to variable df
raw.to_csv('KBO_Projections_Raw_Data.csv')
df = pd.read_csv('KBO_Projections_Raw_Data.csv')

In [92]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8070 entries, 0 to 8381
Data columns (total 28 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8070 non-null   int64  
 1   Name        8070 non-null   object 
 2   Age         8070 non-null   object 
 3   G           8070 non-null   int64  
 4   PA          8070 non-null   int64  
 5   AB          8070 non-null   int64  
 6   R           8070 non-null   int64  
 7   H           8070 non-null   int64  
 8   2B          8070 non-null   int64  
 9   3B          8070 non-null   int64  
 10  HR          8070 non-null   int64  
 11  RBI         8070 non-null   int64  
 12  SB          4534 non-null   float64
 13  CS          4534 non-null   float64
 14  BB          8070 non-null   int64  
 15  SO          8070 non-null   int64  
 16  BA          8025 non-null   float64
 17  OBP         8041 non-null   float64
 18  SLG         8025 non-null   float64
 19  OPS         8025 non-null  

In [91]:
# delete rows that are team totals and not individual players
df.drop(df.loc[df['Rk'].isna()].index, inplace=True)
# delete cols that aren't substantive
df.drop(columns = ['Rk', 'Notes'], inplace=True)

In [None]:
# list of obvious problems/first blush solutions
'''
Extract batting stance from Name column
Columns could be reordered
Null entries in CS, SB because base-stealing data wasn't recorded--
Null entries in IBB because international balls data wasn't recorded--
Age data type should be int, not object
SB, CS dtype should be int
'''

# Pandas Profiling Report to check out again once data is cleaned
#report = df.profile_report(sort='None', html={'style':{'full_width': True}}, progress_bar=False)
#report