In [24]:
import re
from io import StringIO

import pandas as pd
import pandas_profiling
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Data Collection: Scraping from Baseball Reference (BR)
Baseball Reference is the only resource I found with historical data for the KBO preceding 2014. For thoroughness, I pulled all of the available data. The scraping takes two steps because individual players' season-to-season performance data is nested away on separate pages of the website.

In [7]:
# Scrape BR's KBO encyclopedia page for the IDs of the individual team/season pages.
url = "https://www.baseball-reference.com/register/league.cgi?code=KBO&class=Fgn"
# A timeout keeps the cell from hanging forever if the site stops responding.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently extracting zero IDs from an error page.
r.raise_for_status()
html_doc = r.text
# Raw string: the backslashes are regex escapes, not Python string escapes
# (the non-raw form triggers invalid-escape warnings on newer Python versions).
regex = r"\/register\/team\.cgi\?id=\w{8}"
matches = re.findall(regex, html_doc)
# Each match ends with the 8-character page ID captured by \w{8}.
team_season_ids = [match[-8:] for match in matches]

In [21]:
# Scrape the first table of each team/season page and combine them into one DataFrame.
base_url = "https://www.baseball-reference.com/register/team.cgi?id="
team_season_frames = []
for team_season_id in tqdm(team_season_ids):
    team_season_url = base_url + team_season_id
    # Time out rather than hang, and stop on HTTP errors instead of parsing an error page.
    response = requests.get(team_season_url, timeout=30)
    response.raise_for_status()
    html_doc = response.text
    # Wrap the markup in StringIO: newer pandas deprecates passing literal HTML to read_html.
    df_new = pd.read_html(StringIO(html_doc))[0]
    # Parse the page once; the <title> words are used as: [0] season, [1] + [2] team name.
    title_words = BeautifulSoup(html_doc, "html.parser").title.string.split()
    df_new['Team'] = title_words[1] + " " + title_words[2]
    df_new['Season'] = title_words[0]
    team_season_frames.append(df_new)

# Concatenate once at the end (re-concatenating inside the loop copies all prior rows each time).
# With no pages scraped, fall back to an empty DataFrame rather than letting concat raise.
df = pd.concat(team_season_frames, ignore_index=True) if team_season_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████| 313/313 [04:20<00:00,  1.20it/s]


In [23]:
# Persist the raw scrape so later work does not need to hit the website again.
# Note: the target name has no ".csv" extension, and the DataFrame index is written as the first column.
df.to_csv(path_or_buf='KBO Projections Raw Data')

In [22]:
# First look at the scraped data: size, first/last rows, summary statistics, and column types.
print(df.shape)
print(df.head())
print(df.tail())
print(df.describe())
# info() prints its summary itself and returns None; wrapping it in print() adds a stray "None" line.
df.info()
print(df.dtypes)

(8383, 29)
    Rk                    Name Age   G   PA   AB   R   H  2B  3B  ...    OPS  \
0  1.0            Gweon Su An?  27  24   16   15   4   3   0   0  ...  0.450   
1  2.0           Ju-hwan Choi*  32  29  114  103  14  27   7   0  ...  0.821   
2  3.0           Sang-ho Chung  37  16   34   32   0   6   3   0  ...  0.463   
3  4.0          Soo-bin Chung*  29  28  110  100  16  27   4   1  ...  0.648   
4  5.0  Jose Miguel Fernandez*  32  30  139  124  24  51   9   0  ...  1.058   

   TB  GDP  HBP  SH  SF  IBB  Notes          Team  Season  
0   3    1    0   0   0  0.0    NaN  Doosan Bears    2020  
1  52    3    2   0   2  1.0    NaN  Doosan Bears    2020  
2   9    2    0   1   1  0.0    NaN  Doosan Bears    2020  
3  33    1    0   3   0  0.0    NaN  Doosan Bears    2020  
4  75    5    1   0   3  0.0    NaN  Doosan Bears    2020  

[5 rows x 29 columns]
        Rk             Name   Age   G    PA    AB    R    H   2B  3B  ...  \
8378  15.0  Jeong-hwan Seo?    26  50   135   12

In [None]:
# Build a pandas-profiling report for the scraped DataFrame.
# The options are passed through unchanged: sort setting, full-width HTML style, no progress bar.
report = df.profile_report(
    sort='None',
    html={'style': {'full_width': True}},
    progress_bar=False,
)
# Leaving the report as the last expression renders it as the cell's output.
report

In [None]:
# list of obvious problems/first blush solutions
'''
some rows are the team total rows -> search and delete row entries that str.contains("players")
Rk col is useless->delete
Notes col is useless -> delete
Extract batting stance from Name column
Columns could be reordered
Null entries in CS, SB because base-stealing data wasn't recorded--
Null entries in IBB because intentional walk (intentional base on balls) data wasn't recorded--
Age data type should be int, not object
SB, CS dtype should be int
'''