In [1]:
import pandas as pd
import numpy as np
import requests
import time
import re
import os

# Web Scraping

## Base

In [2]:
# Fetch the stats landing page once. Its <option> tags are parsed below into the
# conference and team lists, and the next cell reads `r.text` again.
# NOTE(review): 25805 is presumably the id of one conference - confirm.
r = requests.get('https://www.pba.ph/stats',
                 params={
                     'conf': 25805,
                     'type': 'AVG'
                 },
                 timeout=30)  # without a timeout a stalled connection blocks forever

# Each match is an (option value, option label) pair. The raw-string prefix
# keeps the pattern text unchanged while avoiding invalid-escape warnings for
# the `\>` and `\<` sequences.
options = re.findall(r'value="(.+)"\>(.+)\</option', r.text)

# Options whose label contains 'Cup' are treated as conferences; every other
# option is treated as a team.
conferences = [x for x in options if 'Cup' in x[1]]
teams = [x for x in options if x not in conferences]
stat_types = ['TOT', 'AVG']

# Create the output root. An already existing directory is fine, but any other
# failure (e.g. missing permissions) is now reported instead of being swallowed.
os.makedirs('data', exist_ok=True)

In [3]:
# Upper-cased first three characters of every logo file stem found on the page.
conf_teams = [
    stem[:3].upper()
    for stem in re.findall('logo/(.+).png', r.text)
]

In [13]:
def conf_namer(conf):
    """Build a short ``<year>_<code>`` tag from a conference label.

    The first four characters of ``conf`` are used as the year. The code is
    ``GOV`` when the label contains 'Governor', ``PH`` when it contains
    'Philippine' (checked in that order), and ``COM`` for anything else.
    """
    prefix = conf[:4]
    keyword_codes = (('Governor', 'GOV'), ('Philippine', 'PH'))
    # First matching keyword wins; labels matching neither fall back to 'COM'.
    code = next((abbr for keyword, abbr in keyword_codes if keyword in conf), 'COM')
    return prefix + '_' + code

def create_teams_csv():
    """Scrape per-team player stats and write one CSV per conference/team/stat type.

    For every entry of the module-level ``conferences``, every entry of
    ``teams`` after the first, and every value in ``stat_types``, this requests
    ``https://www.pba.ph/stats``, parses the first HTML table of the response,
    drops duplicate rows and the ``'Unnamed: 1'`` column, adds a ``Team``
    column holding the team's option value, and saves the result in the
    ``pba_team_csv`` directory under ``data``.

    Returns:
        list[str]: The CSV file names (without directory), in creation order.
    """
    # Built with os.path.join so the directory is valid on every platform; the
    # empty last component keeps the trailing separator of the old literal.
    path = os.path.join('data', 'pba_team_csv', '')
    print('Path',path)
    # Also creates a missing parent directory and tolerates an existing target.
    os.makedirs(path, exist_ok=True)

    titles = []
    for conf_id, conf_label in conferences:
        # NOTE(review): the first team option is skipped - presumably it is an
        # "all teams" placeholder; confirm against the page.
        for team_id, _team_label in teams[1:]:
            for stat_type in stat_types:
                req = requests.get('https://www.pba.ph/stats',
                                   params={
                                       'conf': conf_id,
                                       'team': team_id,
                                       'stat': 'sPointsAverage',
                                       'type': stat_type,
                                       'sort': 'DESC'
                                   },
                                   timeout=30)  # never hang on a stalled connection

                df = (pd.read_html(req.text)[0]
                      .drop_duplicates()
                      .drop(columns='Unnamed: 1')
                      .assign(Team=team_id))

                title = '_'.join([conf_namer(conf_label),
                                  team_id,
                                  stat_type]).replace(' ', '_') + '.csv'
                df.to_csv(os.path.join(path, title), index=False)
                print(title,'done')
                titles.append(title)
    return titles

In [14]:
# Run the per-team scrape; the returned list of CSV file names is the cell output.
create_teams_csv()

Path data\pba_team_csv\
2020_PH_ALA_TOT.csv done
2020_PH_ALA_AVG.csv done
2020_PH_GIN_TOT.csv done
2020_PH_GIN_AVG.csv done
2020_PH_BWE_TOT.csv done
2020_PH_BWE_AVG.csv done
2020_PH_COL_TOT.csv done
2020_PH_COL_AVG.csv done
2020_PH_MAG_TOT.csv done
2020_PH_MAG_AVG.csv done
2020_PH_MER_TOT.csv done
2020_PH_MER_AVG.csv done
2020_PH_NLX_TOT.csv done
2020_PH_NLX_AVG.csv done
2020_PH_NOR_TOT.csv done
2020_PH_NOR_AVG.csv done
2020_PH_PHX_TOT.csv done
2020_PH_PHX_AVG.csv done
2020_PH_ROS_TOT.csv done
2020_PH_ROS_AVG.csv done
2020_PH_SMB_TOT.csv done
2020_PH_SMB_AVG.csv done
2020_PH_TNT_TOT.csv done
2020_PH_TNT_AVG.csv done
2019_GOV_ALA_TOT.csv done
2019_GOV_ALA_AVG.csv done
2019_GOV_GIN_TOT.csv done
2019_GOV_GIN_AVG.csv done
2019_GOV_BWE_TOT.csv done
2019_GOV_BWE_AVG.csv done
2019_GOV_COL_TOT.csv done
2019_GOV_COL_AVG.csv done
2019_GOV_MAG_TOT.csv done
2019_GOV_MAG_AVG.csv done
2019_GOV_MER_TOT.csv done
2019_GOV_MER_AVG.csv done
2019_GOV_NLX_TOT.csv done
2019_GOV_NLX_AVG.csv done
2019_GOV_NOR

['2020_PH_ALA_TOT.csv',
 '2020_PH_ALA_AVG.csv',
 '2020_PH_GIN_TOT.csv',
 '2020_PH_GIN_AVG.csv',
 '2020_PH_BWE_TOT.csv',
 '2020_PH_BWE_AVG.csv',
 '2020_PH_COL_TOT.csv',
 '2020_PH_COL_AVG.csv',
 '2020_PH_MAG_TOT.csv',
 '2020_PH_MAG_AVG.csv',
 '2020_PH_MER_TOT.csv',
 '2020_PH_MER_AVG.csv',
 '2020_PH_NLX_TOT.csv',
 '2020_PH_NLX_AVG.csv',
 '2020_PH_NOR_TOT.csv',
 '2020_PH_NOR_AVG.csv',
 '2020_PH_PHX_TOT.csv',
 '2020_PH_PHX_AVG.csv',
 '2020_PH_ROS_TOT.csv',
 '2020_PH_ROS_AVG.csv',
 '2020_PH_SMB_TOT.csv',
 '2020_PH_SMB_AVG.csv',
 '2020_PH_TNT_TOT.csv',
 '2020_PH_TNT_AVG.csv',
 '2019_GOV_ALA_TOT.csv',
 '2019_GOV_ALA_AVG.csv',
 '2019_GOV_GIN_TOT.csv',
 '2019_GOV_GIN_AVG.csv',
 '2019_GOV_BWE_TOT.csv',
 '2019_GOV_BWE_AVG.csv',
 '2019_GOV_COL_TOT.csv',
 '2019_GOV_COL_AVG.csv',
 '2019_GOV_MAG_TOT.csv',
 '2019_GOV_MAG_AVG.csv',
 '2019_GOV_MER_TOT.csv',
 '2019_GOV_MER_AVG.csv',
 '2019_GOV_NLX_TOT.csv',
 '2019_GOV_NLX_AVG.csv',
 '2019_GOV_NOR_TOT.csv',
 '2019_GOV_NOR_AVG.csv',
 '2019_GOV_PHX_TOT.csv',

## Season

In [15]:
 def get_season_csv():
     """Scrape team-level stats and write one CSV per conference and stat type.

     For every entry of the module-level ``conferences`` and every value in
     ``stat_types``, this requests ``https://www.pba.ph/teamstats``, takes the
     first HTML table of the response without its first three rows, applies the
     fixed column names below, replaces ``Team`` with three-letter codes taken
     from the logo paths on the page, splits ``W-L`` into ``W`` and ``L``, and
     saves the result in the ``pba_season_csv`` directory under ``data``.

     Returns:
         list[str]: The CSV file names (without directory), in creation order.
     """
     # 'drop' names a column that is discarded again further down.
     columns = ['Team', 'W-L', 'GP',
                '3Pm', '3Pa', '3P%',
                '2Pm', '2Pa', '2P%',
                'FGm', 'FGa', 'FG%',
                'FTm', 'FTa', 'FT%',
                'dREB', 'oREB', 'REB',
                'drop',
                'AST', 'STL', 'BLK', 'TO', 'PTO', 'PF',
                'FBm', 'FBa', 'FBm%',
                'bPTS', 'PTS']

     # Built with os.path.join so the directory is valid on every platform; the
     # empty last component keeps the trailing separator of the old literal.
     path = os.path.join('data', 'pba_season_csv', '')
     print('Path:',path)
     # Also creates a missing parent directory and tolerates an existing target.
     os.makedirs(path, exist_ok=True)

     titles = []
     for conf_id, conf_label in conferences:
         for stat_type in stat_types:
             reqs = requests.get('https://www.pba.ph/teamstats',
                                 params={
                                     'conf': conf_id,
                                     'type': stat_type
                                 },
                                 timeout=30)  # never hang on a stalled connection
             team_codes = [x[:3].upper() for x in
                           re.findall('logo/(.+).png',reqs.text)]

             # NOTE(review): the first three rows are presumably header rows of
             # the scraped table - confirm against the page.
             df = pd.read_html(reqs.text)[0].iloc[3:,:].reset_index(drop=True)
             df.columns = columns
             # Only as many codes as there are table rows are used.
             df['Team'] = team_codes[:len(df)]

             # 'W-L' holds "<wins>-<losses>"; keep both halves as text.
             df['W'] = df['W-L'].apply(lambda x: re.findall('(.+)-',x)[0])
             df['L'] = df['W-L'].apply(lambda x: re.findall('-(.+)',x)[0])
             df = df.drop(columns=['drop','W-L'])

             title = '_'.join([conf_namer(conf_label),
                               stat_type]).replace(' ', '_') + '.csv'
             df.to_csv(os.path.join(path, title), index=False)

             titles.append(title)
             print(title,'done')

     return titles

In [16]:
# Run the team-level scrape; the returned list of CSV file names is the cell output.
get_season_csv()

Path: data\pba_season_csv\
2020_PH_TOT.csv done
2020_PH_AVG.csv done
2019_GOV_TOT.csv done
2019_GOV_AVG.csv done
2019_COM_TOT.csv done
2019_COM_AVG.csv done
2019_PH_TOT.csv done
2019_PH_AVG.csv done
2018_GOV_TOT.csv done
2018_GOV_AVG.csv done
2018_COM_TOT.csv done
2018_COM_AVG.csv done
2018_PH_TOT.csv done
2018_PH_AVG.csv done
2017_GOV_TOT.csv done
2017_GOV_AVG.csv done


['2020_PH_TOT.csv',
 '2020_PH_AVG.csv',
 '2019_GOV_TOT.csv',
 '2019_GOV_AVG.csv',
 '2019_COM_TOT.csv',
 '2019_COM_AVG.csv',
 '2019_PH_TOT.csv',
 '2019_PH_AVG.csv',
 '2018_GOV_TOT.csv',
 '2018_GOV_AVG.csv',
 '2018_COM_TOT.csv',
 '2018_COM_AVG.csv',
 '2018_PH_TOT.csv',
 '2018_PH_AVG.csv',
 '2017_GOV_TOT.csv',
 '2017_GOV_AVG.csv']

## Conference

In [17]:
def get_all_players_conference():
    """Scrape the all-teams player table and write one CSV per conference and stat type.

    For every entry of the module-level ``conferences`` and every value in
    ``stat_types``, this requests ``https://www.pba.ph/stats`` with
    ``team='all'``, parses the first HTML table of the response, drops the
    ``'Unnamed: 1'`` column, adds a ``Team`` column with three-letter codes
    taken from the logo paths on the page, removes duplicate rows, and saves
    the result in the ``pba_all_players_conference_csv`` directory under
    ``data``.

    Returns:
        list[str]: The CSV file names (without directory), in creation order.
    """
    # Built with os.path.join so the directory is valid on every platform; the
    # empty last component keeps the trailing separator of the old literal.
    path = os.path.join('data', 'pba_all_players_conference_csv', '')
    print('Path', path)
    # Also creates a missing parent directory and tolerates an existing target.
    os.makedirs(path, exist_ok=True)

    titles = []
    for conf_id, conf_label in conferences:
        for stat_type in stat_types:
            req = requests.get('https://www.pba.ph/stats',
                               params={
                                   'conf': conf_id,
                                   'team': 'all',
                                   'stat': 'sPointsAverage',
                                   'type': stat_type,
                                   'sort': 'DESC'
                               },
                               timeout=30)  # never hang on a stalled connection
            df = pd.read_html(req.text)[0].drop(columns='Unnamed: 1')
            # Named differently from the module-level `teams` list to avoid
            # shadowing it. The codes are attached before de-duplication because
            # the assignment needs one code per parsed table row.
            player_teams = [x[:3].upper() for x in re.findall('logo/(.+).png',req.text)]
            df['Team'] = player_teams
            df = df.drop_duplicates()

            title = '_'.join([conf_namer(conf_label),
                              'ALL',
                              stat_type]).replace(' ', '_') + '.csv'
            df.to_csv(os.path.join(path, title), index=False)
            print(title, 'done')
            titles.append(title)
    return titles


In [18]:
# Run the all-teams player scrape; the returned list of CSV file names is the cell output.
get_all_players_conference()

Path data\pba_all_players_conference_csv\
2020_PH_ALL_TOT.csv done
2020_PH_ALL_AVG.csv done
2019_GOV_ALL_TOT.csv done
2019_GOV_ALL_AVG.csv done
2019_COM_ALL_TOT.csv done
2019_COM_ALL_AVG.csv done
2019_PH_ALL_TOT.csv done
2019_PH_ALL_AVG.csv done
2018_GOV_ALL_TOT.csv done
2018_GOV_ALL_AVG.csv done
2018_COM_ALL_TOT.csv done
2018_COM_ALL_AVG.csv done
2018_PH_ALL_TOT.csv done
2018_PH_ALL_AVG.csv done
2017_GOV_ALL_TOT.csv done
2017_GOV_ALL_AVG.csv done


['2020_PH_ALL_TOT.csv',
 '2020_PH_ALL_AVG.csv',
 '2019_GOV_ALL_TOT.csv',
 '2019_GOV_ALL_AVG.csv',
 '2019_COM_ALL_TOT.csv',
 '2019_COM_ALL_AVG.csv',
 '2019_PH_ALL_TOT.csv',
 '2019_PH_ALL_AVG.csv',
 '2018_GOV_ALL_TOT.csv',
 '2018_GOV_ALL_AVG.csv',
 '2018_COM_ALL_TOT.csv',
 '2018_COM_ALL_AVG.csv',
 '2018_PH_ALL_TOT.csv',
 '2018_PH_ALL_AVG.csv',
 '2017_GOV_ALL_TOT.csv',
 '2017_GOV_ALL_AVG.csv']

## Player descriptions

In [19]:
def name_gen(x):
    """Turn a hyphen-separated player slug into an ``'F. Lastname'`` label.

    The initial is the upper-cased first character of the first hyphen-separated
    piece; the surname is the last piece passed through ``str.capitalize``.
    """
    pieces = x.split('-')
    surname = pieces[-1].capitalize()
    return '{}. {}'.format(pieces[0][0].upper(), surname)

def extract(desc,x):
    """Return one field of a ``'#<number> / <feet>'<inches> / <position>'`` description.

    Spaces are removed and the text is split on ``'/'``.

    Args:
        desc: Description text of a player.
        x: Field selector - ``0`` for the jersey number (first piece without
            its leading character), ``1`` for the height in inches (``None``
            when it cannot be parsed), ``2`` for the position (all remaining
            pieces re-joined with ``'/'``).

    Returns:
        The selected field: ``str``, ``int`` or ``None``.

    Raises:
        IndexError: If ``desc`` contains no ``'/'`` or ``x`` is not 0, 1 or 2.
    """
    descs = desc.replace(' ','').split('/')
    feet_inches = descs[1].split("'")
    try:
        height_in = 12 * int(feet_inches[0]) + int(feet_inches[1])
    except (ValueError, IndexError):
        # Height is missing or not in <feet>'<inches> form. Only these parsing
        # failures are absorbed, so KeyboardInterrupt and similar propagate.
        height_in = None

    return [descs[0][1:], height_in, '/'.join(descs[2:])][x]

# Output column names, in the order of the field selectors understood by `extract`.
pdata = ['j_number','height','pos']

def player_desc():
    """Scrape every player's description and save it as ``player_desc.csv`` under ``data``.

    Player links are collected from ``https://www.pba.ph/players``; each
    player's page is then fetched and its description line is parsed with
    ``extract`` into the columns named in ``pdata``. Display names come from
    ``name_gen``.

    Returns:
        pandas.DataFrame: One row per player with the columns ``Name``,
        ``j_number``, ``height`` and ``pos``.

    Raises:
        IndexError: If a player page contains no matching description line.
    """
    reqs = requests.get('https://www.pba.ph/players', timeout=30)
    # NOTE(review): only every second match is kept - presumably each player is
    # linked twice in the listing; confirm against the page.
    players = re.findall("href='players/(.+)'>",reqs.text)[::2]

    descriptions = []
    for player in players:
        # The timeout keeps one stalled page from blocking the whole scrape.
        req_player = requests.get('https://www.pba.ph/players/'+player,
                                  timeout=30).text
        descriptions.append(re.findall('(#.+/.*/.+)<br',req_player)[0])

    df_raw = pd.DataFrame({'Link':players,'Description':descriptions})
    df_new = pd.DataFrame()
    df_new['Name'] = df_raw['Link'].apply(name_gen)

    for field_idx, column in enumerate(pdata):
        # Bind the index as a default so the lambda does not depend on the loop variable.
        df_new[column] = df_raw['Description'].apply(
            lambda desc, field_idx=field_idx: extract(desc, field_idx))

    # Platform-independent path; create the directory in case no earlier cell did.
    os.makedirs('data', exist_ok=True)
    df_new.to_csv(os.path.join('data', 'player_desc.csv'),index=False)
    return df_new

In [20]:
# Run the player-description scrape; the returned DataFrame is the cell output.
player_desc()

Unnamed: 0,Name,j_number,height,pos
0,M. Abundo,29,68.0,G
1,R. Adams,1,,F/G
2,K. Agovida,17,,G
3,J. Aguilar,25,81.0,F
4,R. Aguilar,23,76.0,F
...,...,...,...,...
178,W. Wilson,28,74.0,F
179,A. Wong,3,,G
180,M. Wright,35,76.0,G
181,J. Yap,18,75.0,G
