# In [22]:
import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from google.cloud import bigquery

class NBA:
    """Scrape a single HTML table into a pandas DataFrame.

    On construction the page at ``url`` is downloaded, the ``<table>`` whose
    ``id`` equals ``table_id`` is located, and its header and body cells are
    extracted as described by three schema dictionaries. The result is stored
    in ``self.df``. If any step raises, the failure is logged as a warning and
    ``self.df`` is an empty DataFrame.

    Schema dictionaries:
        column_schema: ``{'column_attrs': [tag, ...], 'offset': [off, ...],
            'shift': n}``. Starting at the table, ``findAll(tag)`` is applied
            for each tag in turn, each result being narrowed by the matching
            offset. The text of the final elements, from position ``shift``
            onward, becomes the column names.
        row_schema: ``{'row_attrs': [tag, ...], 'row_offset': [off, ...]}``.
            Traversed the same way; the final result is the list of rows.
        data_schema: ``{'data_attrs': tag_or_tags, 'data_offset': n}``. The
            cells of each row are ``row.findAll(data_attrs)``; the first ``n``
            extracted values of every row are dropped.

    An offset is ``None`` (keep every match), an index (``1`` or ``'1'``) or a
    slice written as a string (``'2:'``).

    Args:
        url: Address of the page to download.
        table_id: ``id`` attribute of the table to scrape.
        column_schema: See above.
        row_schema: See above.
        data_schema: See above.
        use_links: Cell positions (counted before ``data_offset`` is applied)
            whose value is the ``href`` of the cell's first link instead of
            the cell text.
        add_links: ``{new column: cell position}``; adds a column holding the
            ``href`` of the first link in that cell.
        add_links_to_text: ``{new column: cell position}``; adds a column
            holding ``str({link text: href, ...})`` for every link in that
            cell, built from ``None`` when the cell is absent or has no link.
        name_change: ``{column index: new name}`` overrides applied to the
            scraped column names.
        filter_rows: ``{column: value}``; rows whose column equals the value
            are removed.
    """

    # Seconds to wait for the server before abandoning the download.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, table_id, column_schema, row_schema, data_schema,
                 use_links=None, add_links=None, add_links_to_text=None,
                 name_change=None, filter_rows=None):
        self.url = url
        self.table_id = table_id
        # Fresh containers per instance instead of shared mutable defaults.
        self.use_links = [] if use_links is None else use_links
        self.add_links = {} if add_links is None else add_links
        self.column_schema = column_schema
        self.row_schema = row_schema
        self.data_schema = data_schema
        self.name_change = {} if name_change is None else name_change
        self.filter_rows = {} if filter_rows is None else filter_rows
        self.add_links_to_text = (
            {} if add_links_to_text is None else add_links_to_text
        )

        try:
            self.get_soup()
            self.get_columns()
            self.get_rows()
            self.get_data()
            self.add_link()
            self.add_link_to_text()
            self.filter_by_value()
        except Exception:
            # Best effort: callers test ``len(obj.df)``, so fall back to an
            # empty frame, but leave a trace of what went wrong.
            logging.getLogger(__name__).warning(
                "Could not scrape table %r from %s", table_id, url,
                exc_info=True,
            )
            self.df = pd.DataFrame()

    def get_soup(self):
        """Download the page and keep the target table in ``self.soup``."""
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(response.content, 'lxml')
        self.soup = page.find('table', {"id": self.table_id})

    @staticmethod
    def _parse_offset(offset):
        """Turn a schema offset into something usable as a list subscript.

        Integers and slices pass through; ``'3'`` becomes ``3`` and a string
        containing ``:`` (``'2:'``, ``'1:5'``, ``'::2'``) becomes a slice.
        """
        if isinstance(offset, (int, slice)):
            return offset
        text = str(offset)
        if ':' not in text:
            return int(text)
        bounds = [
            None if part.strip() in ('', 'None') else int(part)
            for part in text.split(':')
        ]
        if len(bounds) > 3:
            raise ValueError(f"invalid slice offset: {offset!r}")
        return slice(*bounds)

    def _select(self, tags, offsets):
        """Descend from the table: one ``findAll`` per tag, then its offset."""
        node = self.soup
        for position, tag in enumerate(tags):
            offset = offsets[position]
            node = node.findAll(tag)
            if offset is not None:
                node = node[self._parse_offset(offset)]
        return node

    def _cells(self, row):
        """Return the data cells of ``row`` as selected by ``data_schema``."""
        return row.findAll(self.data_schema['data_attrs'])

    def get_columns(self):
        """Read the column names into ``self.columns`` and apply renames."""
        headers = self._select(self.column_schema['column_attrs'],
                               self.column_schema['offset'])
        names = [header.getText() for header in headers]
        # ``shift`` is the start of a slice over the header texts.
        start = self._parse_offset(f"{self.column_schema['shift']}:")
        self.columns = names[start]

        for index, new_name in self.name_change.items():
            self.columns[index] = new_name

    def get_rows(self):
        """Collect the table rows into ``self.rows``."""
        self.rows = self._select(self.row_schema['row_attrs'],
                                 self.row_schema['row_offset'])

    def get_data(self):
        """Extract cell values into ``self.data`` and build ``self.df``.

        Rows without any data cell are skipped. The rows that were kept are
        remembered so that the link columns added afterwards are read from
        the same row as the values they sit next to.
        """
        offset = self.data_schema["data_offset"]

        kept_rows = []
        values = []
        for row in self.rows:
            cells = [
                cell.a['href'] if position in self.use_links else cell.getText()
                for position, cell in enumerate(self._cells(row))
            ][offset:]
            if cells:
                kept_rows.append(row)
                values.append(cells)

        width = max(map(len, values), default=0)
        self._data_rows = kept_rows
        self.data = [self.pad_list(cells, width) for cells in values]

        # Built from the filtered rows, so frame row i is ``_data_rows[i]``.
        self.df = pd.DataFrame(data=self.data, columns=self.columns)

    def add_link(self):
        """Add one column per ``add_links`` entry with the cell's first href."""
        for key, position in self.add_links.items():
            self.df[key] = [
                self._cells(row)[position].a['href']
                for row in self._data_rows
            ]

    def add_link_to_text(self):
        """Add one column per ``add_links_to_text`` entry.

        Each value is the string form of a ``{link text: href}`` dictionary
        covering every link in the chosen cell, or of ``None`` when the cell
        does not exist or contains no link.
        """
        for key, position in self.add_links_to_text.items():
            column = []
            for row in self._data_rows:
                cells = self._cells(row)
                in_range = 0 <= position < len(cells)
                cell = cells[position] if in_range else None
                if cell is None or cell.a is None:
                    column.append(None)
                else:
                    column.append({
                        anchor.getText(): anchor['href']
                        for anchor in cell.findAll('a')
                    })

            self.df[key] = column
            self.df[key] = self.df[key].astype(str)

    def pad_list(self, l, n):
        """Extend ``l`` in place with ``""`` up to length ``n``; return it."""
        l.extend([""] * (n - len(l)))
        return l

    def filter_by_value(self):
        """Drop the rows matching any ``filter_rows`` column/value pair."""
        for key, value in self.filter_rows.items():
            self.df = self.df[self.df[key] != value].reset_index(drop=True)

def pbp(write_disposition='WRITE_TRUNCATE'):
    """Scrape play-by-play tables for unloaded games and load them to BigQuery.

    Picks up to ten random boxscores that appear in the ``schedule`` table but
    not in ``pbp_raw``, scrapes the ``pbp`` table of each game page on
    basketball-reference.com, and loads the combined rows into ``pbp_raw``.
    Games whose scrape yields no rows are skipped; if no game yields rows,
    nothing is loaded.

    Args:
        write_disposition: BigQuery write disposition for the load job.
            NOTE(review): the default replaces the whole table on every run,
            which discards the games the query above just excluded. It looks
            like ``'WRITE_APPEND'`` was intended - confirm before relying on
            ``pbp_raw`` to accumulate history.
    """
    client = bigquery.Client()

    boxscore_query = """
    WITH
      pbp AS (
      SELECT
        DISTINCT boxscore
      FROM
        `dulcet-name-296415.nba_test.pbp_raw`),
      sch AS (
      SELECT
        DISTINCT boxscore
      FROM
        `dulcet-name-296415.nba_test.schedule`)
    SELECT
      sch.boxscore
    FROM
      sch
    LEFT JOIN
      pbp
    ON
      sch.boxscore = pbp.boxscore
    WHERE
      pbp.boxscore IS NULL
    ORDER BY
      RAND()
    LIMIT
      10
    """

    boxscores = client.query(boxscore_query).to_dataframe().boxscore.tolist()

    url = "https://www.basketball-reference.com{}"

    # Header names come from the second <tr>; data rows start at the third.
    pbp_col_attrs = {"column_attrs": ['tr', 'th'], "shift": 0, "offset": ['1', None]}
    pbp_row_attrs = {"row_attrs": ['tr'], "row_offset": ['2:']}
    pbp_data_attrs = {"data_attrs": ['td'], "data_offset": 0}

    frames = []

    for box in boxscores:

        temp_url = url.format(box.replace("boxscores", "boxscores/pbp"))

        # Named ``game`` so the local does not shadow this function's name.
        game = NBA(url=temp_url,
                   table_id='pbp',
                   column_schema=pbp_col_attrs,
                   row_schema=pbp_row_attrs,
                   data_schema=pbp_data_attrs,
                   name_change={2: "plus1", 4: 'plus2'},
                   add_links_to_text={'tm1_players': 1, "tm2_players": 5})

        plays = game.df
        if len(plays) == 0:
            continue

        # Columns 1 and 5 are headed by the two team names; keep the names as
        # values and give the columns fixed labels that match the schema.
        tm1_col, tm2_col = plays.columns[1], plays.columns[5]
        plays['boxscore'] = box
        plays['tm1_name'] = tm1_col
        plays['tm2_name'] = tm2_col

        frames.append(plays.rename(columns={tm1_col: 'tm1_play', tm2_col: 'tm2_play'}))

    if not frames:
        # Nothing scraped: do not submit a load job with an empty frame.
        return

    out = pd.concat(frames, ignore_index=True)

    schema = [
        bigquery.SchemaField('Time', 'STRING'),
        bigquery.SchemaField('tm1_play', 'STRING'),
        bigquery.SchemaField('plus1', 'STRING'),
        bigquery.SchemaField('Score', 'STRING'),
        bigquery.SchemaField('plus2', 'STRING'),
        bigquery.SchemaField('tm2_play', 'STRING'),
        bigquery.SchemaField('tm1_players', 'STRING'),
        bigquery.SchemaField('tm2_players', 'STRING'),
        bigquery.SchemaField('boxscore', 'STRING'),
        bigquery.SchemaField('tm1_name', 'STRING'),
        bigquery.SchemaField('tm2_name', 'STRING'),
    ]

    job_config = bigquery.LoadJobConfig(
        write_disposition=write_disposition,
        schema=schema,
    )

    table_ref = "dulcet-name-296415.nba_test.pbp_raw"

    job = client.load_table_from_dataframe(out, table_ref, job_config=job_config, location="US")

    job.result()  # Waits for table load to complete.

# In [None]:
# Run the scrape-and-load only when executed directly (script or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    pbp()