In [21]:
#import our libraries
import sys
import time
import psycopg2
import pandas as pd
import numpy as np
import urllib2
import re
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database



In [22]:
def make_it_lower(array):
    
    new_array = []
    for item in array:
        new_array.append(item.lower())
        
    return new_array

In [23]:
def bs_column_names(tag):
    return tag.name == 'th' and tag.has_attr('width') 

In [24]:
def get_column_names(namedata):
    
    column_names = []
    for name in namedata.find_all(bs_column_names):
        try:
            match = re.search('([A-Z3].*[A-Z])', str(name))
            #if name.get_text() not in column_names: 
            if match.group(0).strip() not in column_names: 
                column_names.append(match.group(0).strip())
        except:
            a = 1
            
    return column_names

In [25]:
def bs_column_data(tag):
    return tag.name == 'td' and not tag.has_attr('style')

In [26]:
def get_column_data(datadata):

    column_data = []
    for data in datadata.find_all(bs_column_data):
        try:
            match = re.search('[0-9]*[0-9\-]*[0-9\-]*[0-9\-]', str(data))
            column_data.append(match.group(0).strip())
        except:
            a = 1 
            
    return column_data

In [27]:
def clean_player(mydict):
    mykeys = mydict.keys()
    myvals = mydict.values()

    for ii in np.arange(len(mykeys)):
        match_key = re.search('(.*?)-(.)', mykeys[ii])
        match_value = re.search('(.*?)-(.*)', myvals[ii])
        if match_key is not None:
            mydict[match_key.group(1)] = match_value.group(1)
            newkey = match_key.group(1)
            newkey = newkey.replace(newkey[-1],match_key.group(2))
            mydict[newkey] = match_value.group(2)
    
    return mydict

In [28]:
def build_player(bit1, alldict, team, data_columns):
 
    #print bit1.prettify()
    #team1_columns = get_column_names(bit1)
    #print team1_columns
    column_data = get_column_data(bit1)
    #print column_data
    matches = re.findall(', ([A-Z])', str(bit1))
    #print matches


    for player in bit1.find_all('a'):

        #start a dictionaries
        team1_dict1 = {'team_name':team, 'player':'', 'player_pos':'', 'player_url':''}
        team1_dict2 = {}

        #start filling dictionaries
        team1_dict1['player_url'] = player['href']
        team1_dict1['player'] = player.get_text().replace(r'\n', '').strip()
        team1_dict1['player_pos'] = matches[0]
        del matches[0]
        #print team1_dict1
   
        for column in data_columns:
            #print column
            #print column_data[0]
            team1_dict2[column] = column_data[0]
            del column_data[0]
        #print team1_dict2     

        #combine the two dictionaries
        finaldict = team1_dict1.copy()
        finaldict.update(team1_dict2)
        finaldict = clean_player(finaldict)
        alldict.append(finaldict)
    
    return alldict



In [29]:
def build_box_score(soup):

    alldict1 = []
    try:
        box_score = soup.body.div.table

        #begin with the first team
        bit1 = box_score.contents[1]
        team = bit1.th.get_text()
        team = team.replace(r'\n', '')
        team = team.strip()
        data_columns = get_column_names(bit1)
        data_columns = make_it_lower(data_columns)
        alldict1 = build_player(box_score.contents[3], alldict1, team, data_columns)
        alldict1 = build_player(box_score.contents[7], alldict1, team, data_columns)

        #now do the second team
        bit1 = box_score.contents[13]
        team = bit1.th.get_text()
        team = team.replace(r'\n', '')
        team = team.strip()
        data_columns = get_column_names(bit1)
        data_columns = make_it_lower(data_columns)
        alldict1 = build_player(box_score.contents[15], alldict1, team, data_columns)
        alldict1 = build_player(box_score.contents[19], alldict1, team, data_columns)
    except:
        alldict1 = []
        
    return alldict1

In [30]:
def get_med_dir(my_con, boxscore_table_name, id):
    '''
    function to return a directory folder based on 
    specified user inputs and knowledge of the games db.
    
    my_con: connection to Postgres server
    boxscore_table_name: name of table on Postgres server to query
    id: unique ESPN game id to find month and year of
    '''
    
    
    sql_query = " SELECT year, month FROM %s WHERE id=%s; " % (boxscore_table_name, id)
    #print sql_query
    try:
        from_sql_query = pd.read_sql_query(sql_query, my_con)
        #print from_sql_query.head(5)
    except:
        print '  games table, %s, does not exist' % boxscore_table_name

    month = from_sql_query['month'][0]
    year = from_sql_query['year'][0]
    if int(month) > 7:
        med_dir = str(year) + '-' + str(int(year)+1) + '/'
    else:
        med_dir = str(int(year)-1) + '-' + str(year) + '/'

    return med_dir

In [31]:
def create_snippet(file_in, file_out=None):
    
    
    if file_out is None:
        file_out = 'temp.txt'
    else:
        file_out = str(file_out)
    
    #print 'File in:', file_in
    #print 'File out:', file_out
    
    #read in entire web page
    target = open(file_in, 'r')
    text = target.read()
    target.close()
    
    #apply BS
    soup = BeautifulSoup(text, 'lxml')
    box_score = soup.div(id='my-players-table')

    #read the file back in
    target = open(file_out, 'w')
    text = target.write(str(box_score))
    target.close()
    
    #reapply BS - makes the structure a bit more managable 
    #  and hopefully consistent :|
    target = open(file_out, 'r')
    text = target.read()
    target.close()
    soup = BeautifulSoup(text, 'lxml')
    
    #resave file with better structure
    #target = open(file_out, 'w')
    #text = target.write(str(soup.prettify()))
    #target.close()

    return soup

In [32]:
def start_games_db(dbname, username):
    print '  Firing up the data base.'
    
    engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
    print '    DB url:', engine.url
    db_exist = database_exists(engine.url)
    if not db_exist:
        create_database(engine.url)
    print '    DB exists? %s' % db_exist
    
    out_dict = {'dbname':dbname, 'username':username, 
                'exists':db_exist, 'engine_url':engine.url, 'engine':engine}
    
    return out_dict

In [33]:
def query_boxscore_db(my_con, boxscore_table_name):
    print '    Now getting games for which to find stats.'
    
    sql_query = " SELECT id FROM %s WHERE in_hand='%s' and id>284000064; " % (boxscore_table_name, 'yes')
    #print sql_query
    try:
        from_sql_query = pd.read_sql_query(sql_query, my_con)
        #print from_sql_query.head(5)
    except:
        print '  games table, %s, does not exist' % boxscore_table_name

    
    return from_sql_query

In [34]:
def make_gamestats_db(reset, dataframe, db_connect, stats_table):
    
    #print reset
    #print dataframe
    #print stats_table
    #print db_connect
    
    if reset == 1:
        my_if_exists = 'replace'
    else:
        my_if_exists = 'append'
    
    #print my_if_exists
    dataframe.to_sql(stats_table, db_connect['engine'], if_exists=my_if_exists)

    reset = 0 #now it will append instead of replacing
    return reset

In [None]:
def main(remake_db=False):
    
    dbname = 'ncaa_mbb_db'
    username = 'smaug'

    boxscore_dir = 'boxscore_pages/'
    boxscore_file = 'ncaa_mbb_boxscore_DDDDDDDDD.txt'
    boxscore_table_name = 'games'

    stats_table_name = 'stats'

    reset = 1
    if remake_db:
        print '  Now remaking the statistics database.'
        
        #fire up the database engine
        db_connect = start_games_db(dbname, username)
        #print db_engine
        
        #get a connection to the database
        my_con = None
        my_con = psycopg2.connect(database=dbname, user=username)
        #print my_con
        
        
        #query the boxscore database
        games_to_get = query_boxscore_db(my_con, boxscore_table_name)
        #print len(games_to_get)

        cnt = 1
        alldict = [] #this is a guess!!!!
        team = 'unknown'#this is a guess!!!!
        for game_to_get in games_to_get['id']:
            print game_to_get

            med_dir = get_med_dir(my_con, boxscore_table_name, game_to_get)    
            this_file = (boxscore_dir + med_dir + 
                        boxscore_file.replace('DDDDDDDDD', str(game_to_get)))
            this_game = create_snippet(this_file)
            #print this_game
            
            my_boxscore = build_box_score(this_game)
            if my_boxscore != []:
                #print my_boxscore
                df_team1 = pd.DataFrame.from_records(my_boxscore)
                #df_team1.to_csv('example1.csv')
                #df_team2 = pd.DataFrame.from_csv('example1.csv')
                #print df_team2

                #print df_team1
                reset = make_gamestats_db(reset, df_team1, 
                                          db_connect, stats_table_name)

            #if cnt >=4:
            #    sys.exit(0)
            cnt = cnt + 1


In [None]:
# boilerplate to execute call to main() function
if __name__ == '__main__':
    main(remake_db=True)

  Now remaking the statistics database.
  Firing up the data base.
    DB url: postgres://smaug@localhost/ncaa_mbb_db
    DB exists? True
    Now getting games for which to find stats.
290502599
290500222
290502429
290502250
290500026
290500085
290500130
290502115
290502210
290502229
290502341
290502405
290502430
290502514
290502681
290502803
290502908
290500290
290502454
290500249
290502031
290502393
290502443
290502710
290500140
290500045
290500084
290490153
290490235
290490097
290490154
290492305
290490356
290490270
290490008
290500155
290502172
290502483
290502635
290500147
290500204
290502501
290502502
290500030
290500278
290502608
290490042
290490047
290490052
290490057
290490107
290490111
290490113
290490119
290490218
290490276
290490277
290490322
290490325
290490349
290490350
290492006
290492084
290492117
290492198
290492241
290492244
290492272
290492275
290492325
290470041
290492520
290492542
290492619
290492649
290492670
290492711
290492466
290490036
290490082
290490145
29049