In [1]:
#import packages

import sys
import re
import os
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [2]:
# Credentials and naming for the three main tables kept in `dbname`:
#   1. scoreboard: info regarding the various ESPN scoreboard pages
#   2. games: info regarding all games played
#        - filled with data scraped from the scoreboard pages
#   3. stats: info from all games played
#        - filled with data scraped from the games pages

username = 'smaug'
dbname = 'ncaa_mbb_db'

# --- scoreboard pages -------------------------------------------------------
scoreboard_dir = 'scoreboard_pages/'
# YYYY / MM / DD are placeholders substituted with the page's date.
scoreboard_file = 'ncaa_mbb_scoreboard_full_YYYYMMDD.txt'
scoreboard_table_name = 'scoreboard'

# One [first_day, last_day] pair (YYYYMMDD strings) per season.
# Seasons starting in 2002 through 2014 each span 1 Nov -> 30 Apr of the
# following year; the final window is listed explicitly because it stops at
# 20160123 instead. A generator (not a list comprehension) is used so no
# loop variable is left behind in the notebook namespace.
scoreboard_table_range = list(
    ['%d1101' % first_year, '%d0430' % (first_year + 1)]
    for first_year in range(2002, 2015)
)
scoreboard_table_range.append(['20151101', '20160123'])

# --- boxscore pages / games table -------------------------------------------
boxscore_dir = 'boxscore_pages/'
boxscore_file = 'ncaa_mbb_boxscore_DDDDDDDDD.txt'
boxscore_table_name = 'games'

# --- stats table ------------------------------------------------------------
stats_table_name = 'stats'


# For creating the base scoreboard database

In [15]:
### make base scoreboard database
#
# Builds `scoreboard_df`: one row per calendar day inside every
# [start, end] window of `scoreboard_table_range`, with the date split into
# zero-padded year / month / day strings and an `in_hand` flag ('yes'/'no')
# telling whether that day's scoreboard page file is already on disk.
# Writing the frame to the database is deliberately left commented out at
# the bottom of the cell.

# 'postgresql' is the dialect name documented by SQLAlchemy; the older
# 'postgres' alias is rejected by SQLAlchemy >= 1.4.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
# engine.url is wrapped in a 1-tuple because newer SQLAlchemy URL objects are
# tuple-like and would otherwise be unpacked by the % operator.
print('  DB url: %s' % (engine.url,))
db_exist = database_exists(engine.url)
if not db_exist:
    create_database(engine.url)
# Reports the state found *before* the create_database() call above.
print('  DB exists? %s' % db_exist)

con = None
con = psycopg2.connect(database=dbname, user=username)

# Expand every season window into individual days.
my_dates = []
my_years = []
my_months = []
my_days = []
for season_start, season_end in scoreboard_table_range:
    for day_stamp in pd.date_range(start=season_start, end=season_end, freq='D'):
        my_dates.append(day_stamp.strftime('%Y-%m-%d'))
        my_years.append(day_stamp.strftime('%Y'))
        my_months.append(day_stamp.strftime('%m'))
        my_days.append(day_stamp.strftime('%d'))

# Check which scoreboard pages are already saved locally.
in_hand = []
for year_str, month_str, day_str in zip(my_years, my_months, my_days):
    # Fill the YYYY / MM / DD placeholders of the file-name template.
    page_name = scoreboard_file.replace('YYYY', year_str)
    page_name = page_name.replace('MM', month_str)
    page_name = page_name.replace('DD', day_str)

    # Pages live in a per-season sub-directory named '<fall year>-<spring year>/':
    # months after July count towards the season that starts that year,
    # earlier months towards the season that started the year before.
    year_num = int(year_str)
    if int(month_str) > 7:
        season_dir = '%d-%d/' % (year_num, year_num + 1)
    else:
        season_dir = '%d-%d/' % (year_num - 1, year_num)

    # Direct filesystem check instead of shelling out to `ls` for every day.
    page_path = scoreboard_dir + season_dir + page_name
    in_hand.append('yes' if os.path.exists(page_path) else 'no')


scoreboard_df = pd.DataFrame({'date':my_dates, 
                              'year':my_years, 
                              'month':my_months,
                              'day':my_days,
                              'in_hand':in_hand,        
                            })
print(scoreboard_df.tail(5))


##################################################################
###are you really sure you want to rebuild the entire scoreboard database???
#scoreboard_df.to_sql(scoreboard_table_name, engine, if_exists='replace')
##################################################################



  DB url: postgres://smaug@localhost/ncaa_mbb_db
  DB exists? True
            date day in_hand month  year
2435  2016-01-19  19     yes    01  2016
2436  2016-01-20  20     yes    01  2016
2437  2016-01-21  21     yes    01  2016
2438  2016-01-22  22     yes    01  2016
2439  2016-01-23  23     yes    01  2016


In [5]:
# Sanity checks on the scoreboard table:
#   1. total number of rows
#   2. number of rows whose page is flagged in_hand = 'no'
con = None
con = psycopg2.connect(database=dbname, user=username)
print('   %s' % (con,))


scoreboard_queries = [
    "SELECT COUNT(*) FROM %s;" % (scoreboard_table_name),
    "SELECT COUNT(*) FROM %s WHERE in_hand='%s';" % (scoreboard_table_name, 'no'),
]

for sql_query in scoreboard_queries:
    print(sql_query)
    try:
        from_sql_query = pd.read_sql_query(sql_query, con)
    except Exception as query_error:
        # A missing table is the expected failure, but any other problem
        # (permissions, bad SQL, dropped connection) also ends up here, so
        # the underlying error is shown instead of being silently swallowed.
        print('  scoreboard_table does not exist')
        print('    underlying error: %s' % (query_error,))
        # Clear a possibly aborted transaction so the next query can run;
        # harmless if nothing is pending.
        con.rollback()
    else:
        print(from_sql_query)





   <connection object at 0x109a29770; dsn: 'dbname=ncaa_mbb_db user=smaug', closed: 0>
SELECT COUNT(*) FROM scoreboard;
   count
0   2440
SELECT COUNT(*) FROM scoreboard WHERE in_hand='no';
   count
0      0


# For testing out the games database

In [16]:
'''
test out the 'games' database

just two sample queries:
 1. count every row in the games table
 2. list id and year for rows flagged in_hand = 'yes' whose id is
    above 320000000 (only the first 5 rows are printed)
'''

con = None
con = psycopg2.connect(database=dbname, user=username)
print('   %s' % (con,))


games_queries = [
    "SELECT COUNT(*) FROM %s;" % (boxscore_table_name),
    "SELECT id, year FROM %s WHERE in_hand='%s' AND id >320000000;" % (boxscore_table_name, 'yes'),
]

for sql_query in games_queries:
    print(sql_query)
    try:
        from_sql_query = pd.read_sql_query(sql_query, con)
    except Exception as query_error:
        # A missing table is the expected failure, but any other problem
        # (permissions, bad SQL, dropped connection) also ends up here, so
        # the underlying error is shown instead of being silently swallowed.
        print('  games table, %s, does not exist' % boxscore_table_name)
        print('    underlying error: %s' % (query_error,))
        # Clear a possibly aborted transaction so the next query can run;
        # harmless if nothing is pending.
        con.rollback()
    else:
        print(from_sql_query.head(5))






   <connection object at 0x109a29770; dsn: 'dbname=ncaa_mbb_db user=smaug', closed: 0>
SELECT COUNT(*) FROM games;
   count
0  80100
SELECT id, year FROM games WHERE in_hand='yes' AND id >320000000;
          id  year
0  320930096  2012
1  320910096  2012
2  320912305  2012
3  320900221  2012
4  320890024  2012


# For testing out the stats database

In [12]:
# Sanity checks on the stats table:
#   1. total number of rows
#   2. a sample of a few box-score columns (first 15 rows printed)
con = None
con = psycopg2.connect(database=dbname, user=username)
print('   %s' % (con,))


# Alternative sample queries kept for reference:
#sql_query = "select * from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='%s';" % (stats_table_name)
#sql_query = "SELECT player, player_url, team_name, player_pos FROM %s;" % (stats_table_name)

# (query text, number of result rows to print)
stats_queries = [
    ("SELECT COUNT(*) FROM %s;" % (stats_table_name), 5),
    ("SELECT pf, pts, ftm, fta, fgm FROM %s;" % (stats_table_name), 15),
]

for sql_query, rows_to_show in stats_queries:
    print(sql_query)
    try:
        from_sql_query = pd.read_sql_query(sql_query, con)
    except Exception as query_error:
        # A missing table is the expected failure, but any other problem
        # (permissions, bad SQL, dropped connection) also ends up here, so
        # the underlying error is shown instead of being silently swallowed.
        print('  stats table, %s, does not exist' % stats_table_name)
        print('    underlying error: %s' % (query_error,))
        # Clear a possibly aborted transaction so the next query can run;
        # harmless if nothing is pending.
        con.rollback()
    else:
        print(from_sql_query.head(rows_to_show))




   <connection object at 0x109a29770; dsn: 'dbname=ncaa_mbb_db user=smaug', closed: 0>
SELECT COUNT(*) FROM stats;
   count
0     77
SELECT pf, pts, ftm, fta, fgm FROM stats;
   pf pts ftm fta fgm
0   5  15   1   4   5
1   2   9   1   2   4
2   1  19  11  13   4
3   3  11   0   0   5
4   1  18  10  14   3
5   2   0   0   0   0
6   0   0   0   0   0
7   0   0   0   0   0
8   4   4   0   0   2
9   2   0   0   0   0
10  5  15   3   3   6
11  5   4   0   0   2
12  5  20   5   6   7
13  4  10   2   3   4
14  5   7   0   0   3
