In [1]:
#Imports
import numpy as np

In [2]:
#Load every season's CSV (2001 -> 2017) into two parallel lists of arrays

#Columns holding the numeric per-game statistics
data_cols = (2,3,4,5,6,9,10,17,19,20,21,22,23,26,27,34)
#Columns holding the game date and the two team names
info_cols = (0,1,18)

#data_list[k] : statistics of season 2001+k, one row per game
data_list = []
#info_list[k] : date / team-name strings for the same rows
info_list = []

for year in range(2001,2018):

    file_ref = "./Game Data/{}.csv".format(year)
    #Both reads skip the header row and always come back two-dimensional
    shared_opts = dict(delimiter=',', skiprows=1, ndmin=2)

    data_list.append(np.loadtxt(file_ref, usecols=data_cols, **shared_opts))
    info_list.append(np.loadtxt(file_ref, dtype='str', usecols=info_cols, **shared_opts))

In [3]:
#Data Check: show the first three games of the first loaded season,
#statistics first, then the matching date/team strings
for loaded in (data_list, info_list):
    print(loaded[0][:3])

[[101.  30. 272.  32.  18.   0.   1.  45. 407.  68. 137.  16.   8.   0.
    0.  24.]
 [ 64.  35. 226.  41.  17.   4.   1.  27. 108.  33. 178.  31.  21.   0.
    1.  41.]
 [159.  50. 151.  18.  10.   0.   1.  21.  56.  28. 130.  22.  11.   0.
    0.   7.]]
[['8/23/2001' ' Louisville' ' New Mexico St']
 ['8/25/2001' ' UNC' ' Oklahoma']
 ['8/25/2001' ' Nebraska' ' TCU']]


In [4]:
#Let's read in our team map
#Each row of teams.csv lists the names one team can appear under; findteam below
#searches the first three columns of every row.
#The file separates fields with ", ". np.loadtxt in NumPy >= 1.23 only accepts a
#single-character delimiter, so split on the bare comma and trim the padding
#from every field afterwards.
teams = np.char.strip(np.loadtxt("./teams.csv",dtype='str',delimiter=',',ndmin=2))

def findteam(team_name, team_table=None):
    """Return the row index of a team in the team map, or 200 when it is absent.

    Parameters
    ----------
    team_name : str
        Name as read from a game file. Names there carry a leading space
        (e.g. ' Louisville'); leading whitespace is stripped before matching,
        so a name without the padding is matched as well instead of losing
        its first letter.
    team_table : sequence of rows, optional
        Table to search. Defaults to the notebook-level ``teams`` array.
        Only the first three entries of each row are compared; shorter rows
        are accepted.

    Returns
    -------
    int
        Index of the first row containing the name, or the sentinel 200 when
        no row matches. Callers compare against 200, so the sentinel is kept.
    """
    if team_table is None:
        team_table = teams

    team_name = team_name.lstrip()

    for i, aliases in enumerate(team_table):
        for alias in aliases[:3]:
            if alias == team_name:
                return i
    #200 is false
    return 200

In [5]:
#Let's create a function that looks at a year of ordered dates and converts them to a list of scalars
def scale_dates(date_list):
    """Map an ordered sequence of date strings onto the interval [0, 1].

    The first date maps to 0, the last date to 1, and every other date to its
    elapsed fraction of that span.

    Parameters
    ----------
    date_list : sequence of str
        Dates in chronological order, each either 'M/D/YYYY' or 'YYYY-MM-DD'.
        The format is detected per entry and the input is never modified.

    Returns
    -------
    numpy.ndarray
        Float array with one value per input date. An empty input gives an
        empty array; when the first and last date coincide every value is 0.
    """
    num_games = len(date_list)
    if num_games == 0:
        return np.zeros(0)

    #Kept from the original cell: echoes the raw format of the first date
    print(date_list[0])

    stamps = []
    for raw_date in date_list:
        iso_date = str(raw_date).strip()
        if "/" in iso_date:
            #M/D/YYYY -> YYYY-MM-DD with zero-padded month and day
            month, day, season_year = iso_date.split('/')
            iso_date = "{}-{}-{}".format(season_year, month.zfill(2), day.zfill(2))
        stamps.append(np.datetime64(iso_date))
    stamps = np.array(stamps)

    #Nothing to scale by when the whole list falls on one day
    if stamps[-1] == stamps[0]:
        return np.zeros(num_games)

    elapsed = stamps - stamps[0]
    return elapsed / elapsed[-1]

In [6]:
#Now let's create a list of numpy arrays with our date and opponent data
#Each season becomes a (games x 4) array: [year, scaled date, visitor index, home index]
modified_info = []

assert(len(data_list) == len(info_list))
years = len(data_list)

for year in range(years):

    season_stats = data_list[year]
    season_info = info_list[year]

    kept_visitors, kept_homes, kept_dates = [], [], []
    dropped_rows = []

    for game in range(len(season_stats)):
        #We need to make sure that both teams are fbs members;
        #findteam answers 200 for a name that is not in the team map
        visitor = findteam(season_info[game,1])
        home = findteam(season_info[game,2])

        if 200 in (visitor, home):
            dropped_rows.append(game)
            continue

        kept_visitors.append(visitor)
        kept_homes.append(home)
        kept_dates.append(season_info[game,0])

    #Drop the unmatched games from the stored statistics and info as well
    data_list[year] = np.delete(season_stats, dropped_rows, 0)
    info_list[year] = np.delete(season_info, dropped_rows, 0)
    assert(len(data_list[year]) == len(kept_visitors))

    modified_info.append(np.column_stack((
        np.full(len(data_list[year]), year + 2001, dtype=float),
        scale_dates(kept_dates),
        kept_visitors,
        kept_homes,
    )))

8/23/2001
2002-08-22
2003-08-23
2004-08-28
2005-09-01
2006-08-31
2007-08-30
2008-08-28
2009-09-03
2010-09-02
2011-09-01
2012-08-30
2013-08-29
2014-08-28
2015-09-03
8/27/2016
8/26/2017


In [7]:
#Inspect the first season's rows: [year, scaled date, visitor index, home index]
print(modified_info[0])

[[2.00100000e+03 0.00000000e+00 4.30000000e+01 6.00000000e+01]
 [2.00100000e+03 1.86915888e-02 6.10000000e+01 6.90000000e+01]
 [2.00100000e+03 1.86915888e-02 5.70000000e+01 8.70000000e+01]
 ...
 [2.00100000e+03 9.90654206e-01 8.40000000e+01 8.70000000e+01]
 [2.00100000e+03 1.00000000e+00 4.40000000e+01 8.90000000e+01]
 [2.00100000e+03 1.00000000e+00 1.40000000e+01 3.10000000e+01]]


In [8]:
###Normalization!!!!
#Standardize each of the 8 statistics in place: column i (visitor) and column i+8
#(home) share one mean and one standard deviation.

means = np.zeros(8)
stds = np.zeros(8)

#Years 2001-2015
all_data = data_list[0:15]
#Stacked copy of those seasons, taken before any column is rescaled
fit_games = np.vstack(all_data)

for i in range(8):

    stat_cols = [i, i+8]
    #Row-major flatten: visitor value then home value, game after game
    pooled = fit_games[:, stat_cols].ravel()

    means[i] = np.mean(pooled)
    stds[i] = np.std(pooled)

    #Every season is rescaled, including the ones left out of the fit
    for season in data_list:
        season[:, stat_cols] = (season[:, stat_cols] - means[i])/stds[i]

print(data_list[0][:5])

0
1
2
3
4
5
6
7
[[-0.6436303  -0.80213641  0.50104176  0.02279448 -0.06184219 -0.89446651
  -0.0038844   1.03301879  2.78820267  2.91156713 -0.91461981 -1.46685294
  -1.41266531 -0.89446651 -0.95765302 -0.15957056]
 [-1.0585905  -0.31349121  0.01866819  0.86072115 -0.19692451  3.4419017
  -0.0038844   0.01079935 -0.56512432 -0.50894929 -0.48467815 -0.07030849
   0.34340474 -0.89446651 -0.0038844   0.80585891]
 [ 0.00684784  1.1524444  -0.76781046 -1.28064701 -1.14250069 -0.89446651
  -0.0038844  -0.32994047 -1.14831162 -0.99759449 -0.98802448 -0.90823516
  -1.00741838 -0.89446651 -0.95765302 -1.12500004]
 [ 3.12465688  1.05471536  0.76320131  0.11589744  1.15389861  0.18962554
  -0.95765302  2.45276802  0.67975627  0.46834112  0.67931026  1.60554485
   0.61356936 -0.89446651 -0.0038844   0.4651191 ]
 [-0.34082151 -0.50894929 -0.48467815  0.02279448 -0.73725375  0.18962554
   0.94988423 -0.55710035  0.78069253  1.25017344 -0.34835518 -0.72202923
  -1.14250069 -0.89446651 -0.0038844  -0.

In [9]:
def stat_vectors(game,info):
    """Build the visitor-side and home-side 28-slot vectors for one game.

    game : 16 statistics, the visitor's 8 followed by the home team's 8
           [VRY VRA VPY VPA VPC VF VI VS HRY HRA HPY HPA HPC HF HI HS]
    info : indexable as [year, scaled date, visitor index, home index]

    Layout of each returned vector:
        0      scaled date
        1      1 for the home side, 0 for the visitor
        2:10   the side's own statistics
        10:18  the other side's statistics
        18:26  zeros, left open for the caller to fill
        26     year
        27     index of the opposing team

    Returns (visitor_vec, home_vec).
    """
    season_year, when = info[0], info[1]
    visitor_id, home_id = info[2], info[3]

    def _blank(home_flag, opponent_id):
        #Vector with only the bookkeeping slots filled in
        side = np.zeros(28)
        side[0] = when
        side[1] = home_flag
        side[26] = season_year
        side[27] = opponent_id
        return side

    visitor_vec = _blank(0, home_id)
    home_vec = _blank(1, visitor_id)

    #The visitor sees the stats in file order; the home side sees the halves swapped
    visitor_vec[2:18] = game
    home_vec[2:10] = game[8:]
    home_vec[10:18] = game[:8]

    return visitor_vec, home_vec

In [10]:
#Finally, let's put together our data to be saved into a bunch of numpy arrays

years = len(data_list)
#One slot per team. The string "None" marks a team with no game recorded yet;
#afterwards the slot holds a single 28-vector (one game) or a (games x 28) array.
stat_list = ["None"] * len(teams)


def _has_history(entry):
    """True once a slot holds game vectors instead of the "None" placeholder.

    A type check is used because comparing an ndarray with == "None" is
    ambiguous in an `if` on recent NumPy versions.
    """
    return not isinstance(entry, str)


def _record_game(history_by_team, team, opponent, vec, opp_cols, fill_opponent_form):
    """Append one game vector to a team's history.

    history_by_team    : the per-team list that is updated in place
    team, opponent     : slot indices of the team being updated and its opponent
    vec                : the team's 28-vector for this game
    opp_cols           : column slice of the opponent's history that is summed
                         over its last ten rows and divided by 10 into vec[18:26]
    fill_opponent_form : whether vec[18:26] should be filled at all
    """
    history = history_by_team[team]
    if not _has_history(history):
        #First game of this team: stored as a bare vector, slots 18:26 stay zero
        history_by_team[team] = vec
        return

    history = np.atleast_2d(history)
    #Conditionally check if the previous game is a copy (problem with dataset).
    #Comparing against the first 18 slots of the last row also covers the case
    #where only one game has been stored so far.
    if np.array_equal(vec[0:18], history[-1,0:18]):
        return

    opp_history = history_by_team[opponent]
    if fill_opponent_form and _has_history(opp_history):
        #The divisor stays 10 even when the opponent has fewer than ten rows
        vec[18:26] = sum(np.atleast_2d(opp_history)[-10:,opp_cols])/10

    history_by_team[team] = np.vstack((history, vec))


for year in range(years):

    print(year+2001)
    num_games = len(modified_info[year])

    for game in range(num_games):

        curr_info = (modified_info[year])[game]
        visit_team = int(curr_info[2])
        home_team = int(curr_info[3])

        visit_vec, home_vec = stat_vectors((data_list[year])[game],curr_info)

        #Slots 18:26 are only filled from the second season on (year >= 1).
        #NOTE(review): the visitor reads columns 2:10 of the home team's history
        #while the home side reads columns 10:18 of the visitor's history, and the
        #visitor's history already contains this game when the home side is
        #processed - kept as in the original, confirm that both are intended.
        _record_game(stat_list, visit_team, home_team, visit_vec, slice(2,10), year >= 1)
        _record_game(stat_list, home_team, visit_team, home_vec, slice(10,18), year >= 1)

2001
2002
2003
2004
2005
2006




2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017


In [11]:
#Spot-check one assembled 28-slot row: row index 50 of the team stored in slot 1
stat_list[1][50,:]

array([ 4.64000000e-01,  0.00000000e+00, -6.04430015e-02,  3.70612076e-01,
       -2.01545831e-01, -2.56514415e-01, -4.67089129e-01, -8.94466511e-01,
       -9.57653025e-01, -1.59570562e-01,  3.88162616e-01, -2.03040858e-02,
       -5.37110055e-01, -7.03084879e-02,  4.78487052e-01,  1.89625542e-01,
       -3.88439605e-03, -7.27470253e-01, -2.67975802e-02, -1.57124743e-01,
        1.27598539e+00,  7.95549072e-01,  8.43209294e-01, -2.71928687e-02,
       -9.92612589e-02,  7.49068944e-01,  2.00500000e+03,  1.30000000e+01])

In [12]:
#Write every team's entry of stat_list to its own .npy file
for team in range(len(teams)):
    np.save("./Stored NPs/Stats/team_{}.npy".format(team), stat_list[team])

#Store the normalization constants next to them
for file_ref, values in (("./Stored NPs/stat_means.npy", means),
                         ("./Stored NPs/stat_stds.npy", stds)):
    np.save(file_ref, values)