In [1]:
#Imports
import numpy as np

In [2]:
#Load the betting-odds CSV for every season from 2003 through 2017.
#  spread_list[i] -> numeric columns 3-5 of season i (used later as visitor line, home line, over/under)
#  game_list[i]   -> string columns 0-2 of season i (used later as date, visitor name, home name)
data_cols = (3,4,5)
info_cols = (0,1,2)

spread_list = []
game_list = []

for year in range(2003,2018):

    file_ref = "./Betting Odds/{}.csv".format(year)

    #Same file, read twice: once as floats, once as strings (row order is identical)
    year_spreads = np.loadtxt(file_ref,delimiter=',',skiprows=1,usecols=data_cols,ndmin=2)
    year_games = np.loadtxt(file_ref,dtype='str',delimiter=',',skiprows=1,usecols=info_cols,ndmin=2)

    spread_list.append(year_spreads)
    game_list.append(year_games)

In [3]:
#Data Check: first three rows of the first season, numeric lines then game info
for first_season in (spread_list[0], game_list[0]):
    print(first_season[:3])

[[ 27.5 -27.5  60. ]
 [ 21.  -21.    0. ]
 [ -8.    8.   51. ]]
[['2003-08-23' ' Cal' ' Kansas State']
 ['2003-08-23' ' Grambling St' ' San Jose State']
 ['2003-08-28' ' Maryland' ' N Illinois']]


In [4]:
#Let's read in our team map
#Each row holds the names one team can appear under; the row index is the team id
#returned by findteam and used everywhere below.
#np.loadtxt only accepts a single-character delimiter on NumPy >= 1.23, so split on ","
#and strip the padding around each field instead of passing the two-character ", ".
teams = np.char.strip(np.loadtxt("./teams.csv",dtype='str',delimiter=',',ndmin=2))

def findteam(team_name):
    """Return the row index of `team_name` in the global `teams` table.

    Parameters
    ----------
    team_name : str
        Team name as read from a betting-odds CSV. Those fields carry a
        leading space (e.g. ' Cal'); surrounding whitespace is stripped
        before matching, so names without the padding match as well.

    Returns
    -------
    int
        Index of the first row of `teams` in which any column equals the
        cleaned name, or 200 when no row matches.

    Notes
    -----
    200 is the "not found" sentinel that callers compare against. It is only
    unambiguous while `teams` has at most 200 rows.
    """
    cleaned = team_name.strip()

    #Check every alias column the table provides rather than assuming exactly three
    for row_index, aliases in enumerate(teams):
        if any(alias == cleaned for alias in aliases):
            return row_index

    #200 is false
    return 200

In [5]:
#Spot check one row of the team map (the row index is the id findteam returns)
teams[50,:]

array(['Michigan', 'Michigan', 'Michigan'], dtype='<U23')

In [6]:
#Let's create a function that looks at a year of ordered dates and converts them to a list of scalars
def scale_dates(date_list):
    """Map a season's ordered game dates onto the interval [0, 1].

    Parameters
    ----------
    date_list : sequence of str
        Dates in chronological order, each either ISO 'YYYY-MM-DD' or
        'M/D/YYYY' (the form some of the CSVs ended up in after being
        re-saved). The input is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same length: 0.0 for the first date, 1.0 for the
        last, and the elapsed fraction of the season for dates in between.
        An empty input gives an empty array; if every game falls on the same
        day the result is all zeros (avoids a 0/0 division).
    """
    if len(date_list) == 0:
        return np.zeros(0)

    #Progress/diagnostic output: shows which date format this season arrived in
    print(date_list[0])

    #Normalise each entry on its own so a file mixing both formats still parses
    iso_dates = []
    for raw_date in date_list:
        raw_date = str(raw_date).strip()
        if "/" in raw_date:
            month, day, year = raw_date.split('/')
            raw_date = "{}-{}-{}".format(year,month.zfill(2),day.zfill(2))
        iso_dates.append(raw_date)

    days = np.array(iso_dates,dtype='datetime64[D]')

    #Whole days since the first game, as floats
    elapsed = (days - days[0]).astype(float)
    total_days = elapsed[-1]

    if total_days == 0:
        return np.zeros(len(days))

    return elapsed/total_days

In [7]:
#Now let's create a list of numpy arrays with our date and opponent data
#For every season: drop games where either team is missing from the team map, then
#store one row per remaining game as [season year, scaled date, visitor index, home index].
#Note: spread_list and game_list are filtered in place so they stay row-aligned with modified_info.
FIRST_SEASON = 2003     #season that spread_list[0] / game_list[0] correspond to
TEAM_NOT_FOUND = 200    #sentinel findteam returns for a name that is not in the team map

modified_info = []

#Walk whatever seasons were actually loaded instead of repeating the 2003-2017 span here
for year in range(len(spread_list)):

    curr_spreads = spread_list[year]
    curr_games = game_list[year]

    #Both arrays come from the same CSV, so they must describe the same rows
    assert len(curr_spreads) == len(curr_games)
    games = len(curr_spreads)

    visitor_list = []
    home_list = []
    date_list = []
    del_list = []

    for game in range(games):
        #We need to make sure that both teams are fbs members
        visitor = findteam(curr_games[game,1])
        home = findteam(curr_games[game,2])

        if visitor == TEAM_NOT_FOUND or home == TEAM_NOT_FOUND:
            del_list.append(game)
        else:
            visitor_list.append(visitor)
            home_list.append(home)
            date_list.append(curr_games[game,0])

    spread_list[year] = np.delete(curr_spreads, del_list, 0)
    game_list[year] = np.delete(curr_games, del_list, 0)

    assert len(game_list[year]) == len(visitor_list)

    modified = np.zeros((len(game_list[year]),4))
    modified[:,0] = year + FIRST_SEASON
    modified[:,1] = scale_dates(date_list)
    modified[:,2] = visitor_list
    modified[:,3] = home_list

    modified_info.append(modified)

2003-08-23
2004-08-28
2005-09-01
2006-08-31
2007-08-30
2008-08-28
2009-09-03
2010-09-02
2011-09-01
2012-08-30
2013-08-29
2014-08-28
9/3/2015
8/26/2016
8/26/2017


In [8]:
#Peek at the first season; columns are [season year, scaled date, visitor index, home index]
print(modified_info[0])

[[2.00300000e+03 0.00000000e+00 1.60000000e+01 3.90000000e+01]
 [2.00300000e+03 4.76190476e-02 4.60000000e+01 6.40000000e+01]
 [2.00300000e+03 4.76190476e-02 4.80000000e+01 4.20000000e+01]
 ...
 [2.00300000e+03 1.00000000e+00 7.30000000e+01 8.10000000e+01]
 [2.00300000e+03 1.00000000e+00 2.90000000e+01 4.40000000e+01]
 [2.00300000e+03 1.00000000e+00 3.90000000e+01 6.90000000e+01]]


In [9]:
#Breaks out our info into vectors
def get_vec(game_info,spread_info):
    """Build the per-team betting rows for a single game.

    Parameters
    ----------
    game_info : indexable
        [year, day_scalar, visitor index, home index] for the game.
    spread_info : indexable
        [visitor spread, home spread, over/under] for the game.

    Returns
    -------
    (vis_vec, home_vec) : tuple of numpy.ndarray
        Two length-5 arrays laid out as
        [year, day_scalar, other_team, team_spread, o/u], the first from the
        visitor's point of view and the second from the home team's.
    """
    year, day_scalar = game_info[0], game_info[1]
    visitor_index, home_index = game_info[2], game_info[3]
    visitor_spread, home_spread = spread_info[0], spread_info[1]
    over_under = spread_info[2]

    #Each side records the *other* team as its opponent
    vis_vec = np.array([year,day_scalar,home_index,visitor_spread,over_under])
    home_vec = np.array([year,day_scalar,visitor_index,home_spread,over_under])

    return vis_vec, home_vec

In [10]:
#Regroup the per-season betting rows by team.
#final_spreads[t] ends up as a 2-D array with one row per game team t played, in
#season/date order, laid out as [year, day_scalar, other_team, team_spread, o/u].
years = len(spread_list)

#Gather rows in plain lists and stack once per team at the end. This replaces the
#original `array == "None"` placeholder test, whose result is an array (not a bool)
#on NumPy >= 1.25 and therefore raises inside `if`; it also avoids re-copying a
#team's whole history on every vstack.
rows_by_team = [[] for _ in range(len(teams))]

for year in range(years):

    print(year+2003)

    year_info = modified_info[year]
    year_spreads = spread_list[year]
    num_games = len(year_info)

    for game in range(num_games):

        curr_info = year_info[game]

        visit_team = int(curr_info[2])
        home_team = int(curr_info[3])

        visit_vec, home_vec = get_vec(curr_info,year_spreads[game])

        rows_by_team[visit_team].append(visit_vec)
        rows_by_team[home_team].append(home_vec)

#Always 2-D: a team with one game gets shape (1,5), a team with none gets (0,5)
final_spreads = [
    np.vstack(team_rows) if team_rows else np.empty((0,5))
    for team_rows in rows_by_team
]

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017




In [11]:
def update_game_info(array,stats,spreads,team):
    """Append one game's summary row to `array` unless an identical row is already there.

    Parameters
    ----------
    array : list of numpy.ndarray
        Rows collected so far; appended to in place and also returned.
    stats : indexable
        One row of a team's stats. Entries read here: [0] (copied through
        as-is), [1] (0 means this team was the visitor), [9] and [17] (this
        team's and the opponent's standardized scores), [26] (season year),
        [27] (opponent index).
    spreads : indexable
        The team's betting row for the same game; [3] is its spread and [4]
        the over/under.
    team : int
        Index of the team the two rows belong to.

    Returns
    -------
    list
        The same `array` object.

    The stored row is
    [visitor, home, year, stats[0], visitor score - home score, line, over/under],
    where `line` is the team's spread negated when the team is the visitor.
    Scores are de-standardized with the module-level `score_std`/`score_mean`.
    Because both teams of a game yield the same row, the duplicate check keeps
    each game only once.
    """
    opponent = stats[27]

    #Undo the standardization so the margin is in real points
    own_score = (stats[9]*score_std) + score_mean
    oth_score = (stats[17]*score_std) + score_mean

    if stats[1] == 0:
        #If visitor
        visitor, home = team, opponent
        margin = own_score - oth_score
        line = -spreads[3]
    else:
        visitor, home = opponent, team
        margin = oth_score - own_score
        line = spreads[3]

    vec = np.array([visitor,home,stats[26],stats[0],margin,line,spreads[4]],dtype=float)

    #Linear scan for an exact duplicate (the same game seen from the other team)
    if any(np.array_equal(existing,vec) for existing in array):
        return array

    array.append(vec)
    return array
        

In [12]:
#Let's get the data we developed earlier
#One saved stats array per team, in team-index order
stat_info = [
    np.load("./Stored NPs/Stats/team_{}.npy".format(team))
    for team in range(len(teams))
]

#Entry 7 of the saved means/stds is used to de-standardize scores in update_game_info
stat_means = np.load("./Stored NPs/stat_means.npy")
stat_stds = np.load("./Stored NPs/stat_stds.npy")
score_mean = stat_means[7]
score_std = stat_stds[7]

In [13]:
#Let's store a matrix which holds all of our game info
#For every team, walk its post-2002 stat rows and its betting rows side by side and
#pair them up by opponent (stats column 27 vs. spreads column 2). Rows that cannot be
#paired are dropped from both sources so the two stay game-for-game aligned, and each
#paired game is recorded once in final_game_info.
final_game_info = []

for team in range(len(teams)):

    #Only rows whose year (column 26) is after 2002 have betting lines to align with
    team_stats = stat_info[team][stat_info[team][:,26] > 2002]

    team_spreads = final_spreads[team]
    if isinstance(team_spreads, str):
        #Placeholder string left for a team that never appeared in the betting data
        team_spreads = np.empty((0,5))
    else:
        #A team with a single betting row may be stored 1-D; the indexing below needs 2-D
        team_spreads = np.atleast_2d(team_spreads)

    print(team)

    stats_len = len(team_stats)
    stats_iter = 0
    delete_stats = []
    spread_len = len(team_spreads)
    spread_iter = 0
    delete_spreads = []

    #Checking the bounds up front means a team with no rows on either side is skipped
    #cleanly instead of indexing row 0 of an empty array
    while stats_iter < stats_len and spread_iter < spread_len:

        stat_opp = team_stats[stats_iter,27]
        spread_opp = team_spreads[spread_iter,2]

        if stat_opp == spread_opp:
            #Same opponent on both sides: this is one game
            final_game_info = update_game_info(final_game_info,team_stats[stats_iter],team_spreads[spread_iter],team)
            stats_iter += 1
            spread_iter += 1
        elif stats_iter + 1 < stats_len and team_stats[stats_iter+1,27] == spread_opp:
            #One stat row has no betting line: drop it and retry from the next stat row
            delete_stats.append(stats_iter)
            stats_iter += 1
        elif stats_iter + 2 < stats_len and team_stats[stats_iter+2,27] == spread_opp:
            #Two consecutive stat rows have no betting line
            delete_stats.append(stats_iter)
            delete_stats.append(stats_iter+1)
            stats_iter += 2
        elif spread_iter + 1 < spread_len and team_spreads[spread_iter+1,2] == stat_opp:
            #One betting row has no stat row
            delete_spreads.append(spread_iter)
            spread_iter += 1
        elif spread_iter + 2 < spread_len and team_spreads[spread_iter+2,2] == stat_opp:
            #Two consecutive betting rows have no stat row
            delete_spreads.append(spread_iter)
            delete_spreads.append(spread_iter+1)
            spread_iter += 2
        else:
            #No match within two rows either way: give up on this pair and move on
            print("Issue with team {} in year {}".format(team,team_stats[stats_iter,26]))
            delete_stats.append(stats_iter)
            delete_spreads.append(spread_iter)
            stats_iter += 1
            spread_iter += 1

    #Whatever is left on the longer side has no partner
    delete_stats.extend(range(stats_iter,stats_len))
    delete_spreads.extend(range(spread_iter,spread_len))

    #Earlier seasons are kept untouched; later ones keep only the paired rows
    stat_info[team] = np.concatenate((stat_info[team][stat_info[team][:,26] <= 2002],np.delete(team_stats,delete_stats,axis = 0)),axis=0)
    final_spreads[team] = np.delete(team_spreads,delete_spreads,axis = 0)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Issue with team 50 in year 2015.0
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112


In [14]:
#Spot check team 50: after alignment its post-2002 opponent sequence should be the
#same in the stats and in the spreads, followed by a sample of the collected games
check_team = 50
check_stats = stat_info[check_team]
after_2002 = check_stats[:,26] > 2002

print(check_stats[after_2002,27])
print(final_spreads[check_team][:,2])

print(final_game_info[10:20])

[ 17.  32.  66.  72.  35.  36.  53.  34.  76.  51.  65.  68.  49.  66.
  79.  36.  35.  53.  34.  76.  51.  65.  68.  64.  66.  24. 111.  51.
  53.  74.  36.  65.  35.  68.  57. 103.  17.  66. 111.  53.  51.  74.
  36.  65.   9.  35.  68.  81.  72.  66.  74.  65.  24.  76.  34.  53.
  51. 111.  68.  26. 100.  49.  66. 111.  34.  93.  74.  51.  76.  53.
  65.  68. 110.  66.  24.  35.  51.  36.  74.  34.  76. 111.  68.  22.
  66.  13.  35.  51.  36.  74.  34.  76. 111.  68.  54.  66.  24.  79.
  53.  65.  51.  76.  36.  34.  57.  68. 105.   2.   0.  66.  76.  34.
  51.  57.  53.  65.  36.  68.  81.  17.  66.   1.  22.  53.  74.  35.
  51.  57.  65.  36.  68.  66.  49. 100.  53.  78.  74.  51.  35.  65.
  46.  68.  73.  99.  14.  46.  65.  51.  53.  78.  35.  74.  68.  26.
  31.  97.  20.  74. 111.  78.  34.  51.  46.  36.  35.  68.  27.  26.
  18.   0.  76.  51.  35.  74.  78.  46. 111.  68.]
[ 17.  32.  66.  72.  35.  36.  53.  34.  76.  51.  65.  68.  49.  66.
  79.  36.  35.  53.  34.

In [15]:
#Consistency check: every team should now have the same number of stat rows and
#betting rows in each season. Nothing is printed when they all agree.
for team in range(len(teams)):

    stat_years = stat_info[team][:,26]
    spread_years = final_spreads[team][:,0]

    for year in range(2003,2018):

        stats_len = int((stat_years == year).sum())
        spread_len = int((spread_years == year).sum())

        if stats_len == spread_len:
            continue

        print("At team {} year {} with statlen {} and spreadlen {}".format(team,year,stats_len,spread_len))

In [16]:
#Write the aligned arrays back to disk, one file per team.
#Note: the stats files written here are the same paths the stats were loaded from.
stats_path = "./Stored NPs/Stats/team_{}.npy"
spreads_path = "./Stored NPs/Spreads/team_{}.npy"

for team in range(len(teams)):
    np.save(stats_path.format(team),stat_info[team])
    np.save(spreads_path.format(team),final_spreads[team])

In [17]:
#Stack the collected game rows into one (games, 7) matrix and store it
file_ref = "./Stored NPs/game_info.npy"

save_array = np.asarray(final_game_info)
print(save_array.shape)

np.save(file_ref,save_array)

(9528, 7)
