# My College Basketball Data

In [1]:
import csv
import numpy as np
import torch 

In [2]:
#lookup table: team number (first CSV column) -> team name (second CSV column)
teamidentities = {}
with open('Teams.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')

    #load every line of the file in one pass, header line included
    teamidentities.update((record[0], record[1]) for record in readCSV)

    #the header line was stored like any other row; drop its entry
    del teamidentities["Team_Id"]

In [3]:
#read tourney seeds into a nested lookup
#1985-2017
#layout: Seeds[season][team] = seed, all three kept as the strings read from the file
#columns: row[0] = 'Season', row[1] = 'Seed', row[2] = 'Team'

Seeds = {}

with open('TourneySeeds.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')

    for row in readCSV:
        #skips the header line (and any blank line): a data row starts with a numeric season,
        #so the header no longer creates a bogus Seeds['Season'] entry
        if not row or not row[0].isdigit():
            continue

        season, seed, team = row[0], row[1], row[2]

        #create the per-season dictionary on first sight, then record the team's seed
        Seeds.setdefault(season, {})[team] = seed
        

In [4]:
#create three separate datasets: trainingSet, testSet, validationSet
#dataSplit function sorts rows into these three datasets

trainingSet = []
testSet = []
validationSet = []

def dataSplit(rows=None):
    """Append each row to trainingSet, testSet or validationSet by its season.

    The season is the first field of the row, compared as a string:
    "2014" goes to testSet, "2015" goes to validationSet, and every other
    value goes to trainingSet. Header rows are not recognised, so a header
    line also ends up in trainingSet.

    Args:
        rows: iterable of rows (lists of strings). When omitted, the
            module-level ``readCSV`` reader is used, which keeps the
            original zero-argument call working.

    Returns:
        None. The three module-level lists are extended in place.
    """
    if rows is None:
        #original behaviour: consume whichever reader the calling cell bound to readCSV
        rows = readCSV

    for row in rows:
        if not row:
            #csv.reader yields [] for a blank line; there is no season to inspect
            continue

        season = row[0]
        if season == "2014":
            testSet.append(row)
        elif season == "2015":
            validationSet.append(row)
        else:
            trainingSet.append(row)

In [5]:
#read and data split regular season stats
#2003-2015
#trainingSet is 2003-2013
#testSet is 2014
#validationSet 2015
#NOTE: dataSplit() is called without arguments and iterates the module-level name readCSV,
#so the reader has to be bound to exactly that name and dataSplit() has to run inside
#the with block, while the file is still open
#NOTE: nothing here skips the file's header line; dataSplit() puts every row whose first
#field is neither "2014" nor "2015" into trainingSet, so the header presumably lands there

with open('RegularSeasonDetailedResults.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    
    dataSplit()

In [6]:
#read and data split tournament stats
#2003-2016
#dataSplit() sends season 2014 to testSet, season 2015 to validationSet and
#every other season (2003-2013 and 2016 for this file) to trainingSet

with open('TourneyDetailedResults.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')

    #dataSplit() reads the module-level readCSV, so it must run while the file is open
    dataSplit()

#the header line of each results file was filed under trainingSet by dataSplit();
#remove header (and blank) rows by content instead of by hard-coded position, so the
#clean-up stays correct when the number of regular season games changes.
#slice assignment keeps trainingSet the same list object
trainingSet[:] = [row for row in trainingSet if row and row[0].isdigit()]

#dataSplit() assembled trainingSet, testSet, validationSet by combining data from RegularSeasonDetailedResults.csv (RSDR) and TourneyDetailedResults.csv (TDR)
#trainingSet made up of data from RSDR years of 2003-2013 and TDR years 2003-2013, 2016
#testSet made up of data from RSDR years 2014 and TDR years 2014
#validationSet made up of data from RSDR years 2015 and TDR years 2015

In [7]:
#functions that put both teams' seeds at the end of every game row
def _lookupSeed(season, team_id):
    """Return the seed value to record for ``team_id`` in ``season``.

    A team with no entry in the module-level ``Seeds`` lookup for that season
    (it did not qualify for March Madness) gets the integer 16. Otherwise the
    stored seed string loses its first character, and if three characters
    remain the last one is removed as well, leaving the numeric part as a
    string.
    """
    season_seeds = Seeds.get(season)
    if season_seeds is None or team_id not in season_seeds:
        #no seed on record: automatic seeding of 16
        return 16

    #deleting the letters from the seeding
    numeric_part = season_seeds[team_id][1:]
    return numeric_part[:-1] if len(numeric_part) == 3 else numeric_part


def addSeeds(dataSet):
    """Append two seed values to every game row of ``dataSet`` in place.

    Each game is a list whose element 0 is the season, element 2 the first
    team id (Wteam) and element 4 the second team id (Lteam). The seed of the
    first team is appended, followed by the seed of the second team.
    Returns None.
    """
    for game in dataSet:
        season, first_team, second_team = game[0], game[2], game[4]
        game.extend((_lookupSeed(season, first_team), _lookupSeed(season, second_team)))

#extend the rows of all three sets with seeds: trainingSet first, then testSet, then validationSet
for dataset in (trainingSet, testSet, validationSet):
    addSeeds(dataset)

In [8]:
#show the number of columns in the first row of each set
for dataset in (trainingSet, testSet, validationSet):
    print(len(dataset[0]))

36
36
36


In [9]:
#extracting features for each DataSet
#first half of the feature vector is winning team stats and last half is losing team stats
#features taken out (28 values): [Wseed, Wfgm, Wfga, Wfgm3, Wfga3, Wftm, Wfta, Wor, Wdr, Wast, Wto, Wstl, Wblk, Wpf, Lseed, Lfgm, Lfga, Lfgm3, Lfga3, Lftm, Lfta, Lor, Ldr, Last, Lto, Lstl, Lblk, Lpf]
#team ids (row[2], row[4]) and the score are not taken into account


#function to featureExtract
def featureExtract(dataSetArray):
    """Build one numeric feature vector per game row.

    For every row the vector is: row[34], then row[8] through row[20],
    then row[35], then row[21] through row[33] (28 values), each converted
    to float.

    Args:
        dataSetArray: sequence of game rows with at least 36 fields each.

    Returns:
        A new list of feature vectors for every row except the first one.
        NOTE(review): dropping the first row looks like a leftover header
        skip; if the input no longer starts with a header this discards a
        real game. Kept so the output size does not change - confirm.

    Raises:
        IndexError: if a row has fewer than 36 fields.
    """
    featureExtractArray = []
    for i, row in enumerate(dataSetArray):
        #first-half seed, 13 first-half stats, second-half seed, 13 second-half stats
        test = [row[34], *row[8:21], row[35], *row[21:34]]

        try:
            test = [float(ele) for ele in test]
        except (TypeError, ValueError):
            #best effort: report the offending row and keep it unconverted
            print(i, test)
        featureExtractArray.append(test)

    #the slice is already a fresh list, so no extra copy is needed
    return featureExtractArray[1:]
    
#feature extraction for trainingSet, testSet and validationSet, in that order
featurestrainingSet, featurestestSet, featuresvalidationSet = (
    featureExtract(dataset) for dataset in (trainingSet, testSet, validationSet)
)

In [10]:
#show the length of the first feature vector of each set
for features in (featurestrainingSet, featurestestSet, featuresvalidationSet):
    print(len(features[0]))

28
28
28


In [11]:
#making labels

#one random 0/1 label per feature vector, drawn for the training set first,
#then the test set, then the validation set
labelstrainingSet, labelstestSet, labelsvalidationSet = (
    np.random.randint(2, size=len(features))
    for features in (featurestrainingSet, featurestestSet, featuresvalidationSet)
)

#length of one feature vector; it is split into two halves, so it has to be even
n_feat = len(featurestrainingSet[0])
assert n_feat % 2 == 0

#randomization function swaps the two halves of a feature vector if its label is a 1
def randomization(labelList, originalDataSet):
    """Swap the halves of every feature vector whose label is 1.

    Feature vectors come in with the winning team's half first. A label of 1
    means the winner should sit in the back half, so the two halves trade
    places; a label of 0 leaves the vector as it is.

    The split point is taken from each vector's own length, so the function
    does not depend on any module-level variable.

    Args:
        labelList: iterable of 0/1 labels, one per feature vector.
        originalDataSet: indexable collection of feature vectors (lists).

    Returns:
        A new list with one vector per label. Swapped vectors are new lists;
        unswapped vectors are the same objects as in ``originalDataSet``.

    Raises:
        IndexError: if ``labelList`` is longer than ``originalDataSet``.
    """
    randomizedDataSet = []
    for i, do_flip in enumerate(labelList):
        x = originalDataSet[i]
        if do_flip == 1:
            #if its a 1 that means the back half won so flip
            half = len(x) // 2
            x = x[half:] + x[:half]
        randomizedDataSet.append(x)
    return randomizedDataSet

#apply the label-driven half swap to the training, test and validation features, in that order
randomizedfeaturetrainingSet, randomizedfeaturetestSet, randomizedfeaturevalidationSet = (
    randomization(labels, features)
    for labels, features in (
        (labelstrainingSet, featurestrainingSet),
        (labelstestSet, featurestestSet),
        (labelsvalidationSet, featuresvalidationSet),
    )
)

#test case: first ten training vectors next to their labels
print(randomizedfeaturetrainingSet[:10])
print(labelstrainingSet[:10])

[[7.0, 26.0, 62.0, 8.0, 20.0, 10.0, 19.0, 15.0, 28.0, 16.0, 13.0, 4.0, 4.0, 18.0, 3.0, 24.0, 67.0, 6.0, 24.0, 9.0, 20.0, 20.0, 25.0, 7.0, 12.0, 8.0, 6.0, 16.0], [3.0, 24.0, 58.0, 8.0, 18.0, 17.0, 29.0, 17.0, 26.0, 15.0, 10.0, 5.0, 2.0, 25.0, 16.0, 22.0, 73.0, 3.0, 26.0, 14.0, 23.0, 31.0, 22.0, 9.0, 12.0, 2.0, 5.0, 23.0], [16.0, 18.0, 38.0, 3.0, 9.0, 17.0, 31.0, 6.0, 19.0, 11.0, 12.0, 14.0, 2.0, 18.0, 16.0, 18.0, 49.0, 6.0, 22.0, 8.0, 15.0, 17.0, 20.0, 9.0, 19.0, 4.0, 3.0, 23.0], [16.0, 24.0, 62.0, 6.0, 16.0, 17.0, 27.0, 21.0, 15.0, 12.0, 10.0, 7.0, 1.0, 14.0, 1.0, 30.0, 61.0, 6.0, 14.0, 11.0, 13.0, 17.0, 22.0, 12.0, 14.0, 4.0, 4.0, 20.0], [5.0, 26.0, 57.0, 6.0, 12.0, 23.0, 27.0, 12.0, 24.0, 12.0, 9.0, 9.0, 3.0, 18.0, 16.0, 20.0, 46.0, 3.0, 11.0, 12.0, 17.0, 6.0, 22.0, 8.0, 19.0, 4.0, 3.0, 25.0], [14.0, 23.0, 55.0, 2.0, 8.0, 32.0, 39.0, 13.0, 18.0, 14.0, 17.0, 11.0, 1.0, 25.0, 16.0, 19.0, 41.0, 4.0, 15.0, 20.0, 28.0, 9.0, 21.0, 11.0, 30.0, 10.0, 4.0, 28.0], [16.0, 28.0, 62.0, 4.0, 14.0,

In [12]:
#show the length of the first randomized feature vector of each set
for randomized in (randomizedfeaturetrainingSet, randomizedfeaturetestSet, randomizedfeaturevalidationSet):
    print(len(randomized[0]))

28
28
28


In [13]:
#pull the labels out, then flip the respective lists within the corresponding data set
#save the arrays as numpy arrays and then convert them to torch tensors
#implement logistic regression

In [14]:
#turns labels and feature vectors into numpy arrays and then into pytorch tensors
def PytorchArray(labels, randomizedDataSet):
    """Convert a data set and its labels to a pair of pytorch tensors.

    The feature vectors become a numpy array cast to float32 before being
    handed to torch; the labels become a numpy array with whatever dtype
    numpy picks for them.

    Returns:
        (features_tensor, labels_tensor) - features first, labels second,
        which is the reverse of the parameter order.
    """
    features_tensor = torch.from_numpy(np.array(randomizedDataSet).astype(np.float32))
    labels_tensor = torch.from_numpy(np.array(labels))
    return features_tensor, labels_tensor

#convert each split; PytorchArray takes (labels, features) and returns (features, labels)
featurestrainingData_pytorch, labelstrainingData_pytorch = PytorchArray(
    labelstrainingSet, randomizedfeaturetrainingSet
)
featurestestData_pytorch, labelstestData_pytorch = PytorchArray(
    labelstestSet, randomizedfeaturetestSet
)
featuresvalidationData_pytorch, labelsvalidationData_pytorch = PytorchArray(
    labelsvalidationSet, randomizedfeaturevalidationSet
)

#write every tensor to its own file: the three feature tensors first, then the three label tensors
for tensor, filename in (
    (featurestrainingData_pytorch, 'featurestraining_pytorch.pt'),
    (featurestestData_pytorch, 'featurestest_pytorch.pt'),
    (featuresvalidationData_pytorch, 'featuresvalidation_pytorch.pt'),
    (labelstrainingData_pytorch, 'labelstraining_pytorch.pt'),
    (labelstestData_pytorch, 'labelstest_pytorch.pt'),
    (labelsvalidationData_pytorch, 'labelsvalidation_pytorch.pt'),
):
    torch.save(tensor, filename)