# March Madness
## Final Project
### Group 15: Andrew Marion, Dallas Hutchinson, Aydan Koyles


# Setting Up File

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
# Mount Google Drive at /content/gdrive; the CSV files read later in this
# notebook live under that mount point.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os

# Point the JVM-based tooling at the Java 8 install.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve the Spark directory to an absolute path now, so SPARK_HOME keeps
# pointing at the unpacked distribution even if the working directory changes.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [None]:
# Initialise findspark so that the pyspark imports in the following cells
# resolve (presumably against the SPARK_HOME configured earlier).
import findspark
findspark.init()

In [None]:
# Create (or reuse) a Spark session with master "local[*]".
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Keep a handle on the underlying SparkContext; the bare name displays it.
sc = spark.sparkContext
sc

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pyspark.mllib.classification import LogisticRegressionModel,LogisticRegressionWithLBFGS, SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.clustering import *
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import LDA
import pyspark.sql.functions as f

In [None]:
# Read in data
# Every input CSV lives in the same Drive folder; keep that path in one place.
DATA_DIR = '/content/gdrive/MyDrive/MDataFiles_Stage1'


def read_stage1_csv(name):
    """Load `<DATA_DIR>/<name>.csv` into a pandas DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{name}.csv')


seeds = read_stage1_csv('MNCAATourneySeeds')
conferences = read_stage1_csv('MTeamConferences')
regulardetail = read_stage1_csv('MRegularSeasonDetailedResults')  # all games not march madness
tournamentcompact = read_stage1_csv('MNCAATourneyCompactResults')
tournamentdetailed = read_stage1_csv('MNCAATourneyDetailedResults')
sample = read_stage1_csv('MSampleSubmissionStage1')
# `conference` holds the same file as `conferences`; take an independent copy
# instead of parsing the CSV a second time. Both names are kept for callers.
conference = conferences.copy()
coach = read_stage1_csv('MTeamCoaches')
massey = read_stage1_csv('MMasseyOrdinals')

In [None]:
# Box-score statistics that every game row records once for each side.
box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
             'Ast', 'TO', 'Stl', 'Blk', 'PF']
winner_stats = ['W' + stat for stat in box_stats]
loser_stats = ['L' + stat for stat in box_stats]

# Team-centric column names: game info, own stats, then the opponent's stats.
columns = (['Season', 'TeamID', 'Points', 'OppPoints', 'Loc', 'NumOT']
           + box_stats
           + ['Opp' + stat for stat in box_stats])

# One row per game from the winning team's point of view.
WinTeams = regulardetail[
    ['Season', 'WTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT']
    + winner_stats + loser_stats
].copy()
WinTeams.columns = columns
WinTeams['Wins'] = 1
WinTeams['Losses'] = 0

# The same games from the losing team's point of view: own/opponent columns
# are swapped. 'Loc' still holds the winner-relative 'WLoc' value here.
LoseTeams = regulardetail[
    ['Season', 'LTeamID', 'LScore', 'WScore', 'WLoc', 'NumOT']
    + loser_stats + winner_stats
].copy()
LoseTeams.columns = columns

#change location for losing
def change_loc(loc):
    """Return the location code as seen from the other team.

    'H' and 'A' are swapped; every other value maps to 'N'.
    """
    swapped = {'H': 'A', 'A': 'H'}
    return swapped.get(loc, 'N')
    
# 'Loc' was copied from the winner-relative 'WLoc'; flip it so it describes
# the losing team instead.
LoseTeams['Loc'] = LoseTeams['Loc'].apply(change_loc)

LoseTeams['Wins'] = 0
LoseTeams['Losses'] = 1

#combine win and lose

WinLoseTeams = pd.concat([WinTeams,LoseTeams])

# Season totals per team. numeric_only=True keeps the string-valued 'Loc'
# column out of the aggregation explicitly; newer pandas versions no longer
# drop such columns silently.
combinedTeams = WinLoseTeams.groupby(['Season','TeamID']).sum(numeric_only=True)

combinedTeams['NumGames'] = combinedTeams['Wins'] + combinedTeams['Losses']

combinedTeams

Unnamed: 0_level_0,Unnamed: 1_level_0,Points,OppPoints,NumOT,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,...,OppOR,OppDR,OppAst,OppTO,OppStl,OppBlk,OppPF,Wins,Losses,NumGames
Season,TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003,1102,1603,1596,0,536,1114,219,583,312,479,117,...,269,564,256,363,152,44,514,12,16,28
2003,1103,2127,2110,8,733,1508,147,434,514,698,264,...,325,595,418,414,173,77,606,13,14,27
2003,1104,1940,1820,1,673,1601,178,556,416,586,380,...,305,634,327,388,155,89,539,17,11,28
2003,1105,1866,1993,4,634,1602,197,540,401,568,351,...,343,686,411,489,244,109,496,7,19,26
2003,1106,1781,1785,1,656,1548,171,494,298,461,344,...,317,626,330,422,246,89,452,13,15,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,1468,1332,1418,1,501,1097,143,442,187,251,118,...,178,461,275,225,113,46,318,9,11,20
2022,1469,1321,1473,0,458,1111,123,411,282,408,166,...,169,498,289,237,146,78,389,7,12,19
2022,1470,1332,1396,1,475,1153,108,350,274,369,155,...,182,511,288,332,91,119,366,8,13,21
2022,1471,1342,1422,1,454,1048,162,474,272,372,96,...,170,497,257,238,137,52,353,8,12,20


In [None]:
# Per-team, per-season features derived from the season totals.
totals = combinedTeams
games = totals['NumGames']
rebounds = totals['OR'] + totals['DR']
opp_rebounds = totals['OppOR'] + totals['OppDR']

# Insertion order defines the column order, which the modelling cells rely on
# positionally, so keep the entries in this sequence.
RegularSeasonInput = pd.DataFrame({
    # Wins, PPG, PPG Allowed, Points Ratio, OTs
    'WinRatio': totals['Wins'] / games,
    'PointsPerGame': totals['Points'] / games,
    'PointsAllowedPerGame': totals['OppPoints'] / games,
    'PointsRatio': totals['Points'] / totals['OppPoints'],
    'OTsPerGame': totals['NumOT'] / games,

    # Field Goals
    'FGPerGame': totals['FGM'] / games,
    'FGRatio': totals['FGM'] / totals['FGA'],
    'FGAPerGame': totals['FGA'] / games,
    'FGAllowedPerGame': totals['OppFGM'] / games,

    # Three Pointers
    'FG3PerGame': totals['FGM3'] / games,
    'FG3Ratio': totals['FGM3'] / totals['FGA3'],
    'FG3APerGame': totals['FGA3'] / games,
    'FG3AllowedPerGame': totals['OppFGM3'] / games,

    # Free Throws
    'FTPerGame': totals['FTM'] / games,
    'FTRatio': totals['FTM'] / totals['FTA'],
    'FTAPerGame': totals['FTA'] / games,
    # NOTE: the key is misspelled ("FTllowed"), but the feature list in the
    # model section uses the same spelling, so it is kept unchanged here.
    'FTllowedPerGame': totals['OppFTM'] / games,

    # Rebounds
    'ORPerGame': totals['OR'] / games,
    'DRPerGame': totals['DR'] / games,
    'TRPerGame': rebounds / games,
    'OppORPerGame': totals['OppOR'] / games,
    'OppDRPerGame': totals['OppDR'] / games,
    'OppTRPerGame': opp_rebounds / games,
    'ORRatio': totals['OR'] / totals['OppOR'],
    'DRRatio': totals['DR'] / totals['OppDR'],
    'TRRatio': rebounds / opp_rebounds,

    # Assists
    'AstPerGame': totals['Ast'] / games,
    'OppAstPerGame': totals['OppAst'] / games,

    # Steals
    'StlPerGame': totals['Stl'] / games,
    'OppStlPerGame': totals['OppStl'] / games,

    # Turnovers
    'TOPerGame': totals['TO'] / games,
    'OppTOPerGame': totals['OppTO'] / games,

    # Blocks
    'BlkPerGame': totals['Blk'] / games,
    'OppBlkPerGame': totals['OppBlk'] / games,

    # Personal Fouls
    'PFPerGame': totals['PF'] / games,
    'OppPFPerGame': totals['OppPF'] / games,
})

RegularSeasonInput.columns

Index(['WinRatio', 'PointsPerGame', 'PointsAllowedPerGame', 'PointsRatio',
       'OTsPerGame', 'FGPerGame', 'FGRatio', 'FGAPerGame', 'FGAllowedPerGame',
       'FG3PerGame', 'FG3Ratio', 'FG3APerGame', 'FG3AllowedPerGame',
       'FTPerGame', 'FTRatio', 'FTAPerGame', 'FTllowedPerGame', 'ORPerGame',
       'DRPerGame', 'TRPerGame', 'OppORPerGame', 'OppDRPerGame',
       'OppTRPerGame', 'ORRatio', 'DRRatio', 'TRRatio', 'AstPerGame',
       'OppAstPerGame', 'StlPerGame', 'OppStlPerGame', 'TOPerGame',
       'OppTOPerGame', 'BlkPerGame', 'OppBlkPerGame', 'PFPerGame',
       'OppPFPerGame'],
      dtype='object')

In [None]:
# Summary statistics for every engineered feature column.
RegularSeasonInput.describe()

Unnamed: 0,WinRatio,PointsPerGame,PointsAllowedPerGame,PointsRatio,OTsPerGame,FGPerGame,FGRatio,FGAPerGame,FGAllowedPerGame,FG3PerGame,...,AstPerGame,OppAstPerGame,StlPerGame,OppStlPerGame,TOPerGame,OppTOPerGame,BlkPerGame,OppBlkPerGame,PFPerGame,OppPFPerGame
count,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,...,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0,6892.0
mean,0.494102,69.46114,69.676889,1.001182,0.068303,24.42926,0.435921,56.03825,24.499946,6.680315,...,13.03527,13.095354,6.491325,6.513089,13.554449,13.522759,3.308914,3.330369,17.826441,17.791239
std,0.188209,5.835748,5.549137,0.096713,0.060351,2.166095,0.02651,3.575067,2.210511,1.401041,...,1.863353,1.766612,1.257087,1.035175,1.833738,1.884981,1.089374,0.635868,3.476077,3.346955
min,0.0,49.24,50.428571,0.625313,0.0,16.12,0.342072,39.785714,16.962963,1.83871,...,7.518519,7.676471,2.875,3.366667,7.387097,6.96875,0.655172,1.451613,0.0,0.0
25%,0.357143,65.5,65.965209,0.934764,0.03125,22.966379,0.418115,53.717634,22.966667,5.692308,...,11.732692,11.902419,5.59375,5.793864,12.3125,12.225806,2.518519,2.891865,16.969697,17.137931
50%,0.5,69.428571,69.550056,0.997408,0.0625,24.387097,0.436036,56.0,24.430952,6.586207,...,12.928571,13.0,6.40625,6.4375,13.419355,13.4,3.192308,3.290323,18.354839,18.357143
75%,0.633333,73.34375,73.2,1.064353,0.1,25.851852,0.45358,58.333333,25.90696,7.59375,...,14.269231,14.214286,7.266667,7.115385,14.7,14.678571,3.941176,3.727273,19.666667,19.481481
max,1.0,95.551724,98.206897,1.391685,0.4,34.153846,0.550868,78.62069,37.2,13.25,...,21.484848,22.241379,13.241379,11.642857,22.321429,24.275862,9.333333,6.0,27.884615,25.172414


In [None]:
def seed_number(seed_code):
    """Return the integer seed contained in a seed code such as 'W01'.

    The first character is always dropped. A four-character code also has its
    last character dropped before conversion (presumably a one-letter suffix).
    """
    if len(seed_code) == 4:
        return int(seed_code[1:-1])
    return int(seed_code[1:])


seed_dict = seeds.set_index(['Season', 'TeamID'])

# Not used below; kept because other cells may reference these names.
winIDs = tournamentcompact['WTeamID']
loseIDs = tournamentcompact['LTeamID']
season = tournamentcompact['Season']

# Each game is listed twice: once with the winner as Team1 (Result = 1) and
# once with the loser as Team1 (Result = 0).
winners = (
    tournamentcompact[['Season', 'WTeamID', 'LTeamID']]
    .rename(columns={'WTeamID': 'Team1', 'LTeamID': 'Team2'})
    .assign(Result=1)
)
losers = (
    tournamentcompact[['Season', 'LTeamID', 'WTeamID']]
    .rename(columns={'LTeamID': 'Team1', 'WTeamID': 'Team2'})
    .assign(Result=0)
)

TourneyInput = pd.concat([winners,losers])
TourneyInput = TourneyInput[TourneyInput['Season']>=2003].reset_index(drop=True)

# (Season, TeamID) -> raw seed code, built once so each game costs two dict
# lookups instead of two MultiIndex .loc calls. The first remaining column of
# seed_dict is the seed code, matching the original `.values[0]` access.
seed_codes = seed_dict.iloc[:, 0].to_dict()

team1seeds = [
    seed_number(seed_codes[key])
    for key in zip(TourneyInput['Season'], TourneyInput['Team1'])
]
team2seeds = [
    seed_number(seed_codes[key])
    for key in zip(TourneyInput['Season'], TourneyInput['Team2'])
]

TourneyInput['Team1Seeds'] = team1seeds
TourneyInput['Team2Seeds'] = team2seeds

In [None]:
conference_dict = conference.set_index(['Season', 'TeamID'])

# (Season, TeamID) -> first remaining column of conference_dict (the value the
# original `.values[0]` access returned), built once so each game needs only
# two dict lookups.
conference_by_team = conference_dict.iloc[:, 0].to_dict()

team1conf = [
    conference_by_team[key]
    for key in zip(TourneyInput['Season'], TourneyInput['Team1'])
]
team2conf = [
    conference_by_team[key]
    for key in zip(TourneyInput['Season'], TourneyInput['Team2'])
]

TourneyInput['Team1Conference'] = team1conf
TourneyInput['Team2Conference'] = team2conf

display(TourneyInput)

Unnamed: 0,Season,Team1,Team2,Result,Team1Seeds,Team2Seeds,Team1Conference,Team2Conference
0,2003,1421,1411,1,16,16,big_south,swac
1,2003,1112,1436,1,1,16,pac_ten,aec
2,2003,1113,1272,1,10,7,pac_ten,cusa
3,2003,1141,1166,1,11,6,mac,mvc
4,2003,1143,1301,1,8,9,pac_ten,acc
...,...,...,...,...,...,...,...,...
2357,2021,1425,1211,0,6,1,pac_twelve,wcc
2358,2021,1276,1417,0,1,11,big_ten,pac_twelve
2359,2021,1222,1124,0,2,1,aac,big_twelve
2360,2021,1417,1211,0,11,1,pac_twelve,wcc


In [None]:
# One feature row per tournament matchup: Team1's regular-season features
# (plus seed) minus Team2's, labelled with the game result.
matchup_rows = []

for game in TourneyInput.itertuples(index=False):
    # .copy() so that adding 'Seed' can never write back into
    # RegularSeasonInput.
    team1score = RegularSeasonInput.loc[(game.Season, game.Team1)].copy()
    team1score['Seed'] = game.Team1Seeds

    team2score = RegularSeasonInput.loc[(game.Season, game.Team2)].copy()
    team2score['Seed'] = game.Team2Seeds

    outscore = team1score - team2score
    outscore['Result'] = game.Result
    matchup_rows.append(outscore)

# The previous version also looked up Team2's row a third time and attached a
# 'Conference' value that was never used; that dead work overwrote the
# `team2conf` list and has been removed.
outscores = pd.DataFrame(matchup_rows)

display(outscores)
display(outscores.describe())

Unnamed: 0,WinRatio,PointsPerGame,PointsAllowedPerGame,PointsRatio,OTsPerGame,FGPerGame,FGRatio,FGAPerGame,FGAllowedPerGame,FG3PerGame,...,StlPerGame,OppStlPerGame,TOPerGame,OppTOPerGame,BlkPerGame,OppBlkPerGame,PFPerGame,OppPFPerGame,Seed,Result
0,-0.151724,-1.593103,7.614943,-0.120072,0.139080,-0.354023,-0.018262,1.526437,2.126437,0.549425,...,0.635632,0.827586,0.973563,-1.505747,0.766667,1.641379,0.803448,-2.943678,0.0,1.0
1,0.237685,17.421182,7.112069,0.139285,0.002463,5.493842,0.016969,9.852217,3.598522,1.759852,...,1.602217,-1.139163,0.716749,3.857143,1.248768,-1.262315,1.853448,4.140394,-15.0,1.0
2,-0.172414,1.448276,3.344828,-0.033801,0.034483,0.931034,0.040251,-3.103448,1.517241,-3.000000,...,-2.172414,-1.275862,0.206897,0.448276,-0.827586,0.758621,0.655172,2.931034,3.0,1.0
3,-0.085684,0.102403,8.908046,-0.148414,-0.030303,-2.076280,0.005763,-4.764890,2.466040,-1.142111,...,-1.290491,2.114943,4.877743,-0.991641,-0.454545,-0.092999,3.692790,2.295716,5.0,1.0
4,0.124138,2.082759,1.758621,0.003015,0.070115,3.011494,0.009399,5.390805,1.945977,-1.552874,...,-1.214943,-1.502299,-0.027586,0.297701,-0.273563,-0.316092,-1.563218,-0.229885,-1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2357,-0.241379,-17.356764,-4.011936,-0.184472,0.137931,-6.981432,-0.083767,-3.827586,-2.025199,-0.997347,...,-3.514589,-0.551724,0.180371,-3.461538,2.395225,-0.057029,0.905836,1.067639,5.0,0.0
2358,0.179487,3.403846,-3.125000,0.102900,-0.189103,1.490385,0.021778,0.509615,-0.782051,0.887821,...,-0.660256,-0.285256,-0.217949,-1.653846,1.637821,-0.102564,1.862179,2.317308,-10.0,0.0
2359,-0.032051,-8.067308,-8.108974,0.038318,-0.041667,-4.429487,-0.054730,-2.233974,-4.650641,-1.217949,...,-1.112179,-1.243590,-2.012821,-3.025641,0.903846,-1.256410,0.057692,-0.339744,1.0,0.0
2360,-0.346154,-19.269231,-0.615385,-0.269329,0.230769,-7.769231,-0.088916,-4.884615,-0.961538,-0.653846,...,-3.230769,-0.923077,-1.038462,-3.307692,-0.192308,-0.115385,0.769231,-0.038462,10.0,0.0


Unnamed: 0,WinRatio,PointsPerGame,PointsAllowedPerGame,PointsRatio,OTsPerGame,FGPerGame,FGRatio,FGAPerGame,FGAllowedPerGame,FG3PerGame,...,StlPerGame,OppStlPerGame,TOPerGame,OppTOPerGame,BlkPerGame,OppBlkPerGame,PFPerGame,OppPFPerGame,Seed,Result
count,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,...,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0,2362.0
mean,4.653348e-18,-2.226086e-16,-2.25316e-15,-3.2150400000000004e-17,6.800821e-18,1.323619e-16,-4.559341e-18,-2.256169e-16,1.669565e-16,-1.086721e-16,...,1.522914e-16,-7.971796e-17,1.458989e-16,-1.098002e-16,1.985428e-16,-2.5005870000000003e-17,-2.361457e-16,-2.9518210000000005e-17,0.0,0.5
std,0.1460046,7.3359,6.4253,0.1066465,0.07344928,2.935077,0.03266969,4.856,2.664932,1.807514,...,1.859996,1.250696,2.094783,2.551147,1.831655,0.8870199,2.497858,2.066392,7.487841,0.500106
min,-0.6333333,-22.89286,-25.03571,-0.4004886,-0.2931689,-9.357143,-0.1211496,-22.07143,-9.97861,-6.0625,...,-6.283681,-4.533333,-7.666667,-9.499051,-6.752688,-2.924099,-7.896552,-6.809384,-15.0,0.0
25%,-0.09375,-4.864448,-4.359807,-0.06555868,-0.03548387,-1.998377,-0.02177467,-3.298872,-1.741014,-1.1875,...,-1.237955,-0.8243299,-1.349484,-1.68892,-1.164783,-0.567803,-1.727774,-1.346507,-5.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,0.09375,4.864448,4.359807,0.06555868,0.03548387,1.998377,0.02177467,3.298872,1.741014,1.1875,...,1.237955,0.8243299,1.349484,1.68892,1.164783,0.567803,1.727774,1.346507,5.0,1.0
max,0.6333333,22.89286,25.03571,0.4004886,0.2931689,9.357143,0.1211496,22.07143,9.97861,6.0625,...,6.283681,4.533333,7.666667,9.499051,6.752688,2.924099,7.896552,6.809384,15.0,1.0


In [None]:
# Pairwise correlations between all matchup columns, rounded to two decimals.
corrs = outscores.corr().round(2)
corrs

Unnamed: 0,WinRatio,PointsPerGame,PointsAllowedPerGame,PointsRatio,OTsPerGame,FGPerGame,FGRatio,FGAPerGame,FGAllowedPerGame,FG3PerGame,...,StlPerGame,OppStlPerGame,TOPerGame,OppTOPerGame,BlkPerGame,OppBlkPerGame,PFPerGame,OppPFPerGame,Seed,Result
WinRatio,1.0,0.45,-0.35,0.84,-0.23,0.45,0.5,0.17,-0.21,0.13,...,0.15,-0.18,-0.25,0.08,0.24,-0.17,-0.26,0.04,-0.63,0.34
PointsPerGame,0.45,1.0,0.56,0.46,-0.06,0.92,0.56,0.73,0.57,0.35,...,0.31,0.19,0.09,0.25,0.25,0.15,0.08,0.32,-0.42,0.23
PointsAllowedPerGame,-0.35,0.56,1.0,-0.48,0.2,0.44,0.03,0.54,0.88,0.21,...,0.14,0.38,0.35,0.15,-0.04,0.31,0.32,0.35,0.23,-0.17
PointsRatio,0.84,0.46,-0.48,1.0,-0.27,0.5,0.55,0.19,-0.34,0.15,...,0.17,-0.2,-0.29,0.11,0.3,-0.18,-0.26,-0.02,-0.68,0.42
OTsPerGame,-0.23,-0.06,0.2,-0.27,1.0,-0.09,-0.14,-0.0,0.17,-0.07,...,-0.05,0.08,0.09,-0.06,-0.02,0.08,0.02,0.04,0.14,-0.12
FGPerGame,0.45,0.92,0.44,0.5,-0.09,1.0,0.64,0.77,0.48,0.19,...,0.33,0.17,0.06,0.23,0.31,0.12,-0.03,0.08,-0.44,0.26
FGRatio,0.5,0.56,0.03,0.55,-0.14,0.64,1.0,0.01,0.16,0.08,...,-0.04,0.08,0.01,-0.14,0.09,-0.24,-0.25,-0.05,-0.35,0.21
FGAPerGame,0.17,0.73,0.54,0.19,-0.0,0.77,0.01,1.0,0.5,0.18,...,0.47,0.16,0.06,0.42,0.33,0.35,0.18,0.15,-0.28,0.17
FGAllowedPerGame,-0.21,0.57,0.88,-0.34,0.17,0.48,0.16,0.5,1.0,0.24,...,0.03,0.27,0.17,-0.03,-0.03,0.25,-0.08,0.2,0.1,-0.1
FG3PerGame,0.13,0.35,0.21,0.15,-0.07,0.19,0.08,0.18,0.24,1.0,...,0.04,-0.14,-0.25,0.02,-0.19,-0.12,-0.03,-0.14,-0.09,0.05


In [None]:
# Correlation of every column with the game result.
display(corrs['Result'])

WinRatio                0.34
PointsPerGame           0.23
PointsAllowedPerGame   -0.17
PointsRatio             0.42
OTsPerGame             -0.12
FGPerGame               0.26
FGRatio                 0.21
FGAPerGame              0.17
FGAllowedPerGame       -0.10
FG3PerGame              0.05
FG3Ratio                0.10
FG3APerGame             0.01
FG3AllowedPerGame      -0.06
FTPerGame               0.02
FTRatio                 0.04
FTAPerGame              0.00
FTllowedPerGame        -0.17
ORPerGame               0.15
DRPerGame               0.12
TRPerGame               0.17
OppORPerGame            0.05
OppDRPerGame           -0.18
OppTRPerGame           -0.10
ORRatio                 0.10
DRRatio                 0.21
TRRatio                 0.21
AstPerGame              0.19
OppAstPerGame          -0.10
StlPerGame              0.11
OppStlPerGame          -0.10
TOPerGame              -0.16
OppTOPerGame            0.06
BlkPerGame              0.21
OppBlkPerGame           0.01
PFPerGame     

In [None]:
# Draw on an explicit Axes so the heatmap is tied to this figure, and title it
# so the plot can be read on its own.
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(corrs, ax=ax)
ax.set_title('Correlation matrix of matchup feature differences')
plt.show()

NameError: ignored

<Figure size 1080x720 with 0 Axes>

In [None]:
# Features are every column except the last one ('Result'), which is the label.
feature_columns = outscores.columns[:-1]
X = outscores[feature_columns].values
y = outscores['Result'].values

# Seeded shuffle of the row positions.
np.random.seed(9)
idx = np.random.permutation(len(X))

# A negative cut-off: the trailing 20% of the shuffled rows form the test set.
cut = int(-.2*len(X))
train_idx, test_idx = idx[:cut], idx[cut:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Min-max scale with statistics taken from the training rows only.
mins = X_train.min(axis=0)
maxs = X_train.max(axis=0)
span = maxs - mins

X_train = (X_train - mins) / span
X_test = (X_test - mins) / span

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1890, 37) (472, 37) (1890,) (472,)


In [None]:
# Sanity check: number of rows assigned to the training split.
len(train_idx)

1890

In [None]:
# Sanity check: number of rows assigned to the test split.
len(test_idx)

472

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed random_state, fit on the scaled training rows and
# scored (mean accuracy) on the held-out rows.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print('score = ', test_accuracy)

score =  0.7076271186440678


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fit on the min-max scaled training features.
model = LogisticRegression(random_state=0)
model = model.fit(X_train, y_train)

print('score = ',model.score(X_test,y_test))

print('\nintercept = ',model.intercept_)
# Fixed label: this line prints the coefficient vector, not the intercept.
print('\ncoefficients = ',model.coef_)

# The model was trained on scaled inputs, so the full matrix must be scaled
# with the same training-set mins/maxs before predicting. Feeding the raw X
# produced saturated 0/1 probabilities.
X_scaled = (X - mins) / (maxs - mins)
print('\n',model.predict_proba(X_scaled))

score =  0.7139830508474576

intercept =  [1.07582715]

intercept =  [[-0.69950893  0.9383768  -0.61198441  1.26762517 -0.30388233  1.15756838
   0.83470362  0.44806529 -0.84730208  0.24837997 -0.61999892  0.3559452
  -0.05956582 -0.20684545  0.59257049 -0.55781199  0.09663162  0.39836738
  -0.78859725 -0.19740637  0.02584335 -0.68713176 -0.46582078  0.26865163
   0.10200305 -0.03450883 -1.14167085  0.2507588   0.68395733 -0.30141519
  -0.93269957  0.7792166   1.185866    0.72229925 -0.81954901  0.20819803
  -3.43618487]]

 [[6.67343837e-01 3.32656163e-01]
 [0.00000000e+00 1.00000000e+00]
 [9.99966780e-01 3.32202993e-05]
 ...
 [9.98302605e-01 1.69739471e-03]
 [1.00000000e+00 6.19209328e-25]
 [9.99556910e-01 4.43089597e-04]]


In [None]:
# Inspect the raw tournament seed table.
seeds

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374
...,...,...,...
2349,2021,Z12,1457
2350,2021,Z13,1317
2351,2021,Z14,1159
2352,2021,Z15,1331


In [None]:
# Column names of `outscores`, in order.
list(outscores.columns)

['WinRatio',
 'PointsPerGame',
 'PointsAllowedPerGame',
 'PointsRatio',
 'OTsPerGame',
 'FGPerGame',
 'FGRatio',
 'FGAPerGame',
 'FGAllowedPerGame',
 'FG3PerGame',
 'FG3Ratio',
 'FG3APerGame',
 'FG3AllowedPerGame',
 'FTPerGame',
 'FTRatio',
 'FTAPerGame',
 'FTllowedPerGame',
 'ORPerGame',
 'DRPerGame',
 'TRPerGame',
 'OppORPerGame',
 'OppDRPerGame',
 'OppTRPerGame',
 'ORRatio',
 'DRRatio',
 'TRRatio',
 'AstPerGame',
 'OppAstPerGame',
 'StlPerGame',
 'OppStlPerGame',
 'TOPerGame',
 'OppTOPerGame',
 'BlkPerGame',
 'OppBlkPerGame',
 'PFPerGame',
 'OppPFPerGame',
 'Seed',
 'Result']

In [None]:
# Enable Arrow for the pandas -> Spark conversion. Spark 3.x uses the
# "spark.sql.execution.arrow.pyspark.enabled" key; the older
# "spark.sql.execution.arrow.enabled" key is deprecated.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled","true")
df=spark.createDataFrame(outscores)
df.printSchema()
df.show()

root
 |-- WinRatio: double (nullable = true)
 |-- PointsPerGame: double (nullable = true)
 |-- PointsAllowedPerGame: double (nullable = true)
 |-- PointsRatio: double (nullable = true)
 |-- OTsPerGame: double (nullable = true)
 |-- FGPerGame: double (nullable = true)
 |-- FGRatio: double (nullable = true)
 |-- FGAPerGame: double (nullable = true)
 |-- FGAllowedPerGame: double (nullable = true)
 |-- FG3PerGame: double (nullable = true)
 |-- FG3Ratio: double (nullable = true)
 |-- FG3APerGame: double (nullable = true)
 |-- FG3AllowedPerGame: double (nullable = true)
 |-- FTPerGame: double (nullable = true)
 |-- FTRatio: double (nullable = true)
 |-- FTAPerGame: double (nullable = true)
 |-- FTllowedPerGame: double (nullable = true)
 |-- ORPerGame: double (nullable = true)
 |-- DRPerGame: double (nullable = true)
 |-- TRPerGame: double (nullable = true)
 |-- OppORPerGame: double (nullable = true)
 |-- OppDRPerGame: double (nullable = true)
 |-- OppTRPerGame: double (nullable = true)
 |-- 

# Model

In [None]:
# Assemble all the features with VectorAssembler
from pyspark.ml.feature import VectorAssembler

# Every column of `df` except the 'Result' label is a feature. Deriving the
# list from the DataFrame keeps it in step with the engineered columns (same
# names, same order) instead of maintaining a hand-copied list of 37 names.
required_features = [name for name in df.columns if name != 'Result']

assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(df)

In [None]:
# Preview the assembled rows, including the new 'features' vector column.
transformed_data.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+------+--------------------+
|            WinRatio|      PointsPerGame|PointsAllowedPerGame|         PointsRatio|          OTsPerGame|           FGPerGame|             FGRatio|         FGAPerGame|   FGAllowedPerGame|          FG3PerGame|       

In [None]:
# 80/20 train/test split. The seed is fixed so repeated runs draw the same
# split instead of a fresh random one each time.
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2], seed=9)

In [None]:
# NOTE: this import rebinds the name `LogisticRegression`, which an earlier
# cell bound to scikit-learn's class of the same name.
from pyspark.ml.classification import LogisticRegression

# Logistic regression with regParam/elasticNetParam regularisation on the
# assembled feature vector. NOTE(review): the recorded output shows all but
# one coefficient at zero, so this penalty may be too strong. Confirm.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Result',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
M1 = lr.fit(training_data)

print(f"Coefficients: {M1.coefficients}")
print(f"Intercept: {M1.intercept}")

Coefficients: (37,[36],[-0.003627448055896554])
Intercept: -0.023637692034646562
