<a href="https://colab.research.google.com/github/andrewmarion/MarchMadness/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# March Madness
## Final Project
### Group 15: Andrew Marion, Dallas Hutchinson, Aydan Koyles


# Setting Up File

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

# Imports

In [26]:
import json
from pyspark.mllib.classification import LogisticRegressionModel,LogisticRegressionWithLBFGS, SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.clustering import *
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import LDA
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.functions import lit

# Read in csv file

In [89]:
RegularSeason = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv", inferSchema = True)
Seeds = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneySeeds.csv", inferSchema = True)
Tourney = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneyDetailedResults.csv", inferSchema = True)
Massey = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MMasseyOrdinals.csv", inferSchema = True)
Conferences = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneySeeds.csv", inferSchema = True)
Coaches = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MTeamCoaches.csv", inferSchema = True)
TourneyCompact = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneyCompactResults.csv", inferSchema = True)


# Clean / Explore Data

## Season Input

In [9]:
WinTeams = RegularSeason.groupBy('Season', 'WTeamID').sum('WScore','LScore','NumOT','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst','WTO','WStl','WBlk','WPF','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst','LTO','LStl','LBlk','LPF')
LoseTeams = RegularSeason.groupBy('Season', 'LTeamID').sum('WScore','LScore','NumOT','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst','WTO','WStl','WBlk','WPF','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst','LTO','LStl','LBlk','LPF')

In [10]:
NumWin = RegularSeason.groupBy('Season', 'WTeamID').count()
NumWin = NumWin.withColumnRenamed("count", "Wins")
NumWin = NumWin.withColumnRenamed("WTeamID", "TeamID")

NumLose = RegularSeason.groupBy('Season', 'LTeamID').count()
NumLose = NumLose.withColumnRenamed("count", "Loses")
NumLose = NumLose.withColumnRenamed("LTeamID", "TeamID")

merged_df = NumWin.join(NumLose, ['Season','TeamID'])
merged_df = merged_df.withColumn('NumGames', merged_df.Wins + merged_df.Loses)
merged_df.show()

+------+------+----+-----+--------+
|Season|TeamID|Wins|Loses|NumGames|
+------+------+----+-----+--------+
|  2003|  1226|  13|   14|      27|
|  2003|  1189|  10|   15|      25|
|  2009|  1401|  23|    9|      32|
|  2011|  1431|  24|    9|      33|
|  2006|  1173|  13|   17|      30|
|  2006|  1153|  18|   12|      30|
|  2007|  1275|  18|   14|      32|
|  2007|  1175|  20|   12|      32|
|  2008|  1145|   6|   25|      31|
|  2008|  1313|  16|   15|      31|
|  2008|  1229|  23|    9|      32|
|  2008|  1325|  19|   12|      31|
|  2009|  1286|  13|   17|      30|
|  2009|  1189|   9|   20|      29|
|  2009|  1267|  12|   17|      29|
|  2012|  1334|   9|   19|      28|
|  2013|  1158|  24|    9|      33|
|  2013|  1104|  20|   12|      32|
|  2015|  1165|  12|   17|      29|
|  2015|  1337|  17|   13|      30|
+------+------+----+-----+--------+
only showing top 20 rows



for win team: rename all w to generaric ones and all L to opp as they are opponent stats.

Also add count to get number of wins




for lose team: rename all L to generaric ones and all W to opp as they are opponent stats

Also add count to get number of loses

the combine them off of team id

In [11]:
# Get all win stats for each team
WinTeams = WinTeams.withColumnRenamed("WTeamID", "TeamID")
WinTeams = WinTeams.withColumnRenamed("sum(WScore)", "WPTS")
WinTeams = WinTeams.withColumnRenamed("sum(LScore)", "WOppPTS")
WinTeams = WinTeams.withColumnRenamed("sum(NumOT)", "WOTs")
WinTeams = WinTeams.withColumnRenamed("sum(WFGM)", "WFGM")
WinTeams = WinTeams.withColumnRenamed("sum(WFGA)", "WFGA")
WinTeams = WinTeams.withColumnRenamed("sum(WFGA3)", "WFGA3")
WinTeams = WinTeams.withColumnRenamed("sum(WFGM3)", "WFGM3")
WinTeams = WinTeams.withColumnRenamed("sum(WFTA)", "WFTA")
WinTeams = WinTeams.withColumnRenamed("sum(WFTM)", "WFTM")
WinTeams = WinTeams.withColumnRenamed("sum(WOR)", "WOR")
WinTeams = WinTeams.withColumnRenamed("sum(WDR)", "WDR")
WinTeams = WinTeams.withColumnRenamed("sum(WAst)", "WAST")
WinTeams = WinTeams.withColumnRenamed("sum(WTO)", "WTO")
WinTeams = WinTeams.withColumnRenamed("sum(WStl)", "WSTL")
WinTeams = WinTeams.withColumnRenamed("sum(WBlk)", "WBLK")
WinTeams = WinTeams.withColumnRenamed("sum(WPF)", "WPF")
WinTeams = WinTeams.withColumnRenamed("sum(LFGM)", "WOppFGM")
WinTeams = WinTeams.withColumnRenamed("sum(LFGA)", "WOppFGA")
WinTeams = WinTeams.withColumnRenamed("sum(LFGA3)", "WOppFGA3")
WinTeams = WinTeams.withColumnRenamed("sum(LFGM3)", "WOppFGM3")
WinTeams = WinTeams.withColumnRenamed("sum(LFTA)", "WOppFTA")
WinTeams = WinTeams.withColumnRenamed("sum(LFTM)", "WOppFTM")
WinTeams = WinTeams.withColumnRenamed("sum(LOR)", "WOppOR")
WinTeams = WinTeams.withColumnRenamed("sum(LDR)", "WOppDR")
WinTeams = WinTeams.withColumnRenamed("sum(LAst)", "WOppAST")
WinTeams = WinTeams.withColumnRenamed("sum(LTO)", "WOppTO")
WinTeams = WinTeams.withColumnRenamed("sum(LStl)", "WOppSTL")
WinTeams = WinTeams.withColumnRenamed("sum(LBlk)", "WOppBLK")
WinTeams = WinTeams.withColumnRenamed("sum(LPF)", "WOppPF")

In [12]:
# Get all loss stats for each team
LoseTeams = LoseTeams.withColumnRenamed("LTeamID", "TeamID")
LoseTeams = LoseTeams.withColumnRenamed("sum(WScore)", "LOppPTS")
LoseTeams = LoseTeams.withColumnRenamed("sum(LScore)", "LPTS")
LoseTeams = LoseTeams.withColumnRenamed("sum(NumOT)", "LOTs")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGM)", "LOppFGM")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGA)", "LOppFGA")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGA3)", "LOppFGA3")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGM3)", "LOppFGM3")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFTA)", "LOppFTA")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFTM)", "LOppFTM")
LoseTeams = LoseTeams.withColumnRenamed("sum(WOR)", "LOppOR")
LoseTeams = LoseTeams.withColumnRenamed("sum(WDR)", "LOppDR")
LoseTeams = LoseTeams.withColumnRenamed("sum(WAst)", "LOppAST")
LoseTeams = LoseTeams.withColumnRenamed("sum(WTO)", "LOppTO")
LoseTeams = LoseTeams.withColumnRenamed("sum(WStl)", "LOppSTL")
LoseTeams = LoseTeams.withColumnRenamed("sum(WBlk)", "LOppBLK")
LoseTeams = LoseTeams.withColumnRenamed("sum(WPF)", "LOppPF")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGM)", "LFGM")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGA)", "LFGA")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGA3)", "LFGA3")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGM3)", "LFGM3")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFTA)", "LFTA")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFTM)", "LFTM")
LoseTeams = LoseTeams.withColumnRenamed("sum(LOR)", "LOR")
LoseTeams = LoseTeams.withColumnRenamed("sum(LDR)", "LDR")
LoseTeams = LoseTeams.withColumnRenamed("sum(LAst)", "LAST")
LoseTeams = LoseTeams.withColumnRenamed("sum(LTO)", "LTO")
LoseTeams = LoseTeams.withColumnRenamed("sum(LStl)", "LSTL")
LoseTeams = LoseTeams.withColumnRenamed("sum(LBlk)", "LBLK")
LoseTeams = LoseTeams.withColumnRenamed("sum(LPF)", "LPF")

In [13]:
#LoseTeams.filter(LoseTeams.Season == 2022).show()

In [14]:
# combine win teams and lose teams
Season = WinTeams.join(LoseTeams, ['Season','TeamID'])

In [15]:
# Combine coluns
Season = Season.withColumn("OppPTS", Season.WOppPTS + Season.LOppPTS) 
Season = Season.withColumn("PTS", Season.WPTS + Season.LPTS) 
Season = Season.withColumn("OTs", Season.WOTs + Season.LOTs) 
Season = Season.withColumn("OppFGM", Season.WOppFGM + Season.LOppFGM)
Season = Season.withColumn("OppFGA", Season.WOppFGA + Season.LOppFGA)
Season = Season.withColumn("OppFGA3", Season.WOppFGA3 + Season.LOppFGA3)
Season = Season.withColumn("OppFGM3", Season.WOppFGM3 + Season.LOppFGM3) 
Season = Season.withColumn("OppFTA", Season.WOppFTA + Season.LOppFTA) 
Season = Season.withColumn("OppFTM", Season.WOppFTM + Season.LOppFTM) 
Season = Season.withColumn("OppOR", Season.WOppOR + Season.LOppOR) 
Season = Season.withColumn("OppDR", Season.WOppDR + Season.LOppDR) 
Season = Season.withColumn("OppAST", Season.WOppAST + Season.LOppAST) 
Season = Season.withColumn("OppTO", Season.WOppTO + Season.LOppTO) 
Season = Season.withColumn("OppSTL", Season.WOppSTL + Season.LOppSTL) 
Season = Season.withColumn("OppBLK", Season.WOppBLK + Season.LOppBLK) 
Season = Season.withColumn("OppPF", Season.WOppPF + Season.LOppPF) 
Season = Season.withColumn("FGM", Season.WFGM + Season.LFGM) 
Season = Season.withColumn("FGA", Season.WFGA + Season.LFGA) 
Season = Season.withColumn("FGA3", Season.WFGA3 + Season.LFGA3) 
Season = Season.withColumn("FGM3", Season.WFGM3 + Season.LFGM3) 
Season = Season.withColumn("FTA", Season.WFTA + Season.LFTA) 
Season = Season.withColumn("FTM", Season.WFTM + Season.LFTM) 
Season = Season.withColumn("OR", Season.WOR + Season.LOR) 
Season = Season.withColumn("DR", Season.WDR + Season.LDR) 
Season = Season.withColumn("AST", Season.WAST + Season.LAST)
Season = Season.withColumn("TO", Season.WTO + Season.LTO) 
Season = Season.withColumn("STL", Season.WSTL + Season.LSTL)
Season = Season.withColumn("BLK", Season.WBLK + Season.LBLK)
Season = Season.withColumn("PF", Season.WPF + Season.LPF)

# Drop Unwanted Columns
Season =  Season.drop("WPTS","WOppPTS","WOTs","WFGM","WFGA","WFGM3","WFGA3","WFTM","WFTA","WOR","WDR","WAST","WTO","WSTL","WBLK","WPF","WOppFGM","WOppFGA","WOppFGM3","WOppFGA3","WOppFTM","WOppFTA","WOppOR","WOppDR","WOppAST","WOppTO","WOppSTL","WOppBLK","WOppPF","LOppPTS","LPTS","LOTs","LOppFGM","LOppFGA","LOppFGM3","LOppFGA3","LOppFTM","LOppFTA","LOppOR","LOppDR","LOppAST","LOppTO","LOppSTL","LOppBLK","LOppPF","LFGM","LFGA","LFGM3","LFGA3","LFTM","LFTA","LOR","LDR","LAST","LTO","LSTL","LBLK","LPF")

In [16]:
# Combine merged and season
Season_Stats = Season.join(merged_df, ['Season','TeamID'])

Make season stats for input

In [17]:
#Wins, PPG, PPG Allowed, Points Ratio, OTs
#RegularSeasonInput['WinRatio'] = combinedTeams['Wins'] / combinedTeams['NumGames']
RegularSeasonInput = Season_Stats.withColumn("WinRatio", Season_Stats.Wins /  Season_Stats.NumGames) 
#RegularSeasonInput['PointsPerGame'] = combinedTeams['Points'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("PointsPerGame", Season_Stats.PTS /  Season_Stats.NumGames)
#RegularSeasonInput['PointsAllowedPerGame'] = combinedTeams['OppPoints'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("PointsAllowedPerGame", Season_Stats.OppPTS /  Season_Stats.NumGames)
#RegularSeasonInput['PointsRatio'] = combinedTeams['Points'] / combinedTeams['OppPoints']
RegularSeasonInput = RegularSeasonInput.withColumn("PointsRatio", Season_Stats.PTS /  Season_Stats.OppPTS)
#RegularSeasonInput['OTsPerGame'] = combinedTeams['NumOT'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OTsPerGame", Season_Stats.OTs /  Season_Stats.NumGames)
#Field Goals
#RegularSeasonInput['FGPerGame'] = combinedTeams['FGM'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FGPerGame", Season_Stats.FGM /  Season_Stats.NumGames)
#RegularSeasonInput['FGRatio'] = combinedTeams['FGM'] / combinedTeams['FGA']
RegularSeasonInput = RegularSeasonInput.withColumn("FGRatio", Season_Stats.FGM /  Season_Stats.FGA)
#RegularSeasonInput['FGAPerGame'] = combinedTeams['FGA'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FGAPerGame", Season_Stats.FGA /  Season_Stats.NumGames)
#RegularSeasonInput['FGAllowedPerGame'] = combinedTeams['OppFGM'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FGAllowedPerGame", Season_Stats.OppFGM /  Season_Stats.NumGames)
#Three Pointers
#RegularSeasonInput['FG3PerGame'] = combinedTeams['FGM3'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FG3PerGame", Season_Stats.FGM3 /  Season_Stats.NumGames)
#RegularSeasonInput['FG3Ratio'] = combinedTeams['FGM3'] / combinedTeams['FGA3']
RegularSeasonInput = RegularSeasonInput.withColumn("FG3Ratio'", Season_Stats.FGM3 /  Season_Stats.FGA3)
#RegularSeasonInput['FG3APerGame'] = combinedTeams['FGA3'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FG3APerGame", Season_Stats.FGA3 /  Season_Stats.NumGames)
#RegularSeasonInput['FG3AllowedPerGame'] = combinedTeams['OppFGM3'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FG3AllowedPerGame", Season_Stats.OppFGM3 /  Season_Stats.NumGames)
#Free Throws
#RegularSeasonInput['FTPerGame'] = combinedTeams['FTM'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FTPerGame", Season_Stats.FTM /  Season_Stats.NumGames)
#RegularSeasonInput['FTRatio'] = combinedTeams['FTM'] / combinedTeams['FTA']
RegularSeasonInput = RegularSeasonInput.withColumn("FTRatio", Season_Stats.FTM /  Season_Stats.FTA)
#RegularSeasonInput['FTAPerGame'] = combinedTeams['FTA'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FTAPerGame", Season_Stats.FTA /  Season_Stats.NumGames)
#RegularSeasonInput['FTllowedPerGame'] = combinedTeams['OppFTM'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("FTllowedPerGame", Season_Stats.OppFTM /  Season_Stats.NumGames)
#Rebounds
#RegularSeasonInput['ORPerGame'] = combinedTeams['OR'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("ORPerGame", Season_Stats.OR /  Season_Stats.NumGames)
#RegularSeasonInput['DRPerGame'] = combinedTeams['DR'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("DRPerGame", Season_Stats.DR /  Season_Stats.NumGames)
#RegularSeasonInput['TRPerGame'] = (combinedTeams['OR'] + combinedTeams['DR']) / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("TRPerGame", (Season_Stats.OR+ Season_Stats.DR) /  Season_Stats.NumGames)
#RegularSeasonInput['OppORPerGame'] = combinedTeams['OppOR'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppORPerGame", Season_Stats.OppOR /  Season_Stats.NumGames)
#RegularSeasonInput['OppDRPerGame'] = combinedTeams['OppDR'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppDRPerGam", Season_Stats.OppDR /  Season_Stats.NumGames)
#RegularSeasonInput['OppTRPerGame'] = (combinedTeams['OppOR'] + combinedTeams['OppDR']) / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppTRPerGame", (Season_Stats.OppOR +  Season_Stats.OppDR) / Season_Stats.NumGames)
#RegularSeasonInput['ORRatio'] = combinedTeams['OR'] / combinedTeams['OppOR']
RegularSeasonInput = RegularSeasonInput.withColumn("ORRatio", Season_Stats.OR /  Season_Stats.OppOR)
#RegularSeasonInput['DRRatio'] = combinedTeams['DR'] / combinedTeams['OppDR']
RegularSeasonInput = RegularSeasonInput.withColumn("DRRatio", Season_Stats.DR /  Season_Stats.OppDR)
#RegularSeasonInput['TRRatio'] = (combinedTeams['OR'] + combinedTeams['DR']) / (combinedTeams['OppOR'] + combinedTeams['OppDR'])
RegularSeasonInput = RegularSeasonInput.withColumn("TRRatio", (Season_Stats.OR +  Season_Stats.DR) / (Season_Stats.OppOR +  Season_Stats.OppDR))
#Assists
#RegularSeasonInput['AstPerGame'] = combinedTeams['Ast'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("AstPerGame", Season_Stats.AST /  Season_Stats.NumGames)
#RegularSeasonInput['OppAstPerGame'] = combinedTeams['OppAst'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppAstPerGame", Season_Stats.OppAST /  Season_Stats.NumGames)
#Steals
#RegularSeasonInput['StlPerGame'] = combinedTeams['Stl'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("StlPerGame", Season_Stats.STL /  Season_Stats.NumGames)
#RegularSeasonInput['OppStlPerGame'] = combinedTeams['OppStl'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppStlPerGame", Season_Stats.OppSTL /  Season_Stats.NumGames)
#Turnovers
#RegularSeasonInput['TOPerGame'] = combinedTeams['TO'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("TOPerGame", Season_Stats.TO / Season_Stats.NumGames)
#RegularSeasonInput['OppTOPerGame'] = combinedTeams['OppTO'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppTOPerGame", Season_Stats.OppTO / Season_Stats.NumGames)
#Blocks
#RegularSeasonInput['BlkPerGame'] = combinedTeams['Blk'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("BlkPerGame", Season_Stats.BLK / Season_Stats.NumGames)
#RegularSeasonInput['OppBlkPerGame'] = combinedTeams['OppBlk'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppBlkPerGame", Season_Stats.OppBLK /  Season_Stats.NumGames)
#Personal Fouls
#RegularSeasonInput['PFPerGame'] = combinedTeams['PF'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("PFPerGame", Season_Stats.PF /  Season_Stats.NumGames)
#RegularSeasonInput['OppPFPerGame'] = combinedTeams['OppPF'] / combinedTeams['NumGames']
RegularSeasonInput = RegularSeasonInput.withColumn("OppPFPerGame", Season_Stats.OppPF /  Season_Stats.NumGames)

#drop unwanted columns
RegularSeasonInput =  RegularSeasonInput.drop("PTS","OppPTS","OTs","FGM","FGA","FGM3","FGA3","FTM","FTA","OR","DR","AST","TO","STL","BLK","PF","OppFGM","OppFGA","OppFGM3","OppFGA3","OppFTM","OppFTA","OppOR","OppDR","OppAST","OppTO","OppSTL","OppBLK","OppPF")
#show
RegularSeasonInput.show()

+------+------+----+-----+--------+-------------------+------------------+--------------------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|Season|TeamID|Wins|Loses|NumGames|           WinRatio|     PointsPerGame|PointsAllowedPerGame|       PointsRatio|          OTsPerGame|         FGPerGame|            FGRatio|        FGAPerGame|  FGAllowedPerGame|        FG3PerGame|          FG3Ratio'|       FG3APerGame| FG

In [18]:
RegularSeasonInput.describe().show()

+-------+------------------+------------------+------------------+------------------+-----------------+-------------------+-----------------+--------------------+------------------+-------------------+------------------+--------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|            Season|            TeamID|              Wins|             Loses|         NumGames|           WinRatio|    PointsPerGame|PointsAllowedPerGame|       PointsRatio|         OTsPerGame

## Tourney Input

In [90]:
Seeds = Seeds.withColumn('Seed', Seeds.Seed.substr(2,2))
Seeds.show()

+------+----+------+
|Season|Seed|TeamID|
+------+----+------+
|  1985|  01|  1207|
|  1985|  02|  1210|
|  1985|  03|  1228|
|  1985|  04|  1260|
|  1985|  05|  1374|
|  1985|  06|  1208|
|  1985|  07|  1393|
|  1985|  08|  1396|
|  1985|  09|  1439|
|  1985|  10|  1177|
|  1985|  11|  1455|
|  1985|  12|  1330|
|  1985|  13|  1233|
|  1985|  14|  1318|
|  1985|  15|  1273|
|  1985|  16|  1250|
|  1985|  01|  1385|
|  1985|  02|  1433|
|  1985|  03|  1301|
|  1985|  04|  1424|
+------+----+------+
only showing top 20 rows



In [91]:
Seeds.describe().show()

+-------+------------------+-----------------+------------------+
|summary|            Season|             Seed|            TeamID|
+-------+------------------+-----------------+------------------+
|  count|              2354|             2354|              2354|
|   mean|2002.7625318606626|8.620220900594733|1291.5747663551401|
| stddev|10.475078296853514|4.646193415404665|103.31555675390796|
|    min|              1985|               01|              1101|
|    max|              2021|               16|              1463|
+-------+------------------+-----------------+------------------+



In [116]:
Winners = TourneyCompact.withColumnRenamed("WTeamID","TeamID")
Winners =  Winners.drop("DayNum","WScore","WLoc","NumOT","LScore")
Winners = Winners.join(Seeds, ['Season','TeamID'])
Winners = Winners.withColumnRenamed("Seed","Team1Seed")
Winners = Winners.withColumnRenamed("TeamID","Team1")
Winners = Winners.withColumnRenamed("LTeamID","TeamID")
Winners = Winners.join(Seeds, ['Season','TeamID'])
Winners = Winners.withColumnRenamed("TeamID","Team2")
Winners = Winners.withColumnRenamed("Seed","Team2Seed")
Winners = Winners.withColumn("Result",lit(1))
print('winner')
Winners.show()

Losers = Winners.withColumn("Result",lit(0))
print('loser')
Losers.show()

winner
+------+-----+-----+---------+---------+------+
|Season|Team2|Team1|Team1Seed|Team2Seed|Result|
+------+-----+-----+---------+---------+------+
|  1985| 1385| 1207|       01|       01|     1|
|  1985| 1210| 1207|       01|       02|     1|
|  1985| 1260| 1207|       01|       04|     1|
|  1985| 1396| 1207|       01|       08|     1|
|  1985| 1250| 1207|       01|       16|     1|
|  1985| 1228| 1210|       02|       03|     1|
|  1985| 1393| 1210|       02|       07|     1|
|  1985| 1273| 1210|       02|       15|     1|
|  1985| 1208| 1228|       03|       06|     1|
|  1985| 1318| 1228|       03|       14|     1|
|  1985| 1374| 1260|       04|       05|     1|
|  1985| 1233| 1260|       04|       13|     1|
|  1985| 1330| 1374|       05|       12|     1|
|  1985| 1455| 1208|       06|       11|     1|
|  1985| 1177| 1393|       07|       10|     1|
|  1985| 1439| 1396|       08|       09|     1|
|  1985| 1301| 1385|       01|       03|     1|
|  1985| 1246| 1385|       01|   

In [117]:
TourneyInput = Winners.union(Losers)
TourneyInput.show()
TourneyInput.describe().show()

+------+-----+-----+---------+---------+------+
|Season|Team2|Team1|Team1Seed|Team2Seed|Result|
+------+-----+-----+---------+---------+------+
|  1985| 1385| 1207|       01|       01|     1|
|  1985| 1210| 1207|       01|       02|     1|
|  1985| 1260| 1207|       01|       04|     1|
|  1985| 1396| 1207|       01|       08|     1|
|  1985| 1250| 1207|       01|       16|     1|
|  1985| 1228| 1210|       02|       03|     1|
|  1985| 1393| 1210|       02|       07|     1|
|  1985| 1273| 1210|       02|       15|     1|
|  1985| 1208| 1228|       03|       06|     1|
|  1985| 1318| 1228|       03|       14|     1|
|  1985| 1374| 1260|       04|       05|     1|
|  1985| 1233| 1260|       04|       13|     1|
|  1985| 1330| 1374|       05|       12|     1|
|  1985| 1455| 1208|       06|       11|     1|
|  1985| 1177| 1393|       07|       10|     1|
|  1985| 1439| 1396|       08|       09|     1|
|  1985| 1301| 1385|       01|       03|     1|
|  1985| 1246| 1385|       01|       12|

# Make Final Input Dataframe

In [None]:
#combine tourney and season

# Model Building

In [None]:
spark.conf.set("spark.sql.execution.arrow.enabled","true")
df=spark.createDataFrame(outscores) 
df.printSchema()
df.show()

In [None]:
# Assemble all the features with VectorAssembler
required_features = ['WinRatio',
 'PointsPerGame',
 'PointsAllowedPerGame',
 'PointsRatio',
 'OTsPerGame',
 'FGPerGame',
 'FGRatio',
 'FGAPerGame',
 'FGAllowedPerGame',
 'FG3PerGame',
 'FG3Ratio',
 'FG3APerGame',
 'FG3AllowedPerGame',
 'FTPerGame',
 'FTRatio',
 'FTAPerGame',
 'FTllowedPerGame',
 'ORPerGame',
 'DRPerGame',
 'TRPerGame',
 'OppORPerGame',
 'OppDRPerGame',
 'OppTRPerGame',
 'ORRatio',
 'DRRatio',
 'TRRatio',
 'AstPerGame',
 'OppAstPerGame',
 'StlPerGame',
 'OppStlPerGame',
 'TOPerGame',
 'OppTOPerGame',
 'BlkPerGame',
 'OppBlkPerGame',
 'PFPerGame',
 'OppPFPerGame',
 'Seed']
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(df)

In [None]:
transformed_data.show()

In [None]:
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [None]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol='Result', maxIter=10, regParam=0.3, elasticNetParam=0.8)
M1 = lr.fit(training_data)
print("Coefficients: " + str(M1.coefficients))
print("Intercept: " + str(M1.intercept))

# TO DO

1.   Get 2022 seeds
2. Combine all regular season stats with Tournament input
3.   Break off 2022
4.   Make Model

