<a href="https://colab.research.google.com/github/andrewmarion/MarchMadness/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# March Madness
## Final Project
### Group 15: Andrew Marion, Dallas Hutchinson, Aydan Koyles


# Setting Up File

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

# Imports

In [6]:
import json
from pyspark.mllib.classification import LogisticRegressionModel,LogisticRegressionWithLBFGS, SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.clustering import *
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import LDA
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.functions import lit

# Read in csv file

In [7]:
RegularSeason = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv", inferSchema = True)
Seeds = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneySeeds.csv", inferSchema = True)
Tourney = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneyDetailedResults.csv", inferSchema = True)
Massey = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MMasseyOrdinals_thruDay128.csv", inferSchema = True)
Conferences = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneySeeds.csv", inferSchema = True)
Coaches = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MTeamCoaches.csv", inferSchema = True)
TourneyCompact = spark.read.option("header",True).csv("/content/gdrive/MyDrive/MDataFiles_Stage1/MNCAATourneyCompactResults.csv", inferSchema = True)


# Clean / Explore Data

## Season Input

In [8]:
WinTeams = RegularSeason.groupBy('Season', 'WTeamID').sum('WScore','LScore','NumOT','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst','WTO','WStl','WBlk','WPF','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst','LTO','LStl','LBlk','LPF')
LoseTeams = RegularSeason.groupBy('Season', 'LTeamID').sum('WScore','LScore','NumOT','WFGM','WFGA','WFGM3','WFGA3','WFTM','WFTA','WOR','WDR','WAst','WTO','WStl','WBlk','WPF','LFGM','LFGA','LFGM3','LFGA3','LFTM','LFTA','LOR','LDR','LAst','LTO','LStl','LBlk','LPF')

In [9]:
NumWin = RegularSeason.groupBy('Season', 'WTeamID').count()
NumWin = NumWin.withColumnRenamed("count", "Wins")
NumWin = NumWin.withColumnRenamed("WTeamID", "TeamID")

NumLose = RegularSeason.groupBy('Season', 'LTeamID').count()
NumLose = NumLose.withColumnRenamed("count", "Loses")
NumLose = NumLose.withColumnRenamed("LTeamID", "TeamID")

merged_df = NumWin.join(NumLose, ['Season','TeamID'])
merged_df = merged_df.withColumn('NumGames', merged_df.Wins + merged_df.Loses)

In [10]:
# Get all win stats for each team
WinTeams = WinTeams.withColumnRenamed("WTeamID", "TeamID")
WinTeams = WinTeams.withColumnRenamed("sum(WScore)", "WPTS")
WinTeams = WinTeams.withColumnRenamed("sum(LScore)", "WOppPTS")
WinTeams = WinTeams.withColumnRenamed("sum(NumOT)", "WOTs")
WinTeams = WinTeams.withColumnRenamed("sum(WFGM)", "WFGM")
WinTeams = WinTeams.withColumnRenamed("sum(WFGA)", "WFGA")
WinTeams = WinTeams.withColumnRenamed("sum(WFGA3)", "WFGA3")
WinTeams = WinTeams.withColumnRenamed("sum(WFGM3)", "WFGM3")
WinTeams = WinTeams.withColumnRenamed("sum(WFTA)", "WFTA")
WinTeams = WinTeams.withColumnRenamed("sum(WFTM)", "WFTM")
WinTeams = WinTeams.withColumnRenamed("sum(WOR)", "WOR")
WinTeams = WinTeams.withColumnRenamed("sum(WDR)", "WDR")
WinTeams = WinTeams.withColumnRenamed("sum(WAst)", "WAST")
WinTeams = WinTeams.withColumnRenamed("sum(WTO)", "WTO")
WinTeams = WinTeams.withColumnRenamed("sum(WStl)", "WSTL")
WinTeams = WinTeams.withColumnRenamed("sum(WBlk)", "WBLK")
WinTeams = WinTeams.withColumnRenamed("sum(WPF)", "WPF")
WinTeams = WinTeams.withColumnRenamed("sum(LFGM)", "WOppFGM")
WinTeams = WinTeams.withColumnRenamed("sum(LFGA)", "WOppFGA")
WinTeams = WinTeams.withColumnRenamed("sum(LFGA3)", "WOppFGA3")
WinTeams = WinTeams.withColumnRenamed("sum(LFGM3)", "WOppFGM3")
WinTeams = WinTeams.withColumnRenamed("sum(LFTA)", "WOppFTA")
WinTeams = WinTeams.withColumnRenamed("sum(LFTM)", "WOppFTM")
WinTeams = WinTeams.withColumnRenamed("sum(LOR)", "WOppOR")
WinTeams = WinTeams.withColumnRenamed("sum(LDR)", "WOppDR")
WinTeams = WinTeams.withColumnRenamed("sum(LAst)", "WOppAST")
WinTeams = WinTeams.withColumnRenamed("sum(LTO)", "WOppTO")
WinTeams = WinTeams.withColumnRenamed("sum(LStl)", "WOppSTL")
WinTeams = WinTeams.withColumnRenamed("sum(LBlk)", "WOppBLK")
WinTeams = WinTeams.withColumnRenamed("sum(LPF)", "WOppPF")

In [11]:
# Get all loss stats for each team
LoseTeams = LoseTeams.withColumnRenamed("LTeamID", "TeamID")
LoseTeams = LoseTeams.withColumnRenamed("sum(WScore)", "LOppPTS")
LoseTeams = LoseTeams.withColumnRenamed("sum(LScore)", "LPTS")
LoseTeams = LoseTeams.withColumnRenamed("sum(NumOT)", "LOTs")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGM)", "LOppFGM")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGA)", "LOppFGA")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGA3)", "LOppFGA3")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFGM3)", "LOppFGM3")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFTA)", "LOppFTA")
LoseTeams = LoseTeams.withColumnRenamed("sum(WFTM)", "LOppFTM")
LoseTeams = LoseTeams.withColumnRenamed("sum(WOR)", "LOppOR")
LoseTeams = LoseTeams.withColumnRenamed("sum(WDR)", "LOppDR")
LoseTeams = LoseTeams.withColumnRenamed("sum(WAst)", "LOppAST")
LoseTeams = LoseTeams.withColumnRenamed("sum(WTO)", "LOppTO")
LoseTeams = LoseTeams.withColumnRenamed("sum(WStl)", "LOppSTL")
LoseTeams = LoseTeams.withColumnRenamed("sum(WBlk)", "LOppBLK")
LoseTeams = LoseTeams.withColumnRenamed("sum(WPF)", "LOppPF")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGM)", "LFGM")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGA)", "LFGA")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGA3)", "LFGA3")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFGM3)", "LFGM3")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFTA)", "LFTA")
LoseTeams = LoseTeams.withColumnRenamed("sum(LFTM)", "LFTM")
LoseTeams = LoseTeams.withColumnRenamed("sum(LOR)", "LOR")
LoseTeams = LoseTeams.withColumnRenamed("sum(LDR)", "LDR")
LoseTeams = LoseTeams.withColumnRenamed("sum(LAst)", "LAST")
LoseTeams = LoseTeams.withColumnRenamed("sum(LTO)", "LTO")
LoseTeams = LoseTeams.withColumnRenamed("sum(LStl)", "LSTL")
LoseTeams = LoseTeams.withColumnRenamed("sum(LBlk)", "LBLK")
LoseTeams = LoseTeams.withColumnRenamed("sum(LPF)", "LPF")

In [12]:
#LoseTeams.filter(LoseTeams.Season == 2022).show()

In [13]:
# combine win teams and lose teams
Season = WinTeams.join(LoseTeams, ['Season','TeamID'])

In [14]:
# Combine coluns
Season = Season.withColumn("OppPTS", Season.WOppPTS + Season.LOppPTS) 
Season = Season.withColumn("PTS", Season.WPTS + Season.LPTS) 
Season = Season.withColumn("OTs", Season.WOTs + Season.LOTs) 
Season = Season.withColumn("OppFGM", Season.WOppFGM + Season.LOppFGM)
Season = Season.withColumn("OppFGA", Season.WOppFGA + Season.LOppFGA)
Season = Season.withColumn("OppFGA3", Season.WOppFGA3 + Season.LOppFGA3)
Season = Season.withColumn("OppFGM3", Season.WOppFGM3 + Season.LOppFGM3) 
Season = Season.withColumn("OppFTA", Season.WOppFTA + Season.LOppFTA) 
Season = Season.withColumn("OppFTM", Season.WOppFTM + Season.LOppFTM) 
Season = Season.withColumn("OppOR", Season.WOppOR + Season.LOppOR) 
Season = Season.withColumn("OppDR", Season.WOppDR + Season.LOppDR) 
Season = Season.withColumn("OppAST", Season.WOppAST + Season.LOppAST) 
Season = Season.withColumn("OppTO", Season.WOppTO + Season.LOppTO) 
Season = Season.withColumn("OppSTL", Season.WOppSTL + Season.LOppSTL) 
Season = Season.withColumn("OppBLK", Season.WOppBLK + Season.LOppBLK) 
Season = Season.withColumn("OppPF", Season.WOppPF + Season.LOppPF) 
Season = Season.withColumn("FGM", Season.WFGM + Season.LFGM) 
Season = Season.withColumn("FGA", Season.WFGA + Season.LFGA) 
Season = Season.withColumn("FGA3", Season.WFGA3 + Season.LFGA3) 
Season = Season.withColumn("FGM3", Season.WFGM3 + Season.LFGM3) 
Season = Season.withColumn("FTA", Season.WFTA + Season.LFTA) 
Season = Season.withColumn("FTM", Season.WFTM + Season.LFTM) 
Season = Season.withColumn("OR", Season.WOR + Season.LOR) 
Season = Season.withColumn("DR", Season.WDR + Season.LDR) 
Season = Season.withColumn("AST", Season.WAST + Season.LAST)
Season = Season.withColumn("TO", Season.WTO + Season.LTO) 
Season = Season.withColumn("STL", Season.WSTL + Season.LSTL)
Season = Season.withColumn("BLK", Season.WBLK + Season.LBLK)
Season = Season.withColumn("PF", Season.WPF + Season.LPF)

# Drop Unwanted Columns
Season =  Season.drop("WPTS","WOppPTS","WOTs","WFGM","WFGA","WFGM3","WFGA3","WFTM","WFTA","WOR","WDR","WAST","WTO","WSTL","WBLK","WPF","WOppFGM","WOppFGA","WOppFGM3","WOppFGA3","WOppFTM","WOppFTA","WOppOR","WOppDR","WOppAST","WOppTO","WOppSTL","WOppBLK","WOppPF","LOppPTS","LPTS","LOTs","LOppFGM","LOppFGA","LOppFGM3","LOppFGA3","LOppFTM","LOppFTA","LOppOR","LOppDR","LOppAST","LOppTO","LOppSTL","LOppBLK","LOppPF","LFGM","LFGA","LFGM3","LFGA3","LFTM","LFTA","LOR","LDR","LAST","LTO","LSTL","LBLK","LPF")

In [15]:
# Combine merged and season
Season_Stats = Season.join(merged_df, ['Season','TeamID'])

Make season stats for input

In [16]:
#Wins, PPG, PPG Allowed, Points Ratio, OTs
RegularSeasonInput = Season_Stats.withColumn("WinRatio", Season_Stats.Wins /  Season_Stats.NumGames) 
RegularSeasonInput = RegularSeasonInput.withColumn("PointsPerGame", Season_Stats.PTS /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("PointsAllowedPerGame", Season_Stats.OppPTS /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("PointsRatio", Season_Stats.PTS /  Season_Stats.OppPTS)
RegularSeasonInput = RegularSeasonInput.withColumn("OTsPerGame", Season_Stats.OTs /  Season_Stats.NumGames)
#Field Goals
RegularSeasonInput = RegularSeasonInput.withColumn("FGPerGame", Season_Stats.FGM /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FGRatio", Season_Stats.FGM /  Season_Stats.FGA)
RegularSeasonInput = RegularSeasonInput.withColumn("FGAPerGame", Season_Stats.FGA /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FGAllowedPerGame", Season_Stats.OppFGM /  Season_Stats.NumGames)
#Three Pointers
RegularSeasonInput = RegularSeasonInput.withColumn("FG3PerGame", Season_Stats.FGM3 /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FG3Ratio", Season_Stats.FGM3 /  Season_Stats.FGA3)
RegularSeasonInput = RegularSeasonInput.withColumn("FG3APerGame", Season_Stats.FGA3 /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FG3AllowedPerGame", Season_Stats.OppFGM3 /  Season_Stats.NumGames)
#Free Throws
RegularSeasonInput = RegularSeasonInput.withColumn("FTPerGame", Season_Stats.FTM /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FTRatio", Season_Stats.FTM /  Season_Stats.FTA)
RegularSeasonInput = RegularSeasonInput.withColumn("FTAPerGame", Season_Stats.FTA /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("FTllowedPerGame", Season_Stats.OppFTM /  Season_Stats.NumGames)
#Rebounds
RegularSeasonInput = RegularSeasonInput.withColumn("ORPerGame", Season_Stats.OR /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("DRPerGame", Season_Stats.DR /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("TRPerGame", (Season_Stats.OR+ Season_Stats.DR) /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppORPerGame", Season_Stats.OppOR /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppDRPerGame", Season_Stats.OppDR /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppTRPerGame", (Season_Stats.OppOR +  Season_Stats.OppDR) / Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("ORRatio", Season_Stats.OR /  Season_Stats.OppOR)
RegularSeasonInput = RegularSeasonInput.withColumn("DRRatio", Season_Stats.DR /  Season_Stats.OppDR)
RegularSeasonInput = RegularSeasonInput.withColumn("TRRatio", (Season_Stats.OR +  Season_Stats.DR) / (Season_Stats.OppOR +  Season_Stats.OppDR))
#Assists
RegularSeasonInput = RegularSeasonInput.withColumn("AstPerGame", Season_Stats.AST /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppAstPerGame", Season_Stats.OppAST /  Season_Stats.NumGames)
#Steals
RegularSeasonInput = RegularSeasonInput.withColumn("StlPerGame", Season_Stats.STL /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppStlPerGame", Season_Stats.OppSTL /  Season_Stats.NumGames)
#Turnovers
RegularSeasonInput = RegularSeasonInput.withColumn("TOPerGame", Season_Stats.TO / Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppTOPerGame", Season_Stats.OppTO / Season_Stats.NumGames)
#Blocks
RegularSeasonInput = RegularSeasonInput.withColumn("BlkPerGame", Season_Stats.BLK / Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppBlkPerGame", Season_Stats.OppBLK /  Season_Stats.NumGames)
#Personal Fouls
RegularSeasonInput = RegularSeasonInput.withColumn("PFPerGame", Season_Stats.PF /  Season_Stats.NumGames)
RegularSeasonInput = RegularSeasonInput.withColumn("OppPFPerGame", Season_Stats.OppPF /  Season_Stats.NumGames)

#drop unwanted columns
RegularSeasonInput =  RegularSeasonInput.drop("PTS","OppPTS","OTs","FGM","FGA","FGM3","FGA3","FTM","FTA","OR","DR","AST","TO","STL","BLK","PF","OppFGM","OppFGA","OppFGM3","OppFGA3","OppFTM","OppFTA","OppOR","OppDR","OppAST","OppTO","OppSTL","OppBLK","OppPF")
#show
RegularSeasonInput.show()

+------+------+----+-----+--------+-------------------+------------------+--------------------+------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|Season|TeamID|Wins|Loses|NumGames|           WinRatio|     PointsPerGame|PointsAllowedPerGame|       PointsRatio|          OTsPerGame|         FGPerGame|            FGRatio|        FGAPerGame|  FGAllowedPerGame|        FG3PerGame|           FG3Ratio|       FG3APerGame| FG

In [17]:
RegularSeasonInput.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+-------------------+-----------------+--------------------+------------------+-------------------+------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|            Season|            TeamID|              Wins|             Loses|          NumGames|           WinRatio|    PointsPerGame|PointsAllowedPerGame|       PointsRatio|         OTsPerGam

## Tourney Input

In [18]:
Seeds = Seeds.withColumn('Seed', Seeds.Seed.substr(2,2))

In [19]:
Winners = TourneyCompact.withColumnRenamed("WTeamID","TeamID")
Winners =  Winners.drop("DayNum","WScore","WLoc","NumOT","LScore")
Winners = Winners.join(Seeds, ['Season','TeamID'])
Winners = Winners.withColumnRenamed("Seed","Team1Seed")
Winners = Winners.withColumnRenamed("TeamID","Team1")
Winners = Winners.withColumnRenamed("LTeamID","TeamID")
Winners = Winners.join(Seeds, ['Season','TeamID'])
Winners = Winners.withColumnRenamed("TeamID","Team2")
Winners = Winners.withColumnRenamed("Seed","Team2Seed")
Winners = Winners.withColumn("Result",lit(1))

Losers = TourneyCompact.withColumnRenamed("LTeamID","TeamID")
Losers = Losers.drop("DayNum","WScore","WLoc","NumOT","LScore")
Losers = Losers.join(Seeds, ['Season','TeamID'])
Losers = Losers.withColumnRenamed("Seed","Team1Seed")
Losers = Losers.withColumnRenamed("TeamID","Team1")
Losers = Losers.withColumnRenamed("WTeamID","TeamID")
Losers = Losers.join(Seeds, ['Season','TeamID'])
Losers = Losers.withColumnRenamed("TeamID","Team2")
Losers = Losers.withColumnRenamed("Seed","Team2Seed")
Losers = Losers.withColumn("Result",lit(0))


In [20]:
TourneyInput = Winners.union(Losers)

In [21]:
TourneyInput.filter(TourneyInput.Season == 2022).show()

+------+-----+-----+---------+---------+------+
|Season|Team2|Team1|Team1Seed|Team2Seed|Result|
+------+-----+-----+---------+---------+------+
|  2022| 1313| 1124|       01|       16|     1|
|  2022| 1389| 1246|       02|       15|     1|
|  2022| 1463| 1345|       03|       14|     1|
|  2022| 1103| 1417|       04|       13|     1|
|  2022| 1231| 1388|       05|       12|     1|
|  2022| 1362| 1293|       07|       10|     1|
|  2022| 1266| 1314|       08|       09|     1|
|  2022| 1209| 1211|       01|       16|     1|
|  2022| 1168| 1181|       02|       15|     1|
|  2022| 1439| 1403|       03|       11|     1|
|  2022| 1286| 1403|       03|       14|     1|
|  2022| 1436| 1116|       04|       13|     1|
|  2022| 1308| 1163|       05|       12|     1|
|  2022| 1323| 1104|       06|       11|     1|
|  2022| 1172| 1277|       07|       10|     1|
|  2022| 1272| 1129|       08|       09|     1|
|  2022| 1411| 1242|       01|       16|     1|
|  2022| 1240| 1120|       02|       15|

# Make Final Input Dataframe

In [22]:
RegularSeasonInput.columns
# Team 1
outscores1 = RegularSeasonInput.withColumnRenamed('TeamID','Team1')
outscores1 = outscores1.join(TourneyInput, ['Season','Team1'])
outscores1 = outscores1.withColumnRenamed('Wins','Team1Wins')
outscores1 = outscores1.withColumnRenamed('Loses','Team1Loses')
outscores1 = outscores1.withColumnRenamed('NumGames','Team1NumGames')
outscores1 = outscores1.withColumnRenamed('WinRatio','Team1WinRatio')
outscores1 = outscores1.withColumnRenamed('PointsPerGame','Team1PointsPerGame')
outscores1 = outscores1.withColumnRenamed('PointsAllowedPerGame','Team1PointsAllowedPerGame')
outscores1 = outscores1.withColumnRenamed('PointsRatio','Team1PointsRatio')
outscores1 = outscores1.withColumnRenamed('OTsPerGame','Team1OTsPerGame')
outscores1 = outscores1.withColumnRenamed('FGPerGame','Team1FGPerGame')
outscores1 = outscores1.withColumnRenamed('FGRatio','Team1FGRatio')
outscores1 = outscores1.withColumnRenamed('FGAPerGame','Team1FGAPerGame')
outscores1 = outscores1.withColumnRenamed('FGAllowedPerGame','Team1FGAllowedPerGame')
outscores1 = outscores1.withColumnRenamed('FG3PerGame','Team1FG3PerGame')
outscores1 = outscores1.withColumnRenamed('FG3Ratio','Team1FG3Ratio')
outscores1 = outscores1.withColumnRenamed('FG3APerGame','Team1FG3APerGame')
outscores1 = outscores1.withColumnRenamed('FG3AllowedPerGame','Team1FG3AllowedPerGame')
outscores1 = outscores1.withColumnRenamed('FTPerGame','Team1FTPerGame')
outscores1 = outscores1.withColumnRenamed('FTRatio','Team1FTRatio')
outscores1 = outscores1.withColumnRenamed('FTAPerGame','Team1FTAPerGame')
outscores1 = outscores1.withColumnRenamed('FTllowedPerGame','Team1FTAllowedPerGame')
outscores1 = outscores1.withColumnRenamed('ORPerGame','Team1ORPerGame')
outscores1 = outscores1.withColumnRenamed('DRPerGame','Team1DRPerGame')
outscores1 = outscores1.withColumnRenamed('TRPerGame','Team1TRPerGame')
outscores1 = outscores1.withColumnRenamed('OppORPerGame','Team1OppORPerGame')
outscores1 = outscores1.withColumnRenamed('OppDRPerGame','Team1OppDRPerGame')
outscores1 = outscores1.withColumnRenamed('OppTRPerGame','Team1OppTRPerGame')
outscores1 = outscores1.withColumnRenamed('ORRatio','Team1ORRatio')
outscores1 = outscores1.withColumnRenamed('DRRatio','Team1DRRatio')
outscores1 = outscores1.withColumnRenamed('TRRatio','Team1TRRatio')
outscores1 = outscores1.withColumnRenamed('AstPerGame','Team1AstPerGame')
outscores1 = outscores1.withColumnRenamed('OppAstPerGame','Team1OppAstPerGame')
outscores1 = outscores1.withColumnRenamed('StlPerGame','Team1StlPerGame')
outscores1 = outscores1.withColumnRenamed('OppStlPerGame','Team1OppStlPerGame')
outscores1 = outscores1.withColumnRenamed('TOPerGame','Team1TOPerGame')
outscores1 = outscores1.withColumnRenamed('OppTOPerGame','Team1OppTOPerGame')
outscores1 = outscores1.withColumnRenamed('BlkPerGame','Team1BlkPerGame')
outscores1 = outscores1.withColumnRenamed('OppBlkPerGame','Team1OppBlkPerGame')
outscores1 = outscores1.withColumnRenamed('PFPerGame','Team1PFPerGame')
outscores1 = outscores1.withColumnRenamed('OppPFPerGame','Team1OppPFPerGame')

In [23]:
outscores2 = RegularSeasonInput.withColumnRenamed('TeamID','Team2')
outscores2 = outscores2.join(TourneyInput, ['Season','Team2'])

#rename columns
outscores2 = outscores2.withColumnRenamed('Wins','Team2Wins')
outscores2 = outscores2.withColumnRenamed('Loses','Team2Loses')
outscores2 = outscores2.withColumnRenamed('NumGames','Team2NumGames')
outscores2 = outscores2.withColumnRenamed('WinRatio','Team2WinRatio')
outscores2 = outscores2.withColumnRenamed('PointsPerGame','Team2PointsPerGame')
outscores2 = outscores2.withColumnRenamed('PointsAllowedPerGame','Team2PointsAllowedPerGame')
outscores2 = outscores2.withColumnRenamed('PointsRatio','Team2PointsRatio')
outscores2 = outscores2.withColumnRenamed('OTsPerGame','Team2OTsPerGame')
outscores2 = outscores2.withColumnRenamed('FGPerGame','Team2FGPerGame')
outscores2 = outscores2.withColumnRenamed('FGRatio','Team2FGRatio')
outscores2 = outscores2.withColumnRenamed('FGAPerGame','Team2FGAPerGame')
outscores2 = outscores2.withColumnRenamed('FGAllowedPerGame','Team2FGAllowedPerGame')
outscores2 = outscores2.withColumnRenamed('FG3PerGame','Team2FG3PerGame')
outscores2 = outscores2.withColumnRenamed('FG3Ratio','Team2FG3Ratio')
outscores2 = outscores2.withColumnRenamed('FG3APerGame','Team2FG3APerGame')
outscores2 = outscores2.withColumnRenamed('FG3AllowedPerGame','Team2FG3AllowedPerGame')
outscores2 = outscores2.withColumnRenamed('FTPerGame','Team2FTPerGame')
outscores2 = outscores2.withColumnRenamed('FTRatio','Team2FTRatio')
outscores2 = outscores2.withColumnRenamed('FTAPerGame','Team2FTAPerGame')
outscores2 = outscores2.withColumnRenamed('FTllowedPerGame','Team2FTAllowedPerGame')
outscores2 = outscores2.withColumnRenamed('ORPerGame','Team2ORPerGame')
outscores2 = outscores2.withColumnRenamed('DRPerGame','Team2DRPerGame')
outscores2 = outscores2.withColumnRenamed('TRPerGame','Team2TRPerGame')
outscores2 = outscores2.withColumnRenamed('OppORPerGame','Team2OppORPerGame')
outscores2 = outscores2.withColumnRenamed('OppDRPerGame','Team2OppDRPerGame')
outscores2 = outscores2.withColumnRenamed('OppTRPerGame','Team2OppTRPerGame')
outscores2 = outscores2.withColumnRenamed('ORRatio','Team2ORRatio')
outscores2 = outscores2.withColumnRenamed('DRRatio','Team2DRRatio')
outscores2 = outscores2.withColumnRenamed('TRRatio','Team2TRRatio')
outscores2 = outscores2.withColumnRenamed('AstPerGame','Team2AstPerGame')
outscores2 = outscores2.withColumnRenamed('OppAstPerGame','Team2OppAstPerGame')
outscores2 = outscores2.withColumnRenamed('StlPerGame','Team2StlPerGame')
outscores2 = outscores2.withColumnRenamed('OppStlPerGame','Team2OppStlPerGame')
outscores2 = outscores2.withColumnRenamed('TOPerGame','Team2TOPerGame')
outscores2 = outscores2.withColumnRenamed('OppTOPerGame','Team2OppTOPerGame')
outscores2 = outscores2.withColumnRenamed('BlkPerGame','Team2BlkPerGame')
outscores2 = outscores2.withColumnRenamed('OppBlkPerGame','Team2OppBlkPerGame')
outscores2 = outscores2.withColumnRenamed('PFPerGame','Team2PFPerGame')
outscores2 = outscores2.withColumnRenamed('OppPFPerGame','Team2OppPFPerGame')

print(outscores1.count())
print(outscores2.count())

2413
2413


In [24]:
outscores = outscores2.join(outscores1, ['Season','Team1','Team2','Result','Team1Seed','Team2Seed'])
outscores.show()

+------+-----+-----+------+---------+---------+---------+----------+-------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+-------------------+------------------+---------------------+------------------+-------------------+------------------+----------------------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------+----------+-------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+-------------------+------------------+----------------

In [25]:
notin = outscores2.join(outscores1, ['Season','Team1','Team2'],"left_anti")
notin.show()
print(notin.count())

notin = outscores1.join(outscores2, ['Season','Team1','Team2'],"left_anti")
notin.show()
print(notin.count())
# combine on season, team 1, team 2 and result

+------+-----+-----+---------+----------+-------------+-------------------+------------------+-------------------------+------------------+--------------------+------------------+-------------------+------------------+---------------------+------------------+-------------------+------------------+----------------------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------+---------+------+
|Season|Team1|Team2|Team2Wins|Team2Loses|Team2NumGames|      Team2WinRatio|Team2PointsPerGame|Team2PointsAllowedPerGame|  Team2PointsRatio|     Team2OTsPerGame|    Team2FGPerGame|       Team2FGRatio|   Team2FGAPe

# Model Building

In [26]:
outscores = outscores.withColumn('Wins',outscores.Team1Wins - outscores.Team2Wins)
outscores = outscores.withColumn('Loses',outscores.Team1Loses - outscores.Team2Loses)
outscores = outscores.withColumn('NumGames',outscores.Team1NumGames - outscores.Team2NumGames)
outscores = outscores.withColumn('WinRatio',outscores.Team1WinRatio - outscores.Team2WinRatio)
outscores = outscores.withColumn('PointsPerGame',outscores.Team1PointsPerGame - outscores.Team2PointsPerGame)
outscores = outscores.withColumn('PointsAllowedPerGame',outscores.Team1PointsAllowedPerGame - outscores.Team2PointsAllowedPerGame)
outscores = outscores.withColumn('PointsRatio',outscores.Team1PointsRatio - outscores.Team2PointsRatio)
outscores = outscores.withColumn('OTsPerGame',outscores.Team1OTsPerGame - outscores.Team2OTsPerGame)
outscores = outscores.withColumn('FGPerGame',outscores.Team1FGPerGame - outscores.Team2FGPerGame)
outscores = outscores.withColumn('FGRatio',outscores.Team1FGRatio - outscores.Team2FGRatio)
outscores = outscores.withColumn('FGAPerGame',outscores.Team1FGAPerGame - outscores.Team2FGAPerGame)
outscores = outscores.withColumn('FGAllowedPerGame',outscores.Team1FGAllowedPerGame - outscores.Team2FGAllowedPerGame)
outscores = outscores.withColumn('FG3PerGame',outscores.Team1FG3PerGame - outscores.Team2FG3PerGame)
outscores = outscores.withColumn('FG3Ratio',outscores.Team1FG3Ratio - outscores.Team2FG3Ratio)
outscores = outscores.withColumn('FG3APerGame',outscores.Team1FG3APerGame - outscores.Team2FG3APerGame)
outscores = outscores.withColumn('FG3AllowedPerGame',outscores.Team1FG3AllowedPerGame - outscores.Team2FG3AllowedPerGame)
outscores = outscores.withColumn('FTPerGame',outscores.Team1FTPerGame - outscores.Team2FTPerGame)
outscores = outscores.withColumn('FTRatio',outscores.Team1FTRatio - outscores.Team2FTRatio)
outscores = outscores.withColumn('FTAPerGame',outscores.Team1FTAPerGame - outscores.Team2FTAPerGame)
outscores = outscores.withColumn('FTAllowedPerGame',outscores.Team1FTAllowedPerGame - outscores.Team2FTAllowedPerGame)
outscores = outscores.withColumn('ORPerGame',outscores.Team1ORPerGame - outscores.Team2ORPerGame)
outscores = outscores.withColumn('DRPerGame',outscores.Team1DRPerGame - outscores.Team2DRPerGame)
outscores = outscores.withColumn('TRPerGame',outscores.Team1TRPerGame - outscores.Team2TRPerGame)
outscores = outscores.withColumn('OppORPerGame',outscores.Team1OppORPerGame - outscores.Team2OppORPerGame)
outscores = outscores.withColumn('OppDRPerGame',outscores.Team1OppDRPerGame - outscores.Team2OppDRPerGame)
outscores = outscores.withColumn('OppTRPerGame',outscores.Team1OppTRPerGame - outscores.Team2OppTRPerGame)
outscores = outscores.withColumn('ORRatio',outscores.Team1ORRatio - outscores.Team2ORRatio)
outscores = outscores.withColumn('DRRatio',outscores.Team1DRRatio - outscores.Team2DRRatio)
outscores = outscores.withColumn('TRRatio',outscores.Team1TRRatio - outscores.Team2TRRatio)
outscores = outscores.withColumn('AstPerGame',outscores.Team1AstPerGame - outscores.Team2AstPerGame)
outscores = outscores.withColumn('OppAstPerGame',outscores.Team1OppAstPerGame - outscores.Team2OppAstPerGame)
outscores = outscores.withColumn('StlPerGame',outscores.Team1StlPerGame - outscores.Team2StlPerGame)
outscores = outscores.withColumn('OppStlPerGame',outscores.Team1OppStlPerGame - outscores.Team2OppStlPerGame)
outscores = outscores.withColumn('TOPerGame',outscores.Team1TOPerGame - outscores.Team2TOPerGame)
outscores = outscores.withColumn('OppTOPerGame',outscores.Team1OppTOPerGame - outscores.Team2OppTOPerGame)
outscores = outscores.withColumn('BlkPerGame',outscores.Team1BlkPerGame - outscores.Team2BlkPerGame)
outscores = outscores.withColumn('OppBlkPerGame',outscores.Team1OppBlkPerGame - outscores.Team2OppBlkPerGame)
outscores = outscores.withColumn('PFPerGame',outscores.Team1PFPerGame - outscores.Team2PFPerGame)
outscores = outscores.withColumn('OppPFPerGame',outscores.Team1OppPFPerGame - outscores.Team2OppPFPerGame)
outscores = outscores.withColumn('Seed',outscores.Team1Seed - outscores.Team2Seed)

outscores.show()

+------+-----+-----+------+---------+---------+---------+----------+-------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+-------------------+------------------+---------------------+------------------+-------------------+------------------+----------------------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------+----------+-------------+------------------+------------------+-------------------------+------------------+--------------------+------------------+-------------------+------------------+----------------

In [27]:
outscores =  outscores.drop("Team1OppPFPerGame", "Team2OppPFPerGame",
                            'Team1PFPerGame','Team2PFPerGame',
                            "Team1Wins","Team2Wins",
                            "Team1Loses","Team2Loses",
                            "Team1NumGames","Team2NumGames",
                            "Team1WinRatio","Team2WinRatio",
                            "Team1PointsPerGame","Team2PointsPerGame",
                            "Team1PointsAllowedPerGame","Team2PointsAllowedPerGame",
                            "Team1PointsRatio","Team2PointsRatio",
                            "Team1OTsPerGame","Team2OTsPerGame",
                            "Team1FGPerGame","Team2FGPerGame",
                            "Team1FGRatio","Team2FGRatio",
                            "Team1FGAPerGame","Team2FGAPerGame",
                            "Team1FGAllowedPerGame","Team2FGAllowedPerGame",
                            "Team1FG3PerGame","Team2FG3PerGame",
                            "Team1FG3Ratio","Team2FG3Ratio",
                            "Team1FG3APerGame","Team2FG3APerGame",
                            "Team1FG3AllowedPerGame","Team2FG3AllowedPerGame",
                            "Team1FTPerGame","Team2FTPerGame",
                            "Team1FTRatio","Team2FTRatio",
                            "Team1FTAPerGame","Team2FTAPerGame",
                            "Team1FTAllowedPerGame","Team2FTAllowedPerGame",
                            "Team1ORPerGame","Team2ORPerGame",
                            "Team1DRPerGame","Team2DRPerGame",
                            "Team1TRPerGame","Team2TRPerGame",
                            "Team1OppORPerGame","Team2OppORPerGame",
                            'Team1OppDRPerGame','Team2OppDRPerGame',
                            'Team1OppTRPerGame','Team2OppTRPerGame',
                            'Team1ORRatio','Team2ORRatio',
                            'Team1DRRatio','Team2DRRatio',
                            'Team1TRRatio','Team2TRRatio',
                            'Team1AstPerGame','Team2AstPerGame',
                            'Team1OppAstPerGame','Team2OppAstPerGame',
                            'Team1StlPerGame','Team2StlPerGame',
                            'Team1OppStlPerGame','Team2OppStlPerGame',
                            'Team1TOPerGame','Team2TOPerGame',
                            'Team1OppTOPerGame','Team2OppTOPerGame',
                            'Team1BlkPerGame','Team2BlkPerGame',
                            'Team1OppBlkPerGame','Team2OppBlkPerGame',
                            'Team1PFPerGame','Team2PFPerGame',
                            'Team1OppPFPerGame','Team2OppPFPerGame'
                            'Team1Seed','Team2Seed')
outscores2022 = outscores.filter(outscores.Season == 2022)
outscores2022.show()
outscores = outscores.filter(outscores.Season < 2022)
outscores.show()

+------+-----+-----+------+---------+----+-----+--------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-----+
|Season|Team1|Team2|Result|Team1Seed|Wins|Loses|NumGames|            WinRatio|      PointsPerGame|PointsAllowedPerGame|         PointsRatio|          OTsPerGame|           FGPerGame|        

In [28]:
# Assemble all the features with VectorAssembler
required_features = ['WinRatio',
 'PointsPerGame',
 'PointsAllowedPerGame',
 'PointsRatio',
 'OTsPerGame',
 'FGPerGame',
 'FGRatio',
 'FGAPerGame',
 'FGAllowedPerGame',
 'FG3PerGame',
 'FG3Ratio',
 'FG3APerGame',
 'FG3AllowedPerGame',
 'FTPerGame',
 'FTRatio',
 'FTAPerGame',
 'FTAllowedPerGame',
 'ORPerGame',
 'DRPerGame',
 'TRPerGame',
 'OppORPerGame',
 'OppDRPerGame',
 'OppTRPerGame',
 'ORRatio',
 'DRRatio',
 'TRRatio',
 'AstPerGame',
 'OppAstPerGame',
 'StlPerGame',
 'OppStlPerGame',
 'TOPerGame',
 'OppTOPerGame',
 'BlkPerGame',
 'OppBlkPerGame',
 'PFPerGame',
 'OppPFPerGame',
 'Seed']
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(outscores)

In [29]:
transformed_data.show()

+------+-----+-----+------+---------+----+-----+--------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-----+--------------------+
|Season|Team1|Team2|Result|Team1Seed|Wins|Loses|NumGames|            WinRatio|      PointsPerGame|PointsAllowedPerGame|         PointsRatio|          OTsPerGame|        

In [30]:
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [31]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol='Result', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(training_data)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (37,[],[])
Intercept: -0.016278167359688487


In [32]:
trainingSummary = lrModel.summary
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 37 CSRMatrix

Intercept: [-0.016278167359688487]
objectiveHistory:
0.6931140593154237
False positive rate by label:
label 0: 1.0
label 1: 0.0
True positive rate by label:
label 0: 1.0
label 1: 0.0
Precision by label:
label 0: 0.5040694519804666
label 1: 0.0
Recall by label:
label 0: 1.0
label 1: 0.0
F-measure by label:
label 0: 0.6702741702741702
label 1: 0.0
Accuracy: 0.5040694519804666
FPR: 0.5040694519804666
TPR: 0.5040694519804666
F-measure: 0.337864733686763
Precision: 0.25408601241988793
Recall: 0.5040694519804666


# Predict 2022 Season

In [33]:
outscores2022.show()

+------+-----+-----+------+---------+----+-----+--------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-----+
|Season|Team1|Team2|Result|Team1Seed|Wins|Loses|NumGames|            WinRatio|      PointsPerGame|PointsAllowedPerGame|         PointsRatio|          OTsPerGame|           FGPerGame|        

In [34]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
test_data = assembler.transform(outscores2022)
predictions = lrModel.transform(test_data)
predictions.select("prediction","features").show()

+----------+--------------------+
|prediction|            features|
+----------+--------------------+
|       0.0|[0.07157258064516...|
|       0.0|[-0.1885775862068...|
|       0.0|[0.04166666666666...|
|       0.0|[-0.0416666666666...|
|       0.0|[0.07815126050420...|
|       0.0|[-0.0781512605042...|
|       0.0|[-0.1654411764705...|
|       0.0|[-0.2791991101223...|
|       0.0|[-0.0861742424242...|
|       0.0|[0.11436950146627...|
|       0.0|[0.16544117647058...|
|       0.0|[-0.1683569979716...|
|       0.0|[-0.2235294117647...|
|       0.0|[-0.1143695014662...|
|       0.0|[0.16835699797160...|
|       0.0|[0.08617424242424...|
|       0.0|[-0.0416666666666...|
|       0.0|[0.29655172413793...|
|       0.0|[-0.1408602150537...|
|       0.0|[0.16718913270637...|
+----------+--------------------+
only showing top 20 rows



In [35]:
predictions.select("prediction","features").show(64)

+----------+--------------------+
|prediction|            features|
+----------+--------------------+
|       0.0|[0.07157258064516...|
|       0.0|[-0.1885775862068...|
|       0.0|[0.04166666666666...|
|       0.0|[-0.0416666666666...|
|       0.0|[0.07815126050420...|
|       0.0|[-0.0781512605042...|
|       0.0|[-0.1654411764705...|
|       0.0|[-0.2791991101223...|
|       0.0|[-0.0861742424242...|
|       0.0|[0.11436950146627...|
|       0.0|[0.16544117647058...|
|       0.0|[-0.1683569979716...|
|       0.0|[-0.2235294117647...|
|       0.0|[-0.1143695014662...|
|       0.0|[0.16835699797160...|
|       0.0|[0.08617424242424...|
|       0.0|[-0.0416666666666...|
|       0.0|[0.29655172413793...|
|       0.0|[-0.1408602150537...|
|       0.0|[0.16718913270637...|
|       0.0|[0.10752688172043...|
|       0.0|[-0.2965517241379...|
|       0.0|[-0.0909090909090...|
|       0.0|[0.03472222222222...|
|       0.0|[0.18857758620689...|
|       0.0|[0.14086021505376...|
|       0.0|[-

# TO DO

1.   Get 2022 seeds
2.   Break off 2022
3.   Make Model

