<a href="https://colab.research.google.com/github/brendangoldz/HW4/blob/main/randomforrest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [2]:
# Pinned to the version this notebook was last run against so reruns resolve the same package;
# %pip installs into the environment of the running kernel.
%pip install sportsipy==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sportsipy
  Downloading sportsipy-0.6.0-py3-none-any.whl (499 kB)
[K     |████████████████████████████████| 499 kB 5.2 MB/s 
[?25hCollecting pyquery>=1.4.0
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting cssselect>0.7.9
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: cssselect, pyquery, sportsipy
Successfully installed cssselect-1.1.0 pyquery-1.4.3 sportsipy-0.6.0


In [3]:
from sportsipy.ncaab.teams import Team
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [4]:
#common ml methods
def zscore(X):
    """Standardize each column of ``X`` to zero mean and unit sample standard deviation.

    The input is never modified: a float copy is standardized and returned, so
    integer inputs are not truncated and callers keep their raw data.

    Args:
        X: 2-D array-like, one row per sample and one column per feature.

    Returns:
        A tuple ``(Z, means, stds)`` where ``Z`` is the standardized float array and
        ``means`` / ``stds`` are per-column lists (std uses ``ddof=1``). A column
        whose std is zero or undefined (constant column, single row) reports a
        std of 1.0 so it maps to zeros instead of NaN/inf.
    """
    Z = np.array(X, dtype=float)  # np.array copies, so the caller's array is untouched
    means = Z.mean(axis=0)

    if Z.shape[0] > 1:
        stds = Z.std(axis=0, ddof=1)
    else:
        # The sample std is undefined for fewer than two rows.
        stds = np.ones(Z.shape[1])
    # Guard the division below against constant or otherwise degenerate columns.
    stds[~np.isfinite(stds) | (stds == 0)] = 1.0

    Z = (Z - means) / stds
    return Z, means.tolist(), stds.tolist()

def unzscore(X,means,stds):
    """Undo a z-score transform: scale ``X`` by ``stds`` and then shift it by ``means``."""
    rescaled = X * stds
    return rescaled + means


def getXY(data):
    """Split a 2-D array into its feature columns (all but the last) and its last column."""
    return data[:, :-1], data[:, -1]

In [5]:
#metrics
def calculate_metrics(pred, actual):
    """Print accuracy, precision, recall and F-measure (all as percentages).

    Args:
        pred: array-like of scores or labels; values above 0.5 count as class 1.
        actual: array-like of true labels (0 or 1), aligned with ``pred``.

    A metric whose denominator is zero (e.g. precision when nothing is predicted
    positive) is reported as 0.0 instead of NaN.
    """
    pred = np.where(np.asarray(pred) > .5, 1, 0)
    actual = np.asarray(actual)

    # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
    TP = np.sum(np.logical_and(pred == 1, actual == 1))

    # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
    TN = np.sum(np.logical_and(pred == 0, actual == 0))

    # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
    FP = np.sum(np.logical_and(pred == 1, actual == 0))

    # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
    FN = np.sum(np.logical_and(pred == 0, actual == 1))

    accuracy(TP, TN, FP, FN)
    precision = (TP / (TP + FP)) * 100 if (TP + FP) > 0 else 0.0
    print("\tPrecision: {}".format(precision))
    recall = (TP / (TP + FN)) * 100 if (TP + FN) > 0 else 0.0
    print("\tRecall: {}".format(recall))
    if (precision + recall) > 0:
        f_measure = (2 * precision * recall) / (precision + recall)
    else:
        # Both precision and recall are zero, so their harmonic mean is zero too.
        f_measure = 0.0
    print("\tF-Measure: {}".format(f_measure))
    print("")


def calculate_metrics_with_threshold(pred, actual):
    """Compute precision and recall at decision thresholds 0.0, 0.1, ..., 1.0.

    Args:
        pred: array-like of scores; a sample is labelled 1 when its score is >= the threshold.
        actual: array-like of true labels (0 or 1), aligned with ``pred``.

    Returns:
        ``(precisions, recalls)``: two lists with one entry per threshold. A ratio
        whose denominator is zero is reported as 1 (no positive predictions for
        precision, no positive samples for recall).
    """
    pred = np.asarray(pred)
    actual = np.asarray(actual)
    is_positive = actual == 1
    is_negative = actual == 0

    precisions = []
    recalls = []
    increments = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for increment in increments:
        flagged = pred >= increment
        # True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
        TP = int(np.sum(flagged & is_positive))
        # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
        FP = int(np.sum(flagged & is_negative))
        # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
        FN = int(np.sum(~flagged & is_positive))

        # Check the denominators up front rather than testing the quotient for NaN afterwards.
        precisions.append(TP / (TP + FP) if (TP + FP) > 0 else 1)
        recalls.append(TP / (TP + FN) if (TP + FN) > 0 else 1)
    return precisions, recalls


def accuracy(TP, TN, FP, FN):
    """Print the percentage of items that were classified correctly.

    Args:
        TP, TN, FP, FN: counts of true positives, true negatives, false
            positives and false negatives.
    """
    hits = TP + TN
    misses = FP + FN
    percent_correct = (hits / float(hits + misses)) * 100
    print("\tAccuracy: {}".format(percent_correct))

In [6]:
class LogisticRegression():
  """Binary logistic regression trained with batch gradient descent.

  ``single_classification`` runs the whole experiment: it loads and splits the
  games CSV, standardizes the features, fits on the training split and prints
  metrics for both the training and the held-out split.
  """

  # Location of the games CSV read by read_split_data.
  DATA_PATH = '/content/drive/MyDrive/CS613/games.csv'

  def __init__(self, learning_rate, iterations,game_fields):
    """
    Args:
        learning_rate: step size of each gradient-descent update.
        iterations: number of gradient-descent updates performed by ``fit``.
        game_fields: CSV column names to load; the last one is the target.
    """
    self.learning_rate = learning_rate
    self.iterations = iterations
    self.game_fields = game_fields

    self.train_x = None
    self.train_y = None
    self.test_x = None
    self.test_y_nums = None
    # Per-feature standardization statistics learned from the training split.
    self.means = None
    self.stds = None

  def read_split_data(self,seed=0):
    """Load the CSV, shuffle the rows with ``seed`` and return a 2/3 - 1/3 ``(train, test)`` split."""
    np.random.seed(seed)
    data = pd.read_csv(self.DATA_PATH,usecols=self.game_fields)
    # usecols does not control column order; reindex so the target ends up last.
    data = data.reindex(columns=self.game_fields)
    data = data.to_numpy()
    index = round(len(data) * (2 / 3))
    np.random.shuffle(data)
    train = data[0:index]
    test = data[index:]
    return train, test

  def single_classification(self):
    """Fit on the training split and print metrics for the training and validation splits.

    The validation features are standardized with the *training* means and
    standard deviations and scored by the model fitted on the training split;
    the model is never refitted on validation data.
    """
    train, test = self.read_split_data()
    self.train_x, self.train_y = getXY(train)
    self.test_x, self.test_y_nums = getXY(test)

    self.train_x,self.means,self.stds = zscore(self.train_x)
    self.train_log_loss = self.fit(self.train_x,self.train_y)
    train_y_pred = self.predict(self.train_x)
    print("TRAINING DATA")
    calculate_metrics(train_y_pred,self.train_y)
    self.train_precisions, self.train_recalls = calculate_metrics_with_threshold(train_y_pred,self.train_y)

    # Reuse the training statistics so validation inputs live in the same feature space as the weights.
    self.test_x = (np.asarray(self.test_x, dtype=float) - np.asarray(self.means, dtype=float)) / np.asarray(self.stds, dtype=float)
    test_y_pred = self.predict(self.test_x)
    print("VALIDATION DATA")
    calculate_metrics(test_y_pred, self.test_y_nums)
    self.test_precisions, self.test_recalls = calculate_metrics_with_threshold(test_y_pred, self.test_y_nums)

  # Function for model training
  def fit(self, X, Y):
    """Train from zero-initialized weights.

    Args:
        X: 2-D numeric array of shape (samples, features).
        Y: 1-D array of 0/1 labels, one per row of ``X``.

    Returns:
        List with the mean log loss measured at the start of each iteration.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).reshape(-1)
    # num training size, num features
    self.m, self.n = X.shape
    self.weights = np.zeros(self.n) #weights
    self.bias = 0 #bias
    self.X = X #X train data
    self.Y = Y #Y train data
    log_loss = []

    # gradient descent learning
    for _ in range(self.iterations):
      y_hat = self.gradient_descent()
      log_loss.append(self.log_loss(Y,y_hat))
    return log_loss

  def gradient_descent(self):
    """Apply one gradient step and return the predictions made *before* the update."""
    y_hat = self.predict(self.X)
    residual = np.reshape(y_hat - self.Y, self.m)

    # gradients of the mean log loss with respect to the weights and the bias
    grad_W = np.dot(self.X.T, residual) / self.m
    grad_B = np.sum(residual) / self.m

    self.weights = self.weights - self.learning_rate * grad_W
    self.bias = self.bias - self.learning_rate * grad_B

    return y_hat

  def predict(self, X):
    """Return the sigmoid of the linear score for each row of ``X`` (values in (0, 1))."""
    z = np.asarray(X, dtype=float).dot(self.weights) + self.bias
    # Clip the score so np.exp cannot overflow for extreme inputs.
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

  def log_loss(self,y, y_hat):
    """Mean binary cross-entropy between labels ``y`` and predicted probabilities ``y_hat``."""
    # Keep probabilities strictly inside (0, 1) so log() never produces -inf or NaN.
    eps = 1e-15
    y_hat = np.clip(y_hat, eps, 1 - eps)
    loss = -((y*np.log(y_hat)) + ((1 - y) * np.log(1-y_hat)))
    return loss.mean()

  

In [7]:
class Node():
  """One node of a decision tree: a split (feature index, threshold, children, gain) or a leaf (target)."""
  def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, target=None): 
    # split description
    self.index, self.threshold, self.info_gain = feature, threshold, gain
    # child subtrees
    self.left, self.right = data_left, data_right
    # label stored on the node; stays None unless the caller supplies one
    self.target = target

class DecisionTree(): 
  def __init__(self,game_fields):
    self.game_fields = game_fields

  def read_split_data(self,seed=0):
    np.random.seed(seed)
    data = pd.read_csv('/content/drive/MyDrive/CS613/games.csv',usecols=self.game_fields)
    data = data.reindex(columns=self.game_fields)
    data = data.to_numpy()
    index = round(len(data) * (2 / 3))
    np.random.shuffle(data)
    train = data[0:index]
    test = data[index:]
    return train, test

  def id3(self,seed=0):
    train, test = self.read_split_data(seed)
    self.train_x, self.train_y = getXY(train)
    self.tree = self.fit(self.train_x, self.train_y)

    self.test_x, self.test_y = getXY(test)
    calculate_metrics(self.predict(self.test_x),self.test_y)
    
  def fit(self,X,Y):
    return self.buildTree(np.concatenate((X, Y.reshape(-1, 1)), axis=1))

  def buildTree(self, dataset):
    X, Y = dataset[:,:-1], dataset[:,-1]
    num_features = np.shape(X)[1]
    best_node = self.get_best_node(dataset,num_features)
    if len(best_node) > 1 and best_node["gain"] > 0: #valid node
      left = self.buildTree(best_node["dataset_left"])
      right = self.buildTree(best_node["dataset_right"])
      return Node(best_node["index"], best_node["threshold"],left, right, best_node["gain"])
    return Node(target= max(list(Y), key=list(Y).count))
    
  def get_best_node(self,dataset, num_features):
    current_max_gain = -float("inf")
    best_node = {}
    for index in range(num_features):
      threshold = np.mean(dataset[:, index])
      dataset_left = np.array([row for row in dataset if row[index]<=threshold])
      dataset_right = np.array([row for row in dataset if row[index]>threshold])
      if len(dataset_left) > 0 and len(dataset_right) > 0: #valid left and right branches
        new_max_gain = self.calculate_info_gain(dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1])
        if new_max_gain > current_max_gain:
          best_node["index"] = index
          best_node["gain"] = new_max_gain
          best_node["threshold"] = threshold
          best_node["dataset_left"] = dataset_left
          best_node["dataset_right"] = dataset_right
          current_max_gain = new_max_gain
    return best_node

  def calculate_info_gain(self, parent, left_child, right_child):
    weight_left = len(left_child) / len(parent)
    weight_right = len(right_child) / len(parent)
    return self.get_attribute_entropy(parent) - (weight_left*self.get_attribute_entropy(left_child) + weight_right*self.get_attribute_entropy(right_child))

  def get_attribute_entropy(self, y):
    attribute_entropy = 0
    for cls in np.unique(y):
      class_probability = len(y[y == cls]) / len(y)
      attribute_entropy += -class_probability * np.log2(class_probability)
    return attribute_entropy
    
  def predict(self, X):
    tree = self.tree

    return np.array([self._predict(x, tree) for x in X])

  def _predict(self, x, tree):
    if tree.target is not None: #terminate case
      return tree.target 

    if x[tree.index] <= tree.threshold: #pick left
      return self._predict(x, tree.left)
    else: #pick right
      return self._predict(x, tree.right)

  def randomForestModel(self, forestSize=20):
    train, test = self.read_split_data()
    self.train_x, self.train_y = getXY(train)
    X = self.train_x
    forest = []
    for i in range(forestSize):
        feature_indices = np.random.randint(low=0, high=X.shape[1], size=X.shape)
        X_features = X[:, feature_indices]
        decisionTree = buildTree(X_features)
        forest.append(decisionTree)
    return forest

  def randomForestPredictions(self, randomForest):
    train, test = self.read_split_data()
    X, y = getXY(test)
    predictions = []
    for i in range(len(randomForest)):
        # column = "decision tree " + str(i)
        predictions.append(predict(
            X, randomForest[i]))
    predictions = np.array(predictions)
    return predictions.mode()

In [8]:
def generate_game_fields(fields,target):
  """Build the column list of a game dataframe: every "away_" field, every "home_" field, then the target."""
  away_columns = ["away_" + name for name in fields]
  home_columns = ["home_" + name for name in fields]
  return [*away_columns, *home_columns, target]

# Per-team statistic names; each is prefixed with "away_" and "home_" to form the game columns.
fields = ['offensive_rating','effective_field_goal_percentage','total_rebound_percentage','free_throw_attempt_rate','free_throw_percentage','three_point_attempt_rate',
               'three_point_field_goal_percentage','turnover_percentage','true_shooting_percentage']

# Full column list for one game; "home_win" is appended last as the target column.
game_fields = generate_game_fields(fields,"home_win")
print(game_fields)


['away_offensive_rating', 'away_effective_field_goal_percentage', 'away_total_rebound_percentage', 'away_free_throw_attempt_rate', 'away_free_throw_percentage', 'away_three_point_attempt_rate', 'away_three_point_field_goal_percentage', 'away_turnover_percentage', 'away_true_shooting_percentage', 'home_offensive_rating', 'home_effective_field_goal_percentage', 'home_total_rebound_percentage', 'home_free_throw_attempt_rate', 'home_free_throw_percentage', 'home_three_point_attempt_rate', 'home_three_point_field_goal_percentage', 'home_turnover_percentage', 'home_true_shooting_percentage', 'home_win']


In [None]:
# Create both models over the same game columns, then build the random forest from the decision-tree model.
lr_model = LogisticRegression(learning_rate=.001, iterations=5000,game_fields=game_fields)
dt_model = DecisionTree(game_fields=game_fields)
rand_forest = dt_model.randomForestModel()

In [None]:
# Train the logistic-regression model and print its training and validation metrics.
lr_model.single_classification()

In [None]:
# Fit the decision tree (stored on dt_model.tree) and print its metrics on the test split.
dt_model.id3()

In [None]:
# Score the test split with the random forest built earlier.
dt_model.randomForestPredictions(rand_forest)

In [None]:
# Disabled scratch code kept as a string literal (it is never executed): it z-scores a `game`
# array with the means/stds stored on lr_model before calling lr_model.predict.
"""
#zscore based on training means and stds
means = lr_model.means
stds = lr_model.stds
num_features = np.shape(game)[1]
for i in range(num_features):
    current_feature = game[:, i]
    game[:, i] = (current_feature - means[i]) / stds[i]
# game = game - means / stds
lr_model.predict(game)
"""

In [None]:
class Bracket():
  """Replays an NCAA tournament bracket with a trained model and counts its correct picks.

  The bracket and the actual winners are scraped from sports-reference.com;
  per-team statistics come from sportsipy and are cached per team.
  """

  # Regex matching a school link; the school slug is the text between the last two slashes.
  SCHOOL_LINK_PATTERN = r"/cbb/schools/\D+/"
  TEAMS_PER_REGION = 16
  GAMES_PER_REGION = 15
  FINAL_FOUR_GAMES = 3
  TOTAL_GAMES = 63

  def __init__(self,year,model,team_fields=None):
    """
    Args:
        year: tournament year to download.
        model: object exposing ``predict(game)`` and ``game_fields`` (target column last).
        team_fields: per-team statistic names passed to sportsipy. Defaults to the
            notebook-level ``fields`` list, which this class previously read implicitly.
    """
    self.year = year
    self.teams = {}
    self.model = model
    self.team_fields = fields if team_fields is None else team_fields
    self.correct_count = 0
    self.region_count = 0
    self.bracket,self.winners_bracket = self.get_bracket()

  def _extract_schools(self, elements, unique=True):
    """Collect upper-cased school slugs from every link inside ``elements``, in document order.

    Args:
        elements: iterable of parsed HTML elements to search for ``<a>`` tags.
        unique: when True a school is listed only the first time it appears.
    """
    schools = []
    for element in elements:
      for a in element.find_all("a"):
        team = re.findall(self.SCHOOL_LINK_PATTERN, str(a))
        if team:
          school = team[0].replace('/cbb/schools/', '').replace('/', '').upper()
          if not unique or school not in schools:
            schools.append(school)
    return schools

  def determine_bracket_order(self,soup_data,bracket,winners_bracket):
    """Reorder both brackets so regions follow the order of the teams linked in the "national" section.

    Returns:
        ``(bracket, winners_bracket)`` as new dicts; ``winners_bracket`` also keeps
        its "FINAL FOUR" entry as the last key.
    """
    final_four_teams = self._extract_schools(soup_data.find_all(id="national"))
    key_order = []
    for team in final_four_teams:
      for key, teams in bracket.items():
        if team in teams and key not in key_order:
          key_order.append(key)
    bracket = {k: bracket[k] for k in key_order}
    key_order.append("FINAL FOUR")
    winners_bracket = {k: winners_bracket[k] for k in key_order}
    return bracket,winners_bracket

  def get_winners(self,soup_data):
    """Return the actual winners grouped in chunks of 15 games; a chunk of 3 or fewer is the "FINAL FOUR"."""
    winners = self._extract_schools(soup_data.find_all("div", {"class": "winner"}), unique=False)
    bracket = {}
    size = self.GAMES_PER_REGION
    split = [winners[i:i + size] for i in range(0, len(winners), size)]
    for index,region in enumerate(split):
      if len(region) > self.FINAL_FOUR_GAMES:
        bracket[index+1] = region
      else:
        bracket["FINAL FOUR"] = region
    return bracket

  def get_bracket(self):
    """Download the tournament page and return ``(bracket, winners_bracket)``.

    Raises:
        requests.HTTPError: if the page request does not succeed.
        ValueError: if fewer than four regions of teams are found on the page.
    """
    url = 'https://www.sports-reference.com/cbb/postseason/'+str(self.year)+'-ncaa.html'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup_data = BeautifulSoup(res.text, 'html.parser')
    teams = self._extract_schools(soup_data.find_all(id="bracket"))
    #breaking into regions
    size = self.TEAMS_PER_REGION
    split = [teams[i:i + size] for i in range(0, len(teams), size)]
    if len(split) < 4:
      raise ValueError("expected 4 regions of teams for {}, found {}".format(self.year, len(split)))
    output = {number + 1: split[number] for number in range(4)}
    winners_bracket = self.get_winners(soup_data)
    output,winners_bracket = self.determine_bracket_order(soup_data,output,winners_bracket)

    return output,winners_bracket

  def run_bracket(self):
    """Play every region, then the final four, printing per-stage and overall accuracy.

    ``self.bracket`` is consumed: afterwards it only holds the "FINAL FOUR" entry.
    """
    final_four = []
    for region in self.bracket:
      self.run_region(region)
      final_four.append(self.bracket.get(region)[0])
      print("\tCorrectly Predicted Games in Region #{}: {}".format(region,self.region_count))
      percentage = (self.region_count/self.GAMES_PER_REGION) * 100
      print("\tPercentage of Correctly Predicted Games in Region #{}: {}".format(region,percentage))
      self.region_count = 0
      print("")

    self.bracket.clear()
    self.bracket["FINAL FOUR"] = final_four
    self.run_region("FINAL FOUR")
    print("\tCorrectly Predicted Games in FINAL FOUR: {}".format(self.region_count))
    percentage = (self.region_count/self.FINAL_FOUR_GAMES) * 100
    print("\tPercentage of Correctly Predicted Games in FINAL FOUR: {}".format(percentage))
    print("")

    print("\t{} Champion: {}".format(self.year,self.bracket["FINAL FOUR"][0]))
    print("\tCorrectly Predicted Games: {}".format(self.correct_count))
    percentage = (self.correct_count/self.TOTAL_GAMES) * 100
    print("\tPercentage of Correctly Predicted Games: {}".format(percentage))

  def run_region(self,region):
    """Play ``region`` round by round until one team is left, updating the correct-pick counters.

    Adjacent teams meet each other; the first of each pair is treated as the home
    team. A prediction above 0.5 counts as a home win, so both hard 0/1 labels and
    probabilities are accepted.

    Raises:
        ValueError: if a round has an odd number of teams.
    """
    teams = self.bracket.get(region) or []
    # Iterate over a copy so the recorded winners are not consumed.
    actual_winners = iter(list(self.winners_bracket.get(region) or []))
    print(region)
    while len(teams) > 1:
      if len(teams) % 2:
        raise ValueError("region {} has an odd number of teams: {}".format(region, len(teams)))
      winners = []
      for index in range(0, len(teams), 2):
        actual_winner = next(actual_winners, None)
        home = teams[index]
        away = teams[index+1]
        game = self.create_game(away,home,self.team_fields,self.model.game_fields[:-1])
        pred = self.model.predict(game)[0] 
        picked = home if pred > 0.5 else away
        winners.append(picked)
        if picked == actual_winner:
          self.correct_count += 1
          self.region_count +=1
        print("\tH: {} vs A: {}, home win = {}, actual winner = {}".format(home,away, pred,actual_winner))
      self.bracket[region] = winners  
      teams = winners
      print("")

  def get_team_by_name(self,team_name,year,team_fields):
    """Return the ``team_fields`` columns of a team's sportsipy dataframe, without NaN columns."""
    team = Team(team_name=team_name, year=year)
    df = team.dataframe[team_fields].dropna(axis='columns')
    return df

  def _team_frame(self, abbr, team_fields):
    """Return the statistics frame for ``abbr``, fetching and caching it on first use."""
    if abbr not in self.teams:
      self.teams[abbr] = self.get_team_by_name(abbr,self.year,team_fields)
    return self.teams[abbr]

  def create_game(self,away_abbr,home_abbr, team_fields, game_fields):
    """Combine two team entries into a single-game feature array for the model.

    Args:
        away_abbr, home_abbr: team identifiers (also the cache keys).
        team_fields: per-team statistic names to fetch.
        game_fields: column names for the combined row (away columns first, then home).

    Returns:
        2-D numpy array with the away statistics followed by the home statistics.
    """
    #only call api for a team once
    away_df = self._team_frame(away_abbr, team_fields)
    home_df = self._team_frame(home_abbr, team_fields)

    #need dummy column to join on; assign() works on copies so the cached frames stay untouched
    game = pd.merge(away_df.assign(joincol=1), home_df.assign(joincol=1), how='outer', on='joincol', suffixes=('_away', '_home'))
    game = game.drop(columns=['joincol']).dropna(axis='columns')
    game.columns = game_fields
    return game.to_numpy()

# Replay the 2015 tournament with the decision-tree model and print how many games it called correctly.
fifteen_bracket = Bracket(2015,dt_model)
fifteen_bracket.run_bracket()

In [None]:
# Replay the 2018 tournament with the decision-tree model.
eighteen_bracket = Bracket(2018,dt_model)
eighteen_bracket.run_bracket()

In [None]:
# Replay the 2019 tournament with the decision-tree model.
nineteen_bracket = Bracket(2019,dt_model)
nineteen_bracket.run_bracket()

In [None]:
# Replay the 2021 tournament with the decision-tree model.
twenty_one_bracket = Bracket(2021,dt_model)
twenty_one_bracket.run_bracket()