Practice building a decision tree that compares four inputs: season total, per-90 rate, last-five-games form (l5), and what the opponent allows.

In [2]:
import json, csv
import pandas as pd
from constants import *
import numpy as np
from scipy.stats import poisson

def poisson_over_likelyhood(mean, target):
    """Probability that a Poisson variable with the given mean exceeds ``target``.

    Computes P(X > target) for X ~ Poisson(mean). The comparison is strict, so
    an integer ``target`` of 2 means "3 or more"; a fractional ``target`` such
    as 1.5 means "2 or more".

    Args:
        mean: Expected count (the Poisson rate parameter).
        target: Threshold that the count must exceed.

    Returns:
        The tail probability as a float in [0, 1].
    """
    # Use the survival function instead of ``1 - cdf``: the subtraction
    # cancels to exactly 0.0 once the tail probability drops below float
    # epsilon, whereas ``sf`` evaluates the upper tail directly.
    return poisson.sf(target, mean)



In [3]:
# Read the four input tables in one pass: season player stats, PP lines,
# last-five-games player stats, and team-level stats.
players, PP_lines, last5, teams = map(
    pd.read_csv,
    (
        "Premier_League_Player_Stats.csv",
        "pp_lines.csv",
        "Premier_League_Last_Five_Stats.csv",
        "Premier_League_Team_Stats.csv",
    ),
)

# Join everything onto the player rows. Each merge uses the pandas default
# (inner) join, so players missing from any table are dropped. Columns that
# clash keep their original name on the left side and get a suffix on the right.
merged = (
    players
    .merge(PP_lines, on="name")
    .merge(last5, on="name", suffixes=("", "_last5"))
    #TODO: Make sure all opponents and teams are in the same format from PP to FBREF
    .merge(teams, left_on="opponent", right_on="name", suffixes=("", "_opposing_team"))
)
merged

Unnamed: 0,name,team,games,minutes,gk_saves,clearances,tackles,assisted_shots,fouled,shots,...,tackles_opposing_team,interceptions_opposing_team,clearances_opposing_team,tackles_against,interceptions_against,clearances_against,fouls_opposing_team,fouled_opposing_team,fouls_against,fouled_against
0,Conor Gallagher,Chelsea,29,2421,,15.0,70.0,45.0,42.0,40.0,...,610,308,699,491,208,743,376,289,301,365
1,Axel Disasi,Chelsea,28,2480,,97.0,31.0,3.0,17.0,15.0,...,610,308,699,491,208,743,376,289,301,365
2,Moises Caicedo,Chelsea,27,2173,,32.0,62.0,15.0,36.0,8.0,...,610,308,699,491,208,743,376,289,301,365
3,Enzo Fernandez,Chelsea,27,2143,,28.0,48.0,32.0,31.0,44.0,...,610,308,699,491,208,743,376,289,301,365
4,Nicolas Jackson,Chelsea,27,2084,,15.0,14.0,26.0,23.0,57.0,...,610,308,699,491,208,743,376,289,301,365
5,Thiago Silva,Chelsea,24,2073,,117.0,26.0,6.0,8.0,10.0,...,610,308,699,491,208,743,376,289,301,365
6,Cole Palmer,Chelsea,26,1993,,11.0,21.0,52.0,20.0,69.0,...,610,308,699,491,208,743,376,289,301,365
7,Raheem Sterling,Chelsea,27,1851,,8.0,20.0,26.0,37.0,46.0,...,610,308,699,491,208,743,376,289,301,365
8,Malo Gusto,Chelsea,22,1528,,36.0,50.0,29.0,27.0,11.0,...,610,308,699,491,208,743,376,289,301,365
9,Mykhailo Mudryk,Chelsea,24,1080,,6.0,20.0,20.0,20.0,24.0,...,610,308,699,491,208,743,376,289,301,365


In [6]:
# Length of a full match in minutes, used to normalise totals to a per-90 rate.
MINUTES_PER_MATCH = 90


def _rate(total, denominator, scale=1):
    """Return ``total * scale / denominator``, or NaN when the denominator is 0.

    A zero denominator (no minutes or no games recorded) would otherwise raise
    ZeroDivisionError or produce ``inf``; returning NaN lets the final
    ``dropna()`` discard that row together with the other incomplete ones.
    """
    if denominator == 0:
        return float("nan")
    return total * scale / denominator


results = []
# PP stat types that are deliberately skipped.
not_available = ['Shots On Target']

for _, row in merged.iterrows():
    name = row['name']

    for pp_stat_type, fbref_stat_type in pp_to_fbref_stats.items():
        # Cheap membership test first: skip stat types that are excluded.
        if pp_stat_type in not_available:
            continue

        # Failsafe: the merged frame has no column for this PP stat type.
        if pp_stat_type not in row:
            continue

        # No usable line for this player/stat combination. A NaN line passes
        # this check and is removed by dropna() below.
        line = row[pp_stat_type]
        if line <= 0:
            continue

        # Season total for the FBref stat matching this PP stat type,
        # plus the same stat over the last five games.
        season_total = row[fbref_stat_type]
        last5_total = row[fbref_stat_type + "_last5"]

        results.append({
            'name': name,
            'stat': pp_stat_type,
            'line': line,
            # Season-long rates.
            'per90': _rate(season_total, row['minutes'], MINUTES_PER_MATCH),
            'perGame': _rate(season_total, row['games']),
            # Recent-form rates.
            'last5_per90': _rate(last5_total, row['minutes_last5'], MINUTES_PER_MATCH),
            'last5_perGame': _rate(last5_total, row['games_last5']),
            # NOTE(review): stored exactly as read from the "_opposing_team"
            # column; despite the key name it is not normalised here.
            'opposing_team_relative_value': row[fbref_stat_type + "_opposing_team"],
        })

# Convert the results list into a dataframe, dropping rows with any missing value
df = pd.DataFrame(results).dropna()

df

Unnamed: 0,name,stat,line,per90,perGame,last5_per90,last5_perGame,opposing_team_relative_value
5,Conor Gallagher,Shots,1.5,1.486989,1.37931,1.777778,1.6,424
8,Conor Gallagher,Fouls,2.0,2.379182,2.206897,0.0,0.0,376
15,Axel Disasi,Shots,0.5,0.544355,0.535714,1.2,1.2,424
22,Moises Caicedo,Tackles,2.0,2.567879,2.296296,2.138229,2.2,610
35,Enzo Fernandez,Shots,2.0,1.847877,1.62963,2.896996,3.0,424
36,Enzo Fernandez,Passes Attempted,79.5,75.804946,66.851852,84.206009,87.2,12583
45,Nicolas Jackson,Shots,2.5,2.461612,2.111111,3.511294,3.8,424
51,Thiago Silva,Clearances,3.5,5.079595,4.875,0.0,0.0,699
63,Cole Palmer,Shots Assisted,2.0,2.348219,2.0,0.0,0.0,286
65,Cole Palmer,Shots,3.5,3.115906,2.653846,2.576687,2.8,424
