In [277]:

import string
from functools import partial

# Import packages
import pandas as pd
from joblib import Parallel, delayed
import numpy as np
import plotly as pl
import plotly.graph_objects as go
import plotly.express as px
import datetime as dt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# from scipy import stats
from IPython.display import display
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## Predictive Measures
1. Each match pulls in data from these most preceding data points:
    1. Five fixtures of each team against any rival at home TEAM_any_rival_home_RECENCY.
    2. Five fixtures of each team against any rival away TEAM_any_rival_away_RECENCY.
    3. Five encounters of these two teams in matching home/away configuration matching_encounter_RECENCY.
    4. Five encounters of these two teams in inverted home/away configuration inverted_encounter_RECENCY.   
2. These metrics are collected and labelled separately for each of these 20 fixtures:
    1. Goals for
    2. Goals against
    3. Points earned
3. These aggregate columns are also added:
    1. Goal difference (direct encounter)
    2. Goal difference (any encounter)
    3. Points difference (direct encounter)
    4. Points difference (any encounter)

Example row initial data:

index | date | home_team | away_team | full_time_home_goals | full_time_away_goals | full_time_points_home | full_time_points_away |

For each row, the previous 30 fixtures matching the given criteria will add this data:

goals_for_CRITERION | goals_against_CRITERION | points_earned_CRITERION | days_since_CRITERION

An example criterion would be: home_any_rival_home_1 - specifying the most recent match of the current home team, against any rival

An example column would be: goals_for_home_any_rival_home_1
An example column would be: days_since_any_rival_home_1

In [278]:
# define criteria labels

# team classifier
home = "home"
away = "away"

# match type classifier 
any_rival_home = "any_rival_home"
any_rival_away = "any_rival_away"
matching_encounter = "matching_encounter"
inverted_encounter = "inverted_encounter"

# outcome classifiers
goals_for = "goals_for"
goals_against = "goals_against"
points_earned = "points_earned"
days_since = "days_since"

outcome_criteria = [goals_for, goals_against, points_earned, days_since]
team_criteria = [home, away]
match_type_criteria = [matching_encounter, inverted_encounter, any_rival_home, any_rival_away]



# Generate criteria combinations
criteria_combinations = [
    [outcome, team, match_type]
    for outcome in outcome_criteria
    for team in team_criteria
    for match_type in match_type_criteria
]

# Generate all criteria combinations with recency
all_criteria_combinations = [
    combination + [recency]
    for recency in range(1, 6)
    for combination in criteria_combinations
]

print(len(all_criteria_combinations))

160


In [279]:
# Function to calculate points
def calculate_points(row):
    if row['full_time_home_goals'] > row['full_time_away_goals']:
        home_points = 3
        away_points = 0
    elif row['full_time_home_goals'] == row['full_time_away_goals']:
        home_points = 1
        away_points = 1
    else:
        home_points = 0
        away_points = 3
    return pd.Series([home_points, away_points])

In [280]:
# Function to extract data points for each record
def extract_data_points(record, df_param, combinations):
    data_points = {}
    for combo in combinations:
        outcome, team, match_type, recency = combo
        date = record['date']
        team_name = record[team]
        
        # Find previous matches based on the criteria
        past_matches = df_param[df_param['date'] < date]
        
        if past_matches.empty:
            data_points[f"{outcome}_{team}_{match_type}_{recency}"] = 0
            continue
        
        if match_type == matching_encounter:
            past_matches = past_matches[
                (past_matches['home'] == record['home']) & (past_matches['away'] == record['away'])
            ]
        elif match_type == inverted_encounter:
            past_matches = past_matches[
                (past_matches['home'] == record['away']) & (past_matches['away'] == record['home'])
            ]
        elif match_type == any_rival_home:
            past_matches = past_matches[past_matches['home'] == team_name]
        elif match_type == any_rival_away:
            past_matches = past_matches[past_matches['away'] == team_name]
        
        if past_matches.empty or len(past_matches) < recency:
            data_points[f"{outcome}_{team}_{match_type}_{recency}"] = 0
            continue
        
        # Get the specific match at the recency position
        specific_match = past_matches.iloc[-recency]
        
        value = 0
        if outcome == goals_for:
            if team == home:
                value = specific_match['full_time_home_goals']
            else:
                value = specific_match['full_time_away_goals']
        elif outcome == goals_against:
            if team == home:
                value = specific_match['full_time_away_goals']
            else:
                value = specific_match['full_time_home_goals']
        elif outcome == points_earned:
            if team == home:
                value = specific_match['full_time_home_points']
            else:
                value = specific_match['full_time_away_points']
        elif outcome == days_since:
            last_match_date = specific_match['date']
            value = (date - last_match_date).days
        
        data_points[f"{outcome}_{team}_{match_type}_{recency}"] = value
    
    return data_points


In [281]:
df = pd.read_csv("data/final_dataset.csv")
df.head()

# Apply the function to each row and create new columns
df[['full_time_home_points', 'full_time_away_points']] = df.apply(calculate_points, axis=1)
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

# Pre-group the dataframe by home and away teams
grouped_home = df.groupby('home')
grouped_away = df.groupby('away')

# Sample
# Test set
sample = df.head(1000)


In [282]:
# Function to apply in parallel
def apply_func(record, df_param, criteria_combinations_param):
    return extract_data_points(record, df_param, criteria_combinations_param)

# Apply the function in parallel
def parallelize_dataframe(df_param, func, combinations, n_cores=4):
    df_split = np.array_split(df_param, n_cores)
    pool = Parallel(n_jobs=n_cores)
    func_partial = partial(func, df_param=df_param, combinations=combinations)
    results = pool(delayed(lambda d: d.apply(func_partial, axis=1))(d) for d in df_split)
    return pd.concat(results, axis=0)



In [283]:
# Apply the function to each record using parallel processing
all_data_points = parallelize_dataframe(sample, extract_data_points, all_criteria_combinations, 16)

# Convert the extracted data points into columns
data_points_df = pd.DataFrame(list(all_data_points))
final_df = pd.concat([sample, data_points_df], axis=1)


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



In [284]:
# Apply the function to each record using parallel processing
all_data_points_full = parallelize_dataframe(df, extract_data_points, all_criteria_combinations, 16)


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



In [285]:

# Convert the extracted data points into columns
data_points_df_full = pd.DataFrame(list(all_data_points_full))
final_df_full = pd.concat([df, data_points_df_full], axis=1)

In [286]:
final_df_full.to_csv('final_dataset.csv', index=False)

In [287]:
columns_to_drop = [home, away, 'date', 'full_time_home_goals', 'full_time_away_goals', 'full_time_away_points', 'Unnamed: 0']
objective_value_plus_prediction_data = final_df_full.drop(columns=columns_to_drop)

In [288]:
target_column = 'full_time_home_points'
scale_factor_labels = 1
correlations = objective_value_plus_prediction_data.corr()[target_column].drop(target_column).drop('Unnamed: 0')

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
# 0.8 Split, as in the MLR work flow.
def train_test_split(df_input):
    train_data_output = df_input.sample(frac=0.8, random_state=0)
    test_data_output = df_input.drop(train_data_output.index)
    return train_data_output, test_data_output

def get_features_and_labels(train_data_i, test_data_i):
    train_features_o = train_data_i.copy()
    test_features_o = test_data_i.copy()
    
    train_labels_o = train_features_o.pop(target_column)
    test_labels_o = test_features_o.pop(target_column)
    
    train_labels_o = train_labels_o / scale_factor_labels
    test_labels_o = test_labels_o / scale_factor_labels
    
    return train_features_o.astype(float), test_features_o.astype(float), train_labels_o.astype(float), test_labels_o.astype(float)

In [None]:
train_data, test_data = train_test_split(final_df_full)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
train_data_nums = train_data.drop(columns=columns_to_drop)
test_data_nums = test_data.drop(columns=columns_to_drop)
train_features, test_features, train_labels, test_labels = get_features_and_labels(train_data_nums, test_data_nums)

train_data_nums.head()

In [None]:

def build_and_compile_model(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ])
    
    model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(0.001))
    
    return model

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

first = np.array(train_features[:1])

with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print()
    print('Normalized:', normalizer(first).numpy())
    
sample_data = np.random.rand(100,5)

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time
history_dnn = dnn_model.fit(
    train_features,
    train_labels,
    epochs=100,
    verbose=0,
    validation_split=0.2
)

In [None]:
def plot_loss(history, title):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=history['epoch'], y=history['loss'], mode='lines', name='loss vs epoch'))
    fig.add_trace(go.Scatter(x=history['epoch'], y=history['val_loss'], mode='lines', name='val_loss vs epoch'))
    fig.update_layout(title=title)
    fig.show()

In [None]:
hist_dnn = pd.DataFrame(history_dnn.history)
hist_dnn['epoch'] = history_dnn.epoch
plot_loss(hist_dnn, "DNN Training")

In [None]:
test_results = {}
test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0) * scale_factor_labels
pd.DataFrame(test_results, index=['Mean absolute error '+ target_column]).T

In [None]:
predictions = dnn_model.predict(test_features)
pred_df = pd.DataFrame(predictions).reset_index()

In [None]:
comparison = pd.concat([test_data, pred_df], axis=1)

pred_df.head(20)
test_data.head(20)

In [None]:
comparison.head(20)

In [None]:
comparison.rename(columns={0: 'prediction'}, inplace=True)

In [702]:
# Define a function to apply your quantization/labeling logic
def label_predictions(value):
    if value < 0: # 0.8 is best?
        return 0
    # elif value < -5: # 0.9 is best?
    #     return 1
    else:
        return 3

In [703]:
# Apply the function to create a new column 'quantized_label'
comparison = comparison.assign(quantized_label=comparison['prediction'].apply(label_predictions))

In [704]:
comparison['successful_prediction'] = comparison['quantized_label'] == comparison['full_time_home_points']
comparison['prediction_failure_type'] = comparison['quantized_label'] - comparison['full_time_home_points']

In [705]:
outcome_and_prediction = comparison[['quantized_label', 'full_time_home_points', 'successful_prediction', 'prediction_failure_type', 'prediction']]


In [706]:
outcome_and_prediction['full_time_home_points'].value_counts()
ordered_by_real = comparison.sort_values(by='full_time_home_points', ascending=True)
ordered_by_prediction = comparison.sort_values(by='prediction', ascending=True)


outcome_and_prediction['prediction_failure_type'].value_counts()

prediction_failure_type
 0    669
 3    325
 2    306
-3     41
-1     27
Name: count, dtype: int64

In [707]:
first_draw_index = ordered_by_real[ordered_by_real['full_time_home_points'] == 1].index[0]
# prediction_draw_cutoff = ordered_by_prediction.index[first_draw_index]['prediction']
print(first_draw_index)

104


In [708]:
comparison['full_time_home_points'].value_counts()

full_time_home_points
3    657
0    378
1    333
Name: count, dtype: int64

In [710]:
print(669/1369)

0.48867786705624544


In [711]:
final_df_full['full_time_home_points'].value_counts()

full_time_home_points
3    3176
0    1913
1    1751
Name: count, dtype: int64

In [712]:
print(3176/(3176+1913+1751))

0.464327485380117


In [715]:
target_value = 3
target_value_column = 'full_time_home_points'
# Step 2: Count occurrences of value 1 in 'value_column' and compute proportion
def compute_proportion(group):
    counts = group[target_value_column].value_counts()
    if target_value in counts:
        proportion = counts[target_value] / len(group)
    else:
        proportion = 0
    return proportion

result = grouped_home.apply(compute_proportion).reset_index()
result.columns = ['team', 'home wins proportion']
sorted = result.sort_values(by='home wins proportion', ascending=False)

print(sorted)

                team  home wins proportion
25        Man United              0.730994
0            Arsenal              0.690058
12           Chelsea              0.681287
23         Liverpool              0.605263
24          Man City              0.591331
38         Tottenham              0.578947
26       Middlesboro              0.526316
16           Everton              0.508772
28         Newcastle              0.467105
17            Fulham              0.453441
20           Ipswich              0.447368
35             Stoke              0.426316
21             Leeds              0.421053
34       Southampton              0.411483
3          Blackburn              0.411483
41          West Ham              0.407018
30        Portsmouth              0.406015
11          Charlton              0.406015
32           Reading              0.403509
22         Leicester              0.390977
5             Bolton              0.387560
37           Swansea              0.383459
2         B



