In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split


In [None]:
# Read the match dataset from 'dataset.csv' (a relative path, resolved against the kernel's working directory).
df = pd.read_csv('dataset.csv')

In [None]:
def preprocess_data(df):
    """Build the modelling frame from the raw match data.

    Steps, in order:

    1. Recode ``FGS``: 0 becomes -1 and 2 becomes 0 (other values are kept).
    2. Add ``diff_points_h-a`` (``POINTS_H - POINTS_A``) and
       ``diff_total_H-A`` (``TOTAL_H_P - TOTAL_A_P``).
    3. Transform ``diff_points_h-a`` with the scaler stored in
       ``robust_scaler.pkl`` and ``diff_total_H-A`` with the one stored in
       ``standard_scaler.pkl``.
    4. Drop ``RED-H``, ``RED-A``, ``POINTS_H``, ``POINTS_A``, ``TOTAL_H_P``,
       ``TOTAL_A_P`` and ``Match Number``. ``Match_O`` is kept.
    5. Keep only the rows whose ``HTGD`` is -1, 0 or 1.

    The input frame is copied before any column is written, so the caller's
    ``df`` is never modified. This matters because the recode in step 1 is not
    idempotent: applied twice, an original ``FGS`` of 2 ends up as -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``FGS``, ``HTGD``, ``POINTS_H``,
        ``POINTS_A``, ``TOTAL_H_P``, ``TOTAL_A_P``, ``RED-H``, ``RED-A`` and
        ``Match Number``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns, restricted to rows with
        ``HTGD`` in {-1, 0, 1}.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # NOTE: joblib.load unpickles the file contents; only load scaler files you trust.
    robust_scaler = joblib.load('robust_scaler.pkl')
    standard_scaler = joblib.load('standard_scaler.pkl')

    # Work on a private copy so none of the assignments below leak into the caller's frame.
    df = df.copy()

    # Recode 'FGS' (the dict form maps both values in one pass, it does not chain 2 -> 0 -> -1)
    df['FGS'] = df['FGS'].replace({0: -1, 2: 0})

    # Home-minus-away difference features
    df['diff_points_h-a'] = df['POINTS_H'] - df['POINTS_A']
    df['diff_total_H-A'] = df['TOTAL_H_P'] - df['TOTAL_A_P']

    # Scale each difference with its pre-fitted scaler (transform only, no re-fitting)
    df['diff_points_h-a'] = robust_scaler.transform(df[['diff_points_h-a']])
    df['diff_total_H-A'] = standard_scaler.transform(df[['diff_total_H-A']])

    # Drop the raw inputs of the difference features and the other unused columns
    df = df.drop(columns=['RED-H', 'RED-A', 'POINTS_H', 'POINTS_A', 'TOTAL_H_P', 'TOTAL_A_P', 'Match Number'])

    # Keep only rows with HTGD in {-1, 0, 1}
    df = df[df['HTGD'].isin([-1, 0, 1])]

    return df

In [None]:
# Feature-engineer and scale the data; the result keeps only rows with HTGD in {-1, 0, 1}.
df_mod = preprocess_data(df)

In [None]:
# Separate the target column (Match_O) from the feature columns
y_mod = df_mod['Match_O']
X_mod = df_mod.drop(columns=['Match_O'])

In [None]:
# Display df as this cell's output for a quick visual check.
df

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_mod,
    y_mod,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Rows whose HTGD lies outside {-1, 0, 1}, i.e. the complement of the subset that
# preprocess_data keeps. `df` is intentionally not rebound here, so the full raw
# frame stays available to later cells. The .copy() makes rest_df an independent
# frame, so later column assignments on it do not act on a slice of `df`.
rest_df = df[~df['HTGD'].isin([-1, 0, 1])].copy()

In [None]:
# Display rest_df as this cell's output for a quick visual check.
rest_df

In [None]:
def preprocess_rest(df):
    """Apply the feature engineering and scaling steps without filtering on HTGD.

    This is the same pipeline as ``preprocess_data`` minus the final row
    filter, so every input row is returned:

    1. Recode ``FGS``: 0 becomes -1 and 2 becomes 0 (other values are kept).
    2. Add ``diff_points_h-a`` (``POINTS_H - POINTS_A``) and
       ``diff_total_H-A`` (``TOTAL_H_P - TOTAL_A_P``).
    3. Transform ``diff_points_h-a`` with the scaler stored in
       ``robust_scaler.pkl`` and ``diff_total_H-A`` with the one stored in
       ``standard_scaler.pkl``.
    4. Drop ``RED-H``, ``RED-A``, ``POINTS_H``, ``POINTS_A``, ``TOTAL_H_P``,
       ``TOTAL_A_P`` and ``Match Number``. ``Match_O`` is kept.

    The input frame is copied before any column is written. The argument is
    typically a filtered slice of a larger frame; writing into it directly
    triggers pandas' SettingWithCopyWarning and can leak changes into whatever
    other name refers to the same object.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``FGS``, ``POINTS_H``, ``POINTS_A``,
        ``TOTAL_H_P``, ``TOTAL_A_P``, ``RED-H``, ``RED-A`` and ``Match Number``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns and the same rows as the input.

    Raises
    ------
    KeyError
        If one of the required columns is missing (for example when an already
        processed frame is passed in again).
    """
    # NOTE: joblib.load unpickles the file contents; only load scaler files you trust.
    robust_scaler = joblib.load('robust_scaler.pkl')
    standard_scaler = joblib.load('standard_scaler.pkl')

    # Work on a private copy so none of the assignments below leak into the caller's frame.
    df = df.copy()

    # Recode 'FGS' (the dict form maps both values in one pass, it does not chain 2 -> 0 -> -1)
    df['FGS'] = df['FGS'].replace({0: -1, 2: 0})

    # Home-minus-away difference features
    df['diff_points_h-a'] = df['POINTS_H'] - df['POINTS_A']
    df['diff_total_H-A'] = df['TOTAL_H_P'] - df['TOTAL_A_P']

    # Scale each difference with its pre-fitted scaler (transform only, no re-fitting)
    df['diff_points_h-a'] = robust_scaler.transform(df[['diff_points_h-a']])
    df['diff_total_H-A'] = standard_scaler.transform(df[['diff_total_H-A']])

    # Drop the raw inputs of the difference features and the other unused columns
    df = df.drop(columns=['RED-H', 'RED-A', 'POINTS_H', 'POINTS_A', 'TOTAL_H_P', 'TOTAL_A_P', 'Match Number'])

    return df

In [None]:
# Apply the feature engineering/scaling to the out-of-range rows (no HTGD filter).
# NOTE: rest_df is rebound to the processed frame, so re-running this cell passes in a
# frame whose POINTS_*/TOTAL_* columns were already dropped and preprocess_rest raises
# a KeyError; re-run the cell that creates rest_df first.
rest_df = preprocess_rest(rest_df)

In [None]:
# Append the out-of-range rows to the held-out split. pd.concat stacks rows, whereas
# `+` on frames/series would add values element-wise. ignore_index=True renumbers both
# results from 0 so features and labels stay aligned by position.
# NOTE: X_test / y_test are rebound, so re-running this cell appends the rows again.
X_test = pd.concat(
    [X_test, rest_df.drop(columns=['Match_O'])],
    ignore_index=True,
)
y_test = pd.concat(
    [y_test, rest_df['Match_O']],
    ignore_index=True,
)


In [None]:
# Display X_test as this cell's output for a quick visual check.
X_test

In [None]:
def predict_outcome(X):
    """Predict one outcome label per row of ``X``.

    Labels: 2 = home win, 1 = draw, 0 = away win.

    Decision rule, based on ``HTGD`` (presumably the half-time goal
    difference, home minus away; confirm against the dataset description):

    * ``HTGD > 1``  -> 2, without consulting the models;
    * ``HTGD < -1`` -> 0, without consulting the models;
    * otherwise     -> the outcome whose logistic-regression model
      (``lr_home_model.pkl``, ``lr_away_model.pkl``, ``lr_draw_model.pkl``)
      gives the highest positive-class probability. Ties resolve in the order
      home, away, draw.

    The lower bound is ``-1`` so that the models handle the whole
    ``{-1, 0, 1}`` band; a bound of ``1`` would send every row with
    ``HTGD`` of 0 or -1 straight to "away win" and the draw model could never
    be used for a level half-time score.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing an ``HTGD`` column. All columns are passed to
        the models positionally, so their order must match the order the
        models were fitted with.

    Returns
    -------
    list of int
        One label per row, in row order. Empty if ``X`` has no rows.

    Raises
    ------
    KeyError
        If ``X`` has no ``HTGD`` column.
    """
    # NOTE: joblib.load unpickles the file contents; only load model files you trust.
    lr_home_model = joblib.load('lr_home_model.pkl')
    lr_away_model = joblib.load('lr_away_model.pkl')
    lr_draw_model = joblib.load('lr_draw_model.pkl')

    htgd = X['HTGD']
    # Rows the rule does not decide on its own. NaN lands here too, because both
    # comparisons are False for it (same as falling through an if/elif chain).
    undecided = ~((htgd > 1) | (htgd < -1))

    model_picks = []
    if undecided.any():
        # One batched call per model instead of three calls per row. The features
        # go in as a plain array, i.e. positionally, without column names.
        close_games = X[undecided].to_numpy()
        home_probs = lr_home_model.predict_proba(close_games)[:, 1]
        away_probs = lr_away_model.predict_proba(close_games)[:, 1]
        draw_probs = lr_draw_model.predict_proba(close_games)[:, 1]

        for home_win_prob, away_win_prob, draw_prob in zip(home_probs, away_probs, draw_probs):
            max_prob = max(home_win_prob, away_win_prob, draw_prob)
            if max_prob == home_win_prob:
                model_picks.append(2)
            elif max_prob == away_win_prob:
                model_picks.append(0)
            else:
                model_picks.append(1)

    # Merge rule-based and model-based calls back into the original row order.
    remaining_picks = iter(model_picks)
    predictions = []
    for value in htgd:
        if value > 1:
            predictions.append(2)
        elif value < -1:
            predictions.append(0)
        else:
            predictions.append(next(remaining_picks))

    return predictions

In [None]:
from sklearn.metrics import accuracy_score

# Run the rule-plus-model predictor over the combined test set
y_pred = predict_outcome(X_test)

# Share of rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
def predict_outcome_weighted(X, weight_home=0.71, weight_away=0.74, weight_draw=0.00):
    """Predict one outcome label per row of ``X`` using weighted model scores.

    Labels: 2 = home win, 1 = draw, 0 = away win.

    Decision rule, based on ``HTGD`` (presumably the half-time goal
    difference, home minus away; confirm against the dataset description):

    * ``HTGD > 1``  -> 2, without consulting the models;
    * ``HTGD < -1`` -> 0, without consulting the models;
    * otherwise     -> the outcome with the highest *weighted* score, where a
      score is the positive-class probability of the matching model
      (``lr_home_model.pkl``, ``lr_away_model.pkl``, ``lr_draw_model.pkl``)
      multiplied by its weight. Ties resolve in the order home, away, draw.

    The lower bound is ``-1`` so that the models handle the whole
    ``{-1, 0, 1}`` band; a bound of ``1`` would send every row with
    ``HTGD`` of 0 or -1 straight to "away win".

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing an ``HTGD`` column. All columns are passed to
        the models positionally, so their order must match the order the
        models were fitted with.
    weight_home, weight_away, weight_draw : float, optional
        Multipliers applied to the home, away and draw probabilities. The
        defaults (0.71, 0.74, 0.00) are the values previously hard-coded in
        this function, described there as based on each model's precision.
        NOTE: the default draw weight of 0.0 zeroes the draw score, so with
        the defaults a draw is effectively never predicted.

    Returns
    -------
    list of int
        One label per row, in row order. Empty if ``X`` has no rows.

    Raises
    ------
    KeyError
        If ``X`` has no ``HTGD`` column.
    """
    # NOTE: joblib.load unpickles the file contents; only load model files you trust.
    lr_home_model = joblib.load('lr_home_model.pkl')
    lr_away_model = joblib.load('lr_away_model.pkl')
    lr_draw_model = joblib.load('lr_draw_model.pkl')

    htgd = X['HTGD']
    # Rows the rule does not decide on its own. NaN lands here too, because both
    # comparisons are False for it (same as falling through an if/elif chain).
    undecided = ~((htgd > 1) | (htgd < -1))

    model_picks = []
    if undecided.any():
        # One batched call per model instead of three calls per row. The features
        # go in as a plain array, i.e. positionally, without column names.
        close_games = X[undecided].to_numpy()
        home_scores = lr_home_model.predict_proba(close_games)[:, 1] * weight_home
        away_scores = lr_away_model.predict_proba(close_games)[:, 1] * weight_away
        draw_scores = lr_draw_model.predict_proba(close_games)[:, 1] * weight_draw

        for home_win_prob, away_win_prob, draw_prob in zip(home_scores, away_scores, draw_scores):
            max_prob = max(home_win_prob, away_win_prob, draw_prob)
            if max_prob == home_win_prob:
                model_picks.append(2)
            elif max_prob == away_win_prob:
                model_picks.append(0)
            else:
                model_picks.append(1)

    # Merge rule-based and model-based calls back into the original row order.
    remaining_picks = iter(model_picks)
    predictions = []
    for value in htgd:
        if value > 1:
            predictions.append(2)
        elif value < -1:
            predictions.append(0)
        else:
            predictions.append(next(remaining_picks))

    return predictions

In [None]:
# Same evaluation as for the unweighted predictor, using the weighted variant
y_pred_weighted = predict_outcome_weighted(X_test)

# Share of rows whose weighted prediction equals the true label
accuracy_weighted = accuracy_score(y_true=y_test, y_pred=y_pred_weighted)
accuracy_weighted