In [1]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed drives both the train/test split and Keras (weight initialisation,
# batch shuffling) so that re-running this cell reproduces the printed metrics.
SEED = 42
keras.utils.set_random_seed(SEED)

# Read the dataset
matches = pd.read_csv("matches_6_years.csv", index_col=0)

# Convert date column to datetime
matches["date"] = pd.to_datetime(matches["date"])

# Create target variable: 1 for win, 0 otherwise
matches["target"] = (matches["result"] == "W").astype("int")

# Convert categorical variables to numerical codes
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Keep only the text before the first ":" of the time string and store it as an integer
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek

# Define predictors and target variable
predictors = ["venue_code", "opp_code", "hour", "day_code"]
target = "target"

# Split data into train and test sets with stratified sampling.
# `train` / `test` stay as unscaled DataFrames because later cells read them directly.
train, test = train_test_split(matches, test_size=0.2, stratify=matches[target], random_state=SEED)

# Standardise the inputs for the network. The scaler is fitted on the training
# rows only so no information about the test rows reaches the model.
nn_scaler = StandardScaler()
X_train_nn = nn_scaler.fit_transform(train[predictors])
X_test_nn = nn_scaler.transform(test[predictors])
y_train_nn = train[target].to_numpy()
y_test_nn = test[target].to_numpy()

# Create MLP model
model = Sequential()
model.add(Dense(64, input_dim=len(predictors), activation='relu'))  # Input layer with 64 neurons
model.add(Dense(32, activation='relu'))  # Hidden layer with 32 neurons
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 neuron (binary classification)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model (verbose=0 suppresses the per-epoch training output)
model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32, verbose=0)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_nn, y_test_nn, verbose=0)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Make predictions on the test set. The sigmoid output has shape (n, 1);
# threshold at 0.5 and flatten so the metrics receive a 1-D label vector.
predictions_prob = model.predict(X_test_nn, verbose=0)
predictions = (predictions_prob > 0.5).astype(int).ravel()

# Compute precision, recall, and F1-score
precision = precision_score(y_test_nn, predictions)
recall = recall_score(y_test_nn, predictions)
f1 = f1_score(y_test_nn, predictions)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


In [None]:
# Bucket the matches by team, then pull out a single team's rows in date
# order to serve as a sample group.
grouped_matches = matches.groupby(by="team")
group = grouped_matches.get_group("Manchester City")
group = group.sort_values(by="date")
def rolling_averages(group, cols, new_cols, window=3):
    """Add trailing rolling means of ``cols`` to one group of rows.

    The rows are sorted by their ``"date"`` column, and for every row the mean
    of the previous ``window`` rows is computed for each column in ``cols``.
    ``closed='left'`` excludes the current row from its own window, so each
    value is built only from earlier rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a ``"date"`` column and every name in ``cols``.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names for the resulting columns, positionally matched to ``cols``.
    window : int, default 3
        Number of preceding rows in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows where any
        of ``new_cols`` is NaN (including the first ``window`` rows, which lack
        enough history) are dropped. The input frame is not modified.
    """
    # sort_values returns a new frame, so the column assignment below never
    # touches the caller's data.
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

# Per-match columns to smooth, and the names their rolling means are stored under.
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling means team by team so a window never mixes rows from
# two different teams.
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

# Replace the index produced by groupby/apply with a plain 0..n-1 index.
# reset_index(drop=True) discards every index level in one step.
matches_rolling = matches_rolling.reset_index(drop=True)

In [None]:
def make_predictions(data, predictors, cutoff='2022-01-01', random_state=42):
    """Train an MLP on rows before ``cutoff`` and score it on rows from ``cutoff`` onward.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"date"`` column, a ``"target"`` column and every
        column named in ``predictors``.
    predictors : list of str
        Feature columns fed to the classifier.
    cutoff : str or datetime-like, default '2022-01-01'
        Rows with ``date < cutoff`` form the training set; rows with
        ``date >= cutoff`` form the test set.
    random_state : int or None, default 42
        Seed passed to ``MLPClassifier`` so repeated calls give the same result.

    Returns
    -------
    tuple
        ``(combined, accuracy, precision, recall, f1)`` where ``combined`` is a
        DataFrame of actual vs. predicted labels indexed like the test rows, and
        precision / recall / F1 are macro-averaged.

    Raises
    ------
    ValueError
        If either side of the split is empty.
    """
    train = data[data["date"] < cutoff]
    # ">=" rather than ">": with a strict comparison, rows dated exactly on the
    # cutoff were in neither the training nor the test set.
    test = data[data["date"] >= cutoff]
    if train.empty or test.empty:
        raise ValueError(
            f"Splitting at {cutoff!r} leaves {len(train)} training rows and {len(test)} test rows"
        )

    # Standardise the features. The scaler is fitted on the training rows only
    # so the test rows do not influence it.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train[predictors])
    test_features = scaler.transform(test[predictors])

    mlp = MLPClassifier(random_state=random_state)
    mlp.fit(train_features, train["target"])
    preds = mlp.predict(test_features)

    combined = pd.DataFrame({'actual': test["target"], 'predicted': preds}, index=test.index)
    accuracy = accuracy_score(test["target"], preds)
    precision = precision_score(test["target"], preds, average='macro')
    recall = recall_score(test["target"], preds, average='macro')
    f1 = f1_score(test["target"], preds, average='macro')
    return combined, accuracy, precision, recall, f1

# Requires `matches_rolling`, `predictors` and `new_cols` from the earlier cells.
# The model is scored on the base predictors plus the rolling-average columns.
combined, accuracy, precision, recall, f1 = make_predictions(
    matches_rolling,
    predictors + new_cols,
)

# Print one metric per line, in the order they were returned.
for label, score in [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
]:
    print(label, score)

In [None]:
# Feature scaling.
# `train` / `test` are the stratified random split created in the Keras cell,
# and only the base `predictors` are used here.
# NOTE(review): the scaler is fitted on the whole training set before
# cross-validation, so each validation fold has contributed to the scaling
# statistics; wrap scaler + classifier in a Pipeline if that matters.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[predictors])
test_scaled = scaler.transform(test[predictors])

# Define MLP classifier. A fixed seed makes the search (and the model it
# selects) reproducible across runs.
mlp = MLPClassifier(random_state=42)

# Define hyperparameters to tune (4 * 2 * 2 * 3 * 3 = 144 combinations)
parameters = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [200, 500, 1000]
}

# Hyperparameter tuning using GridSearchCV (5-fold CV, all available cores)
grid_search = GridSearchCV(mlp, parameters, cv=5, n_jobs=-1)
grid_search.fit(train_scaled, train["target"])

# Best parameters found
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set. Reuse that estimator rather than
# fitting a second model, so the model evaluated below is the one selected.
best_mlp = grid_search.best_estimator_

# Predictions on the test set
predictions = best_mlp.predict(test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(test["target"], predictions)
precision = precision_score(test["target"], predictions)
recall = recall_score(test["target"], predictions)
f1 = f1_score(test["target"], predictions)

# Print the evaluation metrics
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
