In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load historical match data from CSV file
match_data = pd.read_csv('matches.csv')

# Preprocess date attribute
match_data['Datetime'] = pd.to_datetime(match_data['Datetime'].str.split(' - ', expand=True)[0])

# Sort match data by date
match_data = match_data.sort_values(by='Datetime', ascending=False)

# Function to calculate average scores of the last 5 matches for a team
def get_last_5_avg_scores(team, matches):
    team_matches = matches[(matches['home_team'] == team) | (matches['away_team'] == team)]
    last_5_matches = team_matches.head(5)
    avg_home_score = last_5_matches[last_5_matches['home_team'] == team]['Home Team Goals'].mean()
    avg_away_score = last_5_matches[last_5_matches['away_team'] == team]['Away Team Goals'].mean()
    return avg_home_score, avg_away_score

# Function to encode team names
def encode_team(team):
    if team in team_encoder:
        return team_encoder[team]
    else:
        return -1

# Encode team names
team_encoder = {team: i for i, team in enumerate(match_data['home_team'].unique())}
match_data['home_team_encoded'] = match_data['home_team'].apply(encode_team)
match_data['away_team_encoded'] = match_data['away_team'].apply(encode_team)

# Calculate average scores for each team
match_data['home_avg_score'], match_data['away_avg_score'] = zip(*match_data.apply(
    lambda x: get_last_5_avg_scores(x['home_team'], match_data), axis=1))

# Extract relevant features and target variable
X = match_data[['home_avg_score', 'away_avg_score']]
y = match_data['outcome']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train a Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_imputed, y_train)

# Function to predict the outcome of a match using Logistic Regression
def predict_match_logreg(home_team, away_team):
    home_team_encoded = encode_team(home_team)
    away_team_encoded = encode_team(away_team)
    if home_team_encoded == -1 or away_team_encoded == -1:
        return "Invalid team name(s)"
    home_avg_score = match_data[match_data['home_team'] == home_team]['home_avg_score'].iloc[0]
    away_avg_score = match_data[match_data['away_team'] == away_team]['away_avg_score'].iloc[0]
    outcome = logreg_model.predict([[home_avg_score, away_avg_score]])[0]
    if outcome == 1:
        return f"{home_team} wins"
    elif outcome == 0:
        return "It's a draw"
    else:
        return f"{away_team} wins"

# User input for home and away teams
home_team = input("Enter the home team: ")
away_team = input("Enter the away team: ")

# Prediction for the match
prediction = predict_match_logreg(home_team, away_team)
print(f"Prediction for the match between {home_team} and {away_team}: {prediction}")

# Calculate and print accuracy
y_pred = logreg_model.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Enter the home team:  Peru
Enter the away team:  England


Prediction for the match between Peru and England: England wins
Accuracy: 0.47368421052631576
