In [22]:
import pandas as pd
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import joblib

# Step 1: Load Data & Create Target Variable
data = pd.read_csv("Foosball.csv")
# Create a binary target: 1 if team1 wins, 0 if team2 wins.
data['winner'] = (data['team1Goals'] > data['team2Goals']).astype(int)

# Step 2: Engineer the Date Feature
# Convert gameDate to datetime and compute days since the first game
data['gameDate'] = pd.to_datetime(data['gameDate'])
min_date = data['gameDate'].min()
data['days_since'] = (data['gameDate'] - min_date).dt.days

# Define the features and target
features = ['player1Name', 'player2Name', 'player3Name', 'player4Name', 'days_since']
target = 'winner'

# Step 3: Define a Preprocessing Pipeline
player_columns = ['player1Name', 'player2Name', 'player3Name', 'player4Name']
date_columns = ['days_since']

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode player names; new/unseen players will be ignored during transformation.
        ('players', OneHotEncoder(handle_unknown='ignore'), player_columns),
        # Pass through the numerical date feature
        ('date', 'passthrough', date_columns)
    ]
)

# Step 4: Combine Preprocessing with a Classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Fit the model using the features and target variable
X = data[features]
y = data[target]
model.fit(X, y)

# Save the model to a file
joblib.dump(model, 'foosball_model_LR.pkl')

# For future predictions:
# Construct a DataFrame new_data with the same columns as features,
# then call model.predict_proba(new_data) to get winning probabilities.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['foosball_model_LR.pkl']

In [23]:
all_players_name = set(data["player1Name"].unique().tolist() + data["player2Name"].unique().tolist() + data["player3Name"].unique().tolist() + data["player4Name"].unique().tolist())
print(list(all_players_name))

['Bob', 'Lucas', 'Robin', 'Salma', 'Keon', 'Ruby', 'Jeffrey', 'Luuk', 'Hans', 'Lars', 'Cathleen', 'Roel', 'Ilhan', 'Sander', 'Jakko', 'Berkan', 'Amir', 'Ellen', 'Arian', 'Lara', 'Aart-Jan', 'Mohammad', 'Andres', 'Norent', 'Isabella', 'Jeroen']


In [8]:
df['winner'] = df.apply(lambda row: 'team1' if row['team1Goals'] > row['team2Goals'] else 'team2', axis=1)