
# Logistic Regression — World Cup Matches
This notebook trains a **logistic regression** model to predict whether the **home team wins** a match using multiple features from the dataset.

**Workflow**
1. Load & inspect data  
2. Feature engineering (binary target `home_win`, `is_knockout`, cleaned `attendance`)  
3. Train/test split  
4. Pipeline: Imputation + Scaling + One-Hot + **LogisticRegression**  
5. Evaluation: Accuracy, Precision, Recall, F1, ROC-AUC, PR-AUC, Confusion Matrix, plus cross-validated accuracy  
6. Coefficients & Odds Ratios (feature effects)


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, average_precision_score, precision_recall_curve,
    confusion_matrix, ConfusionMatrixDisplay
)

# Load data
# NOTE(review): absolute, environment-specific path -- adjust it (or move it to a
# config cell) when running the notebook outside the original environment.
csv_path = "/mnt/data/WorldCupMatches_cleaned.csv"
# Keep the untouched frame under `df_raw`; later cells derive new frames from it.
df_raw = pd.read_csv(csv_path)
# Preview the first 10 rows as the cell output.
df_raw.head(10)


In [None]:

# Structure overview: dimensions, column names and per-column missing counts
print("Shape:", df_raw.shape)
print("\nColumns:", df_raw.columns.tolist())
print("\nNull counts:\n", df_raw.isnull().sum())
# Summary statistics for every dtype, one row per column (first 20 columns shown)
df_raw.describe(include="all").transpose().iloc[:20]


## Feature Engineering

In [None]:

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the modelling frame from the raw match table.

    Works on a copy of ``df`` (the input is not modified), lower-cases every
    column name, and then adds or cleans the columns below.

    * ``home_win`` -- target. 1 when the home side scored more goals than the
      away side, 0 otherwise (draws and away wins). When either score is
      missing or non-numeric the target is left as NaN instead of being
      silently labelled 0, so callers can drop those rows with ``notna()``.
      The column is integer-typed when every score is known and float-typed
      (to hold NaN) otherwise.
    * ``attendance`` -- if present: thousands separators removed, the first run
      of digits extracted, converted to numeric (NaN when no digits are found).
    * ``year`` -- if present: coerced to numeric.
    * ``is_knockout`` -- if ``stage`` is present: 1 when the stage text does not
      contain "group" (case-insensitive), else 0. A missing stage therefore
      also yields 1.
    * ``half-time_home_goals``, ``half-time_away_goals``, ``goal_difference``
      -- if present: coerced to numeric.

    Parameters
    ----------
    df : pd.DataFrame
        Raw match table. Column-name case is ignored.

    Returns
    -------
    pd.DataFrame
        A new frame with the engineered columns.

    Raises
    ------
    ValueError
        If ``home_team_goals`` or ``away_team_goals`` is absent.
    """
    d = df.copy()
    d.columns = [c.lower() for c in d.columns]

    if not {"home_team_goals", "away_team_goals"}.issubset(d.columns):
        raise ValueError("Dataset must include 'home_team_goals' and 'away_team_goals'.")

    # Target: home team win (1) vs not win (0)
    home_goals = pd.to_numeric(d["home_team_goals"], errors="coerce")
    away_goals = pd.to_numeric(d["away_team_goals"], errors="coerce")
    score_known = home_goals.notna() & away_goals.notna()
    home_win = (home_goals > away_goals).astype(int)
    if not score_known.all():
        # Any comparison with NaN is False, which would turn an unknown result
        # into a "not win" label; mask those rows so the target stays missing.
        home_win = home_win.where(score_known)
    d["home_win"] = home_win

    # Attendance cleanup
    if "attendance" in d.columns:
        d["attendance"] = (
            d["attendance"].astype(str)
            .str.replace(",", "", regex=False)
            .str.extract(r"(\d+)", expand=False)
        )
        d["attendance"] = pd.to_numeric(d["attendance"], errors="coerce")

    # Year numeric
    if "year" in d.columns:
        d["year"] = pd.to_numeric(d["year"], errors="coerce")

    # Knockout flag derived from stage: anything not labelled as a group stage
    if "stage" in d.columns:
        d["is_knockout"] = ~d["stage"].astype(str).str.contains("group", case=False, na=False)
        d["is_knockout"] = d["is_knockout"].astype(int)

    # Ensure optional numeric columns are numeric if present
    for col in ["half-time_home_goals", "half-time_away_goals", "goal_difference"]:
        if col in d.columns:
            d[col] = pd.to_numeric(d[col], errors="coerce")

    return d

# Build the modelling frame; `df_raw` itself is left untouched because
# engineer_features operates on a copy.
df = engineer_features(df_raw)
# Preview the first 10 engineered rows as the cell output.
df.head(10)


## Select Predictors

In [None]:

target = "home_win"

# "goal_difference" is deliberately NOT a candidate predictor.
# NOTE(review): it is presumably computed from the final score, which is the
# same information the target is derived from -- using it would leak the
# label into the features and make every metric meaninglessly optimistic.
# Confirm against the data dictionary before re-adding it.
# NOTE(review): the half-time goal columns are only known once a match is in
# progress, so with them included this is an in-play model rather than a
# pre-match one.
candidate_numeric = [
    "year", "attendance", "is_knockout",
    "half-time_home_goals", "half-time_away_goals",
]
numeric_features = [c for c in candidate_numeric if c in df.columns]

candidate_categorical = ["stage", "city", "stadium", "referee"]
categorical_features = [c for c in candidate_categorical if c in df.columns]

feature_cols = numeric_features + categorical_features
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Drop rows whose target is missing (a no-op when the target has no NaN)
mask = df[target].notna()
df = df.loc[mask].copy()

X = df[feature_cols]
# After the filter every label is 0 or 1; store them as plain integers so the
# classifier's classes are 0/1 regardless of the column's original dtype.
y = df[target].astype(int)

print("X shape:", X.shape, "y length:", len(y), "| Positive rate:", float(y.mean()))


## Train/Test Split & Pipeline

In [None]:

# Hold out 20% of the rows for testing; stratifying on y keeps the share of
# home wins the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Numeric columns: fill gaps with the median, then scale to unit variance.
# with_mean=False skips centring -- presumably so the scaled block can be
# combined with the sparse one-hot output below.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode. Categories unseen during fit are encoded as all zeros instead of
# raising. The output format argument is left at its default (sparse): the
# old `sparse=` keyword no longer exists in recent scikit-learn releases.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own pipeline; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features),
    ],
    remainder="drop"
)

# Preprocessing and classifier live in one pipeline so the imputers, scaler and
# encoder are fitted on the training rows only.
logreg = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000, solver="lbfgs"))
])

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (home win).
y_proba = logreg.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the hard predictions; the two AUC-style metrics
# use the predicted probabilities.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

# A single pair of braces builds the dict; doubled braces would try to put the
# dict inside a set and fail with "unhashable type: 'dict'".
print({
    "accuracy": float(acc),
    "precision": float(prec),
    "recall": float(rec),
    "f1": float(f1),
    "roc_auc": float(roc_auc),
    "pr_auc": float(pr_auc),
})


## Cross-Validation

In [None]:

# Use 5 folds, or fewer when the data set has fewer than 5 rows.
n_folds = min(5, len(y))
# The whole pipeline is re-fitted inside every fold, preprocessing included.
cv_scores = cross_val_score(logreg, X, y, cv=n_folds, scoring="accuracy")
print("CV accuracy:", cv_scores, "mean =", float(cv_scores.mean()))


## Diagnostics

In [None]:

# Confusion matrix of the held-out predictions (rows: true label, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
# plot() draws the matrix on a new figure and returns the display object itself.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
disp.ax_.set_title("Home Win — Confusion Matrix")
plt.show()


In [None]:

# ROC curve of the held-out probabilities, drawn with the explicit Axes interface
fpr, tpr, _ = roc_curve(y_test, y_proba)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed diagonal: the curve a random classifier would produce.
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
plt.show()


In [None]:

# Precision-Recall curve of the held-out probabilities (recall on x, precision on y)
precisions, recalls, _ = precision_recall_curve(y_test, y_proba)
fig, ax = plt.subplots()
ax.plot(recalls, precisions)
ax.set(
    xlabel="Recall",
    ylabel="Precision",
    title="Precision-Recall Curve",
)
plt.show()


## Coefficients & Odds Ratios

In [None]:

# Get feature names post-encoding.
# The ColumnTransformer emits the numeric block first and the one-hot block
# second, so the names are concatenated in that same order.
ohe = None
if len(categorical_features):
    ohe = logreg.named_steps["preprocess"].named_transformers_["cat"].named_steps["ohe"]
num_names = list(numeric_features)
cat_names = list(ohe.get_feature_names_out(categorical_features)) if ohe is not None else []
all_feature_names = num_names + cat_names

# coef_ has one row per decision function; a binary problem has exactly one.
coefs = logreg.named_steps["clf"].coef_[0]
# Fail with a readable message if names and coefficients are out of step.
assert len(all_feature_names) == len(coefs), (
    f"{len(all_feature_names)} feature names but {len(coefs)} coefficients"
)

# A single pair of braces builds the dict; doubled braces would try to put the
# dict inside a set and fail with "unhashable type: 'dict'".
coef_df = pd.DataFrame({"feature": all_feature_names, "coef": coefs})
# exp(coef) is the multiplicative change in the odds of a home win per unit of
# the transformed feature (numeric features have been scaled by their standard
# deviation, one-hot features are 0/1 indicators).
coef_df["odds_ratio"] = np.exp(coef_df["coef"])

# Top positive and negative effects
coef_df.sort_values("coef", ascending=False).head(15)


In [None]:

# The 15 most negative coefficients, i.e. the features that lower the
# predicted odds of a home win the most.
coef_df.sort_values("coef", ascending=True).head(15)
