
# Multiple Linear Regression — World Cup Matches
This notebook performs a **multiple linear regression (MLR)** to model and predict **Total Goals** per match using several predictors from the World Cup matches dataset.

**Pipeline overview**
1. Load & inspect data  
2. Feature engineering (create `totalgoals`, `is_knockout`, clean `attendance`)  
3. Split into train/test  
4. Build an MLR pipeline with:
   - Numeric: `SimpleImputer(median)` → `StandardScaler` (scaling only: `with_mean=False`, so features are not centered)
   - Categorical: `SimpleImputer(most_frequent)` → `OneHotEncoder`
5. Fit & evaluate (R², MAE, RMSE, CV-R²)  
6. Diagnostics: actual vs predicted, residuals, coefficient inspection


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Location of the cleaned World Cup matches CSV.
# NOTE(review): this is a hard-coded absolute path — adjust it when running elsewhere.
csv_path = "/mnt/data/WorldCupMatches_cleaned.csv"
# Load the file into a DataFrame; later cells derive their working frame from df_raw.
df_raw = pd.read_csv(csv_path)
# Display the first 10 rows as the cell output.
df_raw.head(10)


In [None]:

# Structural overview of the raw frame: dimensions, column names, missing values.
print(f"Shape: {df_raw.shape}")
print(f"\nColumns: {list(df_raw.columns)}")
null_counts = df_raw.isna().sum()
print("\nNull counts:\n", null_counts)

# Summary statistics for every column (numeric and non-numeric), transposed so
# each original column becomes a row; only the first 20 rows are displayed.
raw_summary = df_raw.describe(include="all").T
raw_summary.head(20)


## Feature Engineering

In [None]:

def engineer_features_wc(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with lower-cased column names and model features added.

    Apart from ``totalgoals``, each step runs only when its source column exists:

    * ``totalgoals`` -- ``home_team_goals + away_team_goals``, both coerced to
      numeric; a value that cannot be parsed makes the sum NaN.
    * ``attendance`` -- commas removed, the first run of digits kept and
      converted to a number; entries without any digit become NaN.
    * ``year``, ``half-time_home_goals``, ``half-time_away_goals`` and
      ``goal_difference`` -- coerced to numeric (unparseable -> NaN).
    * ``is_knockout`` -- 0 when ``stage`` contains "group" (case-insensitive),
      1 otherwise. A missing ``stage`` yields NaN rather than 1, so unknown
      stages are not silently labelled as knockout matches. The column is
      integer-typed when no stage is missing and float-typed otherwise.

    Parameters
    ----------
    df : pd.DataFrame
        Raw match data. The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        The engineered copy.

    Raises
    ------
    ValueError
        If ``home_team_goals`` or ``away_team_goals`` is absent after the
        column names have been lower-cased.
    """
    d = df.copy()
    d.columns = [c.lower() for c in d.columns]

    # The target cannot be built without both goal columns, so fail early.
    if "home_team_goals" not in d.columns or "away_team_goals" not in d.columns:
        raise ValueError("Dataset must include 'home_team_goals' and 'away_team_goals'.")

    d["totalgoals"] = (
        pd.to_numeric(d["home_team_goals"], errors="coerce")
        + pd.to_numeric(d["away_team_goals"], errors="coerce")
    )

    if "attendance" in d.columns:
        # Work on the text form so values such as "45,000" parse correctly.
        d["attendance"] = (
            d["attendance"].astype(str)
            .str.replace(",", "", regex=False)
            .str.extract(r"(\d+)", expand=False)
        )
        d["attendance"] = pd.to_numeric(d["attendance"], errors="coerce")

    if "year" in d.columns:
        d["year"] = pd.to_numeric(d["year"], errors="coerce")

    if "stage" in d.columns:
        # Record missingness BEFORE the string cast: astype(str) can turn a
        # missing value into ordinary text, which would then count as
        # "not a group stage" and be flagged as a knockout match.
        stage_known = d["stage"].notna()
        is_group = d["stage"].astype(str).str.contains("group", case=False, na=False)
        is_knockout = (~is_group).astype(int)
        if not stage_known.all():
            # NaN needs a float column; rows with a known stage keep their 0/1 value.
            is_knockout = is_knockout.astype(float).where(stage_known)
        d["is_knockout"] = is_knockout

    for col in ["half-time_home_goals", "half-time_away_goals", "goal_difference"]:
        if col in d.columns:
            d[col] = pd.to_numeric(d[col], errors="coerce")

    return d

# Build the working frame from the raw data (engineer_features_wc copies its input,
# so df_raw itself is left unchanged).
df = engineer_features_wc(df_raw)
# Display the first 10 engineered rows as the cell output.
df.head(10)


## Select Predictors (Multiple Linear Regression)

In [None]:

# Column the regression predicts.
target = "totalgoals"

# Numeric predictors we would like to use; only those present in df are kept.
# NOTE(review): `goal_difference` and the half-time goal columns look like they are
# derived from the same match scores as the target — confirm this is not target leakage.
candidate_numeric = [
    "year", "attendance", "is_knockout",
    "half-time_home_goals", "half-time_away_goals", "goal_difference"
]
numeric_features = [c for c in candidate_numeric if c in df.columns]

# Categorical predictors, filtered the same way.
candidate_categorical = ["stage", "city", "stadium", "referee"]
categorical_features = [c for c in candidate_categorical if c in df.columns]

# Final predictor list: numeric columns first, then categorical ones.
feature_cols = numeric_features + categorical_features
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Drop rows whose target is missing. Note that this rebinds `df` to the filtered copy.
mask = df[target].notna()
df = df.loc[mask].copy()

# Design matrix and response used by the cells below.
X = df[feature_cols]
y = df[target]

print("X shape:", X.shape, "y length:", len(y))


## Train/Test Split & Pipeline

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns: fill gaps with the median, then scale to unit variance.
# with_mean=False means the features are scaled but not centered.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
# Categories unseen during fitting are encoded as all zeros instead of raising.
# The encoder returns a sparse matrix by default, so no `sparse=` keyword is passed:
# that keyword was renamed to `sparse_output` and later removed from scikit-learn.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own pipeline; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features),
    ],
    remainder="drop"
)

# Preprocessing and the linear model live in one pipeline, so the imputers, scaler
# and encoder are fitted on the training rows only.
mlr = Pipeline([
    ("preprocess", preprocess),
    ("regressor", LinearRegression())
])

mlr.fit(X_train, y_train)
y_pred = mlr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
# mean_squared_error has been removed from recent scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

# A plain dict literal (the doubled braces built a set containing a dict -> TypeError).
print({"MAE": mae, "RMSE": rmse, "R2": r2})


## Cross-Validated Performance

In [None]:

# Cross-validate the whole pipeline (preprocessing is refitted inside every fold).
# Skipped when fewer than 10 rows are available.
n_samples = len(X)
if n_samples < 10:
    print("Not enough samples for cross-validation.")
else:
    n_folds = min(5, n_samples)
    cv_scores = cross_val_score(mlr, X, y, cv=n_folds, scoring="r2")
    print("CV R² scores:", cv_scores)
    print("CV R² mean:", float(np.mean(cv_scores)))


## Diagnostics

In [None]:

# Actual vs Predicted: each point is one held-out match.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Total Goals")
ax.set_ylabel("Predicted Total Goals")
ax.set_title("MLR — Actual vs Predicted")
plt.show()


In [None]:

# Residuals vs Predicted: residual = actual - predicted on the held-out rows.
residuals = y_test - y_pred

fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(0)  # zero-error reference line
ax.set_xlabel("Predicted Total Goals")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.set_title("Residuals vs Predicted")
plt.show()


## Coefficients (Feature Effects)

In [None]:

# Extract feature names after preprocessing.
# The ColumnTransformer emits the numeric block first and the one-hot block second,
# so the names are assembled in that same order to line up with the coefficients.
ohe = None
if len(categorical_features):
    ohe = mlr.named_steps["preprocess"].named_transformers_["cat"].named_steps["ohe"]
num_names = numeric_features
cat_names = list(ohe.get_feature_names_out(categorical_features)) if ohe is not None else []
all_feature_names = num_names + cat_names

coef = mlr.named_steps["regressor"].coef_

# Fail with a clear message if names and coefficients do not line up
# (for example, if a preprocessing step dropped a column).
if len(all_feature_names) != len(coef):
    raise ValueError(
        f"Feature-name count ({len(all_feature_names)}) does not match "
        f"coefficient count ({len(coef)})."
    )

# A plain dict literal (the doubled braces built a set containing a dict -> TypeError).
coef_df = pd.DataFrame({"feature": all_feature_names, "coefficient": coef})

# Top positive
coef_df.sort_values("coefficient", ascending=False).head(15)


In [None]:

# Top negative: the 15 smallest coefficients, most negative first.
coef_df.sort_values(by="coefficient", ascending=True).iloc[:15]
