# World Cup Matches — Linear Regression (Predicting Total Goals)

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the cleaned World Cup match table and preview its first rows.
csv_path = "/mnt/data/world_cup_lr_project/data/WorldCupMatches_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()


In [None]:

def engineer_features_wc(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* with lower-cased columns and derived features.

    Derived / cleaned columns:

    * ``totalgoals`` and ``goaldiff`` -- sum and difference of the home and
      away goal columns after coercing both to numbers (unparseable -> NaN).
    * ``attendance`` (if present) -- commas removed, first run of digits
      kept, then converted to a number.
    * ``year`` (if present) -- coerced to a number.
    * ``is_knockout`` (only if ``stage`` is present) -- 1 when the stage text
      does not contain "group" (case-insensitive), otherwise 0.

    Args:
        data: Match table. Column names are matched after lower-casing.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: If ``home_team_goals`` or ``away_team_goals`` is missing.
    """
    frame = data.copy()
    frame.columns = [name.lower() for name in frame.columns]

    # Fail fast: both goal columns are required to build the target.
    required = ("home_team_goals", "away_team_goals")
    if not all(col in frame.columns for col in required):
        raise ValueError("Expected 'home_team_goals' and 'away_team_goals'.")

    home_goals = pd.to_numeric(frame["home_team_goals"], errors="coerce")
    away_goals = pd.to_numeric(frame["away_team_goals"], errors="coerce")
    frame["totalgoals"] = home_goals + away_goals
    frame["goaldiff"] = home_goals - away_goals

    if "attendance" in frame.columns:
        # Work on the text form so values such as "45,000" parse cleanly.
        as_text = frame["attendance"].astype(str)
        without_commas = as_text.str.replace(",", "", regex=False)
        first_digits = without_commas.str.extract(r"(\d+)", expand=False)
        frame["attendance"] = pd.to_numeric(first_digits, errors="coerce")

    if "year" in frame.columns:
        frame["year"] = pd.to_numeric(frame["year"], errors="coerce")

    if "stage" in frame.columns:
        mentions_group = frame["stage"].astype(str).str.contains(
            "group", case=False, na=False
        )
        frame["is_knockout"] = (~mentions_group).astype(int)

    return frame

# Build the feature-engineered table from the raw matches and preview it.
df_eng = engineer_features_wc(data=df)
df_eng.head()


In [None]:

# Train and evaluate a linear-regression pipeline that predicts total goals.
target = "totalgoals"

# Categorical predictors, limited to the ones the dataset actually provides.
candidate_cats = [c for c in ["stage", "city", "stadium"] if c in df_eng.columns]

# Numeric predictors: every numeric column except the target and anything
# that leaks it. totalgoals is built as home_team_goals + away_team_goals
# (and goaldiff as their difference), so keeping any goal-count column as a
# feature lets the linear model reconstruct the target exactly and makes the
# reported metrics meaningless.
# NOTE(review): every numeric column with "goal" in its name is treated as a
# match outcome and dropped (this would also drop half-time goal columns if
# the CSV has them) -- confirm against the dataset schema.
# NOTE(review): identifier-like numeric columns, if the CSV has any, are
# still used as features -- consider excluding them explicitly.
numeric_cols = [
    c
    for c in df_eng.select_dtypes(include=[np.number]).columns
    if c != target and "goal" not in c
]

feature_cols_num = numeric_cols
feature_cols_cat = candidate_cats

# Keep only rows with a known target; the features are imputed in the pipeline.
X = df_eng[feature_cols_num + feature_cols_cat]
y = df_eng[target]
mask = y.notna()
X, y = X.loc[mask], y.loc[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X was built from exactly these columns, so no further filtering is needed.
numeric_features = list(feature_cols_num)
categorical_features = list(feature_cols_cat)

numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    # with_mean=False keeps the scaler usable on the sparse matrix that results
    # once the one-hot block is stacked next to the numeric block.
    ("scaler", StandardScaler(with_mean=False)),
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Sparse output is already the encoder's default. The explicit `sparse=`
    # keyword was renamed to `sparse_output` and later removed, so omitting it
    # works on both old and new scikit-learn releases.
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features),
    ],
    remainder="drop",
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = float(mean_absolute_error(y_test, y_pred))
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = float(r2_score(y_test, y_pred))
print({"MAE": mae, "RMSE": rmse, "R2": r2})


In [None]:

# Cross-validated R^2 of the full pipeline; skipped when fewer than 10 rows
# are available.
n_rows = len(X)
if n_rows >= 10:
    fold_count = min(5, n_rows)  # never more folds than rows, capped at five
    cv_scores = cross_val_score(model, X, y, scoring="r2", cv=fold_count)
    print("CV R^2 mean:", cv_scores.mean())


In [None]:

# Scatter of actual vs. predicted total goals for the test split.
plt.figure()
plt.title("Linear Regression: Actual vs Predicted TotalGoals")
plt.xlabel("Actual TotalGoals")
plt.ylabel("Predicted TotalGoals")
plt.scatter(y_test, y_pred)
plt.show()
