In [None]:
# Modeling Prediction
#To evaluate model performance, I compare four regressors—Linear Regression, Decision Tree, K-Nearest Neighbors, and a Neural Network—using different combinations of four selected predictors identified in EDA. I also include a baseline model that predicts the mean life expectancy. Models are evaluated using R² and MSE.


In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


In [23]:
# Load and Prepare Data

df = pd.read_csv("/content/Life Expectancy Data.csv")

# Filter for 2014 and rename columns
df = df[df["Year"] == 2014].copy()
df = df.rename(columns={
    " thinness  1-19 years": "Thinness (1-19)",
    " HIV/AIDS": "HIV/AIDS",
    "Life expectancy ": "LifeExpectancy"
})

# Drop rows with missing values in selected columns
df = df.dropna(subset=["Schooling", "Income composition of resources", "Thinness (1-19)", "HIV/AIDS", "LifeExpectancy"])

# Create an engineered feature
df["Score_Edu_HIV"] = df["Schooling"] / df["HIV/AIDS"]


In [24]:
# Baseline Model

# As a baseline, I predict the mean life expectancy for all test observations and compute the MSE.

# Define target
y = df["LifeExpectancy"]

# Predict mean
baseline_preds = np.full(len(y), y.mean())
baseline_mse = mean_squared_error(y, baseline_preds)
print(f"Baseline MSE: {baseline_mse:.2f}")



Baseline MSE: 69.71


In [None]:
### Define Features and Models


In [25]:
# Define predictors
predictor_list = ["Schooling", "Income composition of resources", "Thinness (1-19)", "Score_Edu_HIV"]

# Define models
regression_models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),  # To tune later
    "Neural Network": MLPRegressor(max_iter=1000, random_state=42)
}


In [27]:
# Model Evaluation Across Feature Combinations

# Store evaluation results
combo_model_results = []

# Try every combination of 1 to 4 features
for r in [1, 2, 3, 4]:
    for combo in combinations(predictor_list, r):
        X_subset = df[list(combo)]
        X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, random_state=42)

        for model_name, model in regression_models.items():
            pipe = Pipeline([
                ("scaler", StandardScaler()),
                ("model", model)
            ])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            combo_model_results.append({
                "Model": model_name,
                "Features": combo,
                "R2 Score": r2,
                "MSE": mse
            })




In [28]:
# Top Performing Combinations

# Create DataFrame of results
results_df = pd.DataFrame(combo_model_results).sort_values(by="R2 Score", ascending=False).reset_index(drop=True)

# Display top 10
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)

print("Top performing model/feature combos (before tuning):\n")
print(results_df.head(10))


Top performing model/feature combos (before tuning):

               Model  \
0                KNN   
1                KNN   
2                KNN   
3      Decision Tree   
4                KNN   
5  Linear Regression   
6  Linear Regression   
7      Decision Tree   
8      Decision Tree   
9  Linear Regression   

                                                                       Features  \
0                              (Income composition of resources, Score_Edu_HIV)   
1                   (Schooling, Income composition of resources, Score_Edu_HIV)   
2  (Schooling, Income composition of resources, Thinness (1-19), Score_Edu_HIV)   
3                   (Schooling, Income composition of resources, Score_Edu_HIV)   
4             (Income composition of resources, Thinness (1-19), Score_Edu_HIV)   
5                              (Income composition of resources, Score_Edu_HIV)   
6                   (Schooling, Income composition of resources, Score_Edu_HIV)   
7  (Schooling, In

In [32]:
# Extract top model info
top_combo = results_df.loc[0]
best_features = list(top_combo["Features"])
print(f"\nSelected for GridSearchCV: {top_combo['Model']} with features {best_features}")


Selected for GridSearchCV: KNN with features ['Income composition of resources', 'Score_Edu_HIV']


In [35]:
# Since KNN is the best model here, we need to find out best k and adjust further
X_best = df[best_features]
X_train, X_test, y_train, y_test = train_test_split(X_best, y, test_size=0.2, random_state=42)

knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor())
    ])
param_grid = {"model__n_neighbors": list(range(1, 21))}
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)
best_k = grid_search.best_params_["model__n_neighbors"]
best_cv_r2 = grid_search.best_score_

# Evaluate on test set using best estimator
best_knn_model = grid_search.best_estimator_
test_preds = best_knn_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds)

print(f"\nGridSearchCV Result for KNN:")
print(f"Best k: {best_k}")
print(f"Cross-validated R²: {best_cv_r2:.4f}")
print(f"Test Set MSE: {test_mse:.2f}")



GridSearchCV Result for KNN:
Best k: 4
Cross-validated R²: 0.8320
Test Set MSE: 13.88


In [None]:
### Interpretation

# Linear models generally performed well, but models like KNN and decision trees captured non-linear interactions. The best-performing configuration used a KNN model with 2 features, outperforming the baseline significantly. GridSearchCV revealed the optimal number of neighbors for the KNN model, reinforcing the model's ability to adapt to local data structure.
