<a href="https://colab.research.google.com/github/aryashantanu405/life_time_expectancy/blob/main/life_time_expectancy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Cell 2: Import all required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('life_expectancy.csv')
print("Original columns:\n{}\n".format(df.columns.tolist()))

# Strip leading/trailing whitespace from every header so that later
# lookups by column name do not depend on stray spaces in the file.
df.columns = [name.strip() for name in df.columns]
print("Cleaned columns:\n{}\n".format(df.columns.tolist()))

print("Dataset shape:", df.shape)
df.head(5)

In [None]:
# Give the target an underscore name, then discard rows that have no target
# value (they cannot be used for supervised training or scoring).
df = (
    df.rename(columns={'Life expectancy': 'Life_expectancy'})
      .dropna(subset=['Life_expectancy'])
)

# Columns excluded from modelling; only those actually present are dropped,
# so re-running this cell does not raise.
cols_to_drop = ['Population', 'Hepatitis B', 'Total expenditure', 'GDP', 'Measles', 'under-five deaths']
df = df.drop(columns=[name for name in cols_to_drop if name in df.columns])

# Impute remaining gaps in the numeric columns with each column's median.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Replace the two text columns with integer codes.
# LabelEncoder numbers the labels in sorted order.
df['Country'] = LabelEncoder().fit_transform(df['Country'])
df['Status'] = LabelEncoder().fit_transform(df['Status'])

# Per-column count of values that are still missing.
print("Missing values after cleaning:")
print(df.isna().sum())

In [None]:
# Histogram of the target with a KDE curve overlaid
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Life_expectancy', bins=30, kde=True)
plt.title('Life Expectancy Distribution')
plt.show()

# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
plt.figure(figsize=(16, 10))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Feature Correlation Matrix')
plt.show()

In [13]:
# Predictors: every column except the target, Year and Country
X = df.drop(columns=['Life_expectancy', 'Year', 'Country'])
y = df['Life_expectancy']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Training set: {}, Test set: {}".format(X_train.shape, X_test.shape))

Training set: (2342, 13), Test set: (586, 13)


In [None]:
# Fit an ordinary least-squares baseline on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score it on the held-out split
y_pred = lr.predict(X_test)
print("Linear Regression:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

# Show coefficients
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr.coef_})
# Rank by absolute value: a plain descending sort would list only the most
# positive coefficients and leave out large negative ones.
# The features are not scaled before fitting, so each coefficient is per unit
# of its own feature and magnitudes are only a rough guide to importance.
print("\nTop 5 impactful features:")
print(coef_df.loc[coef_df['Coefficient'].abs().nlargest(5).index])

In [None]:
# Cell 8: Final Evaluation Metrics
# (mean_absolute_error is imported with the other sklearn.metrics names in
# the notebook's import cell.)

# Depth-limited decision tree; random_state is fixed so the fit is repeatable
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("=== Decision Tree ===")
print(f"MAE: {mean_absolute_error(y_test, y_pred_dt):.2f} years")
print(f"MSE: {mean_squared_error(y_test, y_pred_dt):.2f}")
print(f"R²: {r2_score(y_test, y_pred_dt):.2f}")

# Compare with Linear Regression on the same test split, reporting the same
# three metrics so the two models can be read side by side
y_pred_lr = lr.predict(X_test)
print("\n=== Linear Regression ===")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lr):.2f} years")
print(f"MSE: {mean_squared_error(y_test, y_pred_lr):.2f}")
print(f"R²: {r2_score(y_test, y_pred_lr):.2f}")

In [18]:
# Cell 9: Input with Value Ranges and Validation
def predict_life_expectancy(model):
    """Prompt for each feature on stdin and print the model's prediction.

    Every feature is asked for in turn. An empty (or whitespace-only) answer
    selects the default shown in brackets; any other answer must parse as a
    number inside the feature's allowed range, otherwise the question is
    repeated.

    Parameters
    ----------
    model : fitted estimator
        Object with a ``predict`` method that accepts a one-row DataFrame.
        If it exposes ``feature_names_in_``, that column order is used;
        otherwise the notebook-level ``X_train.columns`` is used.

    Returns
    -------
    None
        The prediction and the surrounding range are printed, not returned.

    Raises
    ------
    ValueError
        If the model expects a feature for which no prompt is defined here.
    """
    # Feature info dictionary (name: [default, min, max, description])
    # Status codes follow LabelEncoder's sorted-label numbering, i.e.
    # 'Developed' -> 0 and 'Developing' -> 1 (assuming those are the two
    # labels in the CSV -- confirm against the fitted encoder's classes_).
    feature_info = {
        'Status': [1, 0, 1, "0=Developed, 1=Developing"],
        'Adult Mortality': [200, 10, 500, "Deaths per 1000 adults"],
        'infant deaths': [10, 0, 1000, "Infant deaths per 1000"],
        'Alcohol': [5.0, 0, 20, "Alcohol consumption (liters/capita)"],
        'percentage expenditure': [500, 0, 10000, "Healthcare expenditure %"],
        'BMI': [20.0, 10, 40, "Average Body Mass Index"],
        'Polio': [80, 0, 100, "Polio immunization coverage %"],
        'Diphtheria': [80, 0, 100, "Diphtheria immunization coverage %"],
        'HIV/AIDS': [1.0, 0, 30, "HIV/AIDS prevalence %"],
        'thinness  1-19 years': [5.0, 0, 30, "Thinness prevalence age 10-19"],
        'thinness 5-9 years': [5.0, 0, 30, "Thinness prevalence age 5-9"],
        'Income composition of resources': [0.5, 0, 1, "Income index (0-1)"],
        'Schooling': [10, 0, 20, "Average years of schooling"]
    }
    # Encoded categories: only the exact codes are meaningful, not values
    # in between.
    binary_features = {'Status'}

    # Prefer the column order the model recorded when it was fitted, so the
    # function does not depend on a notebook global more than necessary.
    columns = getattr(model, "feature_names_in_", None)
    if columns is None:
        columns = X_train.columns
    columns = list(columns)

    # Fail before prompting: a column with no prompt would otherwise be
    # filled with NaN when the one-row frame is built.
    missing = [name for name in columns if name not in feature_info]
    if missing:
        raise ValueError(f"Model expects features with no prompt defined: {missing}")

    print("Enter features (press Enter for defaults):")
    features = {}
    for feature, (default, min_val, max_val, desc) in feature_info.items():
        while True:
            raw = input(f"{feature} ({desc}) [{default}]: ").strip()
            try:
                val = float(raw) if raw else float(default)
            except ValueError:
                print("Please enter a number")
                continue
            if not (min_val <= val <= max_val):
                print(f"Must be between {min_val}-{max_val}")
                continue
            if feature in binary_features and val not in (0.0, 1.0):
                print("Must be 0 or 1")
                continue
            features[feature] = val
            break

    # Create properly ordered DataFrame
    input_data = pd.DataFrame([features], columns=columns)

    # Predict
    prediction = model.predict(input_data)[0]
    print(f"\nPredicted Life Expectancy: {prediction:.1f} years")
    # NOTE(review): the +/-5.1 half-width and the "95% confidence" label are
    # hard-coded; nothing in this cell derives them from the model's errors.
    print(f"Range: {prediction-5.1:.1f} to {prediction+5.1:.1f} (95% confidence)")

# Interactive demo: prompts on stdin for every feature, then prints the
# decision tree's prediction for the entered values.
predict_life_expectancy(dt_model)

Enter features (press Enter for defaults):
Status (0=Developing, 1=Developed) [1]: 1
Adult Mortality (Deaths per 1000 adults) [200]: 300
infant deaths (Infant deaths per 1000) [10]: 230
Alcohol (Alcohol consumption (liters/capita)) [5.0]: 12
percentage expenditure (Healthcare expenditure %) [500]: 23
BMI (Average Body Mass Index) [20.0]: 12
Polio (Polio immunization coverage %) [80]: 78
Diphtheria (Diphtheria immunization coverage %) [80]: 79
HIV/AIDS (HIV/AIDS prevalence %) [1.0]: 0.2
thinness  1-19 years (Thinness prevalence age 10-19) [5.0]: 8
thinness 5-9 years (Thinness prevalence age 5-9) [5.0]: 5
Income composition of resources (Income index (0-1)) [0.5]: .4
Schooling (Average years of schooling) [10]: 9

Predicted Life Expectancy: 59.9 years
Range: 54.8 to 65.0 (95% confidence)
