In [None]:
# STEP 1: Install + Import Libraries
!pip install seaborn scikit-learn matplotlib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# STEP 2: Upload Separate Datasets
from google.colab import files

def _upload_csv(prompt, value_name):
    """Prompt for a CSV upload and load the first uploaded file.

    Args:
        prompt: Message printed before the upload dialog is opened.
        value_name: Name given to the CSV's ``value`` column, so that each
            dataset contributes a distinctly named measurement column.

    Returns:
        A ``(uploaded, frame)`` tuple: the mapping returned by
        ``files.upload()`` and the DataFrame read from its first entry.

    Raises:
        ValueError: If the upload mapping is empty (nothing was uploaded).
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        # Fail with a clear message instead of an IndexError on the first key.
        raise ValueError(f"No file was uploaded for {value_name}.")
    # Only the first uploaded file is used; its key is passed to read_csv as a path.
    first_name = next(iter(uploaded))
    frame = pd.read_csv(first_name)
    return uploaded, frame.rename(columns={'value': value_name})


uploaded_air, air = _upload_csv("Upload Air Quality CSV", 'AirQuality')

uploaded_water, water = _upload_csv("Upload Water Quality CSV", 'WaterQuality')

uploaded_pop, pop = _upload_csv("Upload Population CSV", 'PopulationGrowth')

uploaded_gdp, gdp = _upload_csv("Upload GDP CSV", 'GDP')

uploaded_health, health = _upload_csv("Upload Life Expectancy CSV", 'LifeExpectancy')

# STEP 3: Merge Datasets on Region & Year
# Inner joins keep only the (region, year) pairs present in every dataset.
merge_keys = ["region", "year"]
df = air
for other in (water, pop, gdp, health):
    df = df.merge(other, on=merge_keys, how="inner")

print("Merged Dataset Shape:", df.shape)
print(df.head())

# STEP 4: Check for Missing Data
print("\nMissing Values:\n", df.isnull().sum())

# Optional: replace NaNs in every numeric column with that column's mean.
# NOTE(review): this applies to all numeric columns of the merged frame, which
# would include the LifeExpectancy target (and `year`, if numeric) - confirm
# that imputing those is intended.
numeric_part = df.select_dtypes(include=np.number)
df[numeric_part.columns] = numeric_part.fillna(numeric_part.mean())

# STEP 5: Correlation Matrix
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# STEP 6: Define Features (X) and Targets (y)
targets = ["LifeExpectancy"]
features = ["AirQuality", "WaterQuality", "PopulationGrowth", "GDP"]

# Selecting with lists keeps both X and y as DataFrames (y has one column).
X, y = df[features], df[targets]

# STEP 7: Split Data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# STEP 8: Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# The model was fitted on a one-column target frame, so index 0 selects the
# coefficients and intercept belonging to that single target.
lr_weights = lr.coef_[0]
lr_bias = lr.intercept_[0]

print("\n--- Linear Regression Results ---")
print("Coefficients:")
for name, weight in zip(features, lr_weights):
    print(f"{name}: {weight:.4f}")
print("Intercept:", lr_bias)
print("R2 Score:", r2_score(y_test, y_pred_lr))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

# STEP 8.1: Linear Regression Expression
# Render the fitted model as "target = intercept +/- |coef| * feature ...".
print("\nLinear Regression Expression:")
terms = [
    f" {'+' if weight >= 0 else '-'} {abs(weight):.4f} * {name}"
    for name, weight in zip(features, lr_weights)
]
expression = f"LifeExpectancy = {lr_bias:.4f}" + "".join(terms)
print(expression)

# STEP 9: Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# The target frame is flattened to a 1-D array before fitting.
rf.fit(X_train, y_train.values.ravel())
y_pred_rf = rf.predict(X_test)

print("\n--- Random Forest Results ---")
print("R2 Score:", r2_score(y_test, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))

# STEP 9.1: Random Forest Approximate Expression
# NOTE(review): feature importances are relative weights, not regression
# coefficients, and no feature scaling is performed in the visible code, so the
# printed formula is illustrative only - confirm this output is still wanted.
print("\nRandom Forest Approximate Expression (based on feature importances):")
# Use mean LifeExpectancy as baseline. y.mean() is a Series labelled by column
# name, so pick its single entry by position with .iloc; a bare integer key
# ([0]) on a label-indexed Series is deprecated positional access.
baseline = y.mean().iloc[0]
expression_rf = f"LifeExpectancy ≈ {baseline:.4f}"
for f, imp in zip(features, rf.feature_importances_):
    sign = '+' if imp >= 0 else '-'
    expression_rf += f" {sign} {imp:.4f} * scaled({f})"
print(expression_rf)
print("Note: 'scaled' refers to z-score normalized features (mean=0, std=1). Random Forest is non-linear, so this is an approximation.")

# STEP 10: Feature Importance (Random Forest)
# Horizontal bar chart of the forest's importances, smallest value first.
importances = pd.Series(rf.feature_importances_, index=features)
ranked = importances.sort_values()
ranked.plot.barh(figsize=(6, 4), title="Feature Importance")
plt.show()

# STEP 11: Optional - Pairplot for visual relationships
# Scatter-matrix over the model inputs together with the target column.
pairplot_columns = [*features, *targets]
sns.pairplot(df[pairplot_columns])
plt.show()