In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Read the raw CSV into a DataFrame. `file_path` is a placeholder and must be
# pointed at the real dataset before this cell can run.
file_path = "path_to_your_dataset.csv"  # Replace with your actual dataset path
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)


In [None]:
# Print column dtypes and non-null counts for the frame as loaded.
data.info()

# Clean in one chained expression: discard incomplete rows, then replace the
# 'Year' column with an integer-typed version of itself.
# NOTE(review): `dropna()` with no arguments removes a row when ANY column is
# missing, including columns the later cells never read — confirm that is the
# intent, or restrict it with `subset=[...]`.
data = data.dropna().assign(Year=lambda frame: frame['Year'].astype(int))

# Descriptive statistics of the cleaned frame (displayed as the cell output).
data.describe()


In [None]:
# Trend plot: life expectancy against year, one line per 'Entity'.
# The legend is switched off via legend=False.
fig_trend, ax_trend = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=data,
    x='Year',
    y='Life expectancy',
    hue='Entity',
    legend=False,
    ax=ax_trend,
)
ax_trend.set_title("Life Expectancy Over Time")
ax_trend.set_xlabel("Year")
ax_trend.set_ylabel("Life Expectancy")
plt.show()

# Distribution plot: 20-bin histogram of life expectancy with a KDE overlay.
fig_dist, ax_dist = plt.subplots(figsize=(8, 5))
sns.histplot(
    data['Life expectancy'],
    bins=20,
    kde=True,
    color='orange',
    ax=ax_dist,
)
ax_dist.set_title("Distribution of Life Expectancy")
ax_dist.set_xlabel("Life Expectancy")
ax_dist.set_ylabel("Frequency")
plt.show()


In [None]:
# Feature matrix: a one-column DataFrame holding only 'Year'.
X = data.loc[:, ['Year']]
# Target vector: the life-expectancy column as a Series.
y = data.loc[:, 'Life expectancy']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Ordinary least-squares baseline: fit on the training split, then predict
# the held-out rows. `fit` returns the estimator, so construction and fitting
# are chained.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Random forest with 100 trees and a fixed seed, trained and evaluated on the
# same split as the linear model.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


In [None]:
# Helper that reports regression metrics for one model
def evaluate_model(y_test, y_pred, model_name):
    """Print the mean squared error and R² of a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted values to compare against ``y_test``.
        model_name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are only printed, each formatted to two decimals.
    """
    # Both metrics take (y_true, y_pred); compute MSE first, then R².
    mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))
    report = f"{model_name} - Mean Squared Error: {mse:.2f}, R²: {r2:.2f}"
    print(report)

# Report metrics for each fitted model against the same held-out targets,
# linear regression first, then the random forest.
for _label, _predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest Regressor", y_pred_rf),
):
    evaluate_model(y_test, _predictions, _label)


In [None]:
# Linear regression: held-out actual values as blue points, model predictions
# drawn as a red line over the same test years.
fig_lr, ax_lr = plt.subplots(figsize=(8, 6))
ax_lr.scatter(X_test, y_test, color='blue', label='Actual Values')
ax_lr.plot(X_test, y_pred_lr, color='red', label='Predictions (LR)')
ax_lr.set_title("Linear Regression - Actual vs. Predicted")
ax_lr.set_xlabel("Year")
ax_lr.set_ylabel("Life Expectancy")
ax_lr.legend()
plt.show()

# Random forest: same actual values, with predictions as semi-transparent
# green points so overlapping markers remain visible.
fig_rf, ax_rf = plt.subplots(figsize=(8, 6))
ax_rf.scatter(X_test, y_test, color='blue', label='Actual Values')
ax_rf.scatter(X_test, y_pred_rf, color='green', label='Predictions (RF)', alpha=0.6)
ax_rf.set_title("Random Forest - Actual vs. Predicted")
ax_rf.set_xlabel("Year")
ax_rf.set_ylabel("Life Expectancy")
ax_rf.legend()
plt.show()


In [None]:
# Years to forecast, in a frame with the same 'Year' column the models were
# trained on.
future_years = pd.DataFrame({'Year': [2025, 2030, 2040, 2050]})

# Linear regression forecast, printed next to the requested years.
future_pred_lr = lr_model.predict(future_years)
print("Future Predictions (Linear Regression):")
print(future_years.assign(Life_Expectancy=future_pred_lr))

# Random forest forecast for the same years.
# NOTE(review): tree ensembles generally cannot extrapolate beyond the range of
# the training feature, so these values may plateau if the requested years lie
# past the last year in the data — confirm before relying on them.
future_pred_rf = rf_model.predict(future_years)
print("\nFuture Predictions (Random Forest):")
print(future_years.assign(Life_Expectancy=future_pred_rf))


In [None]:
# Combine the forecast years with both models' predictions into a new frame
# (`assign` leaves `future_years` untouched) and write it without the index.
predictions = future_years.assign(
    Life_Expectancy_LR=future_pred_lr,
    Life_Expectancy_RF=future_pred_rf,
)
predictions.to_csv("future_predictions.csv", index=False)

# Confirm where the output was written.
print("Predictions saved to future_predictions.csv")
