In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Train a linear-regression model on the life-expectancy dataset and persist
# the model together with the fitted feature imputer and feature scaler.

# Load data
data = pd.read_csv('../data/life_expectancy_data.csv')

# Drop the columns that are not used as model inputs
data_numeric = data.drop(columns=['Country', 'Year', 'Status'])

# Separate features and target variable
X = data_numeric.drop(columns=['Life expectancy'])
y = data_numeric['Life expectancy']

# Handle missing feature values with per-column means
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Impute the target with its OWN imputer. Re-using `imputer` here would refit
# it on a single column and overwrite the feature statistics, so the object
# saved below could no longer transform a feature matrix at inference time.
# The result keeps the (n_samples, 1) shape produced by the reshape.
target_imputer = SimpleImputer(strategy='mean')
y_imputed = target_imputer.fit_transform(y.values.reshape(-1, 1))

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# NOTE(review): the imputer and scaler above are fitted on the full dataset
# before the split, so statistics from the test rows influence the training
# features. Fit them on X_train only if X_test is used for evaluation.

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_imputed, test_size=0.2, random_state=42
)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Save model, feature scaler, and feature imputer
joblib.dump(model, '../models/trained_model.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(imputer, '../models/imputer.pkl')

print("Model training complete. Model, scaler, and imputer saved.")


Model training complete. Model, scaler, and imputer saved.
