In [25]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [26]:
if os.name == 'nt':
    path = os.path.join(os.path.dirname(os.getcwd), 'BT1/BT4/LifeExpectancyData.csv')
elif os.name == 'posix':
    path = os.path.join(os.getcwd(), 'BT1/BT4/LifeExpectancyData.csv')
data = pd.read_csv(path)
numeric_columns = data.select_dtypes(include=np.number).columns
data.fillna(data[numeric_columns].mean(), inplace=True)
data = pd.get_dummies(data, columns=['Status'], drop_first=True)

In [27]:
X = data.drop(['Country', 'Life expectancy '], axis=1)
y = data['Life expectancy ']

#### Chuẩn hóa dữ liệu

In [28]:
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

### Train

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=42)

In [30]:
model = LinearRegression()
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

#### Tính toán các sai số

In [32]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')

MAE: 2.8583331337679483
MSE: 15.242936417631359
R2: 0.8240562394240223


### 2. so sánh sai số trước và sau khi chuẩn hóa dữ liệu
#### Nếu không chuẩn hóa.

In [33]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train2, y_train2)
y_pred2 = model.predict(X_test2)

mae2 = mean_absolute_error(y_test2, y_pred2)
mse2 = mean_squared_error(y_test2, y_pred2)
r22 = r2_score(y_test2, y_pred2)

print(f'MAE: {mae2}')
print(f'MSE: {mse2}')
print(f'R2: {r22}')

# so sánh sai số:
print("so sánh sai số:", end='\n')
print(f'MAE: {mae - mae2}')
print(f'MSE: {mse - mse2}')
print(f'R2: {r2 - r22}')

MAE: 2.8583331337667004
MSE: 15.242936417613265
R2: 0.8240562394242311
so sánh sai số:
MAE: 1.247890679678676e-12
MSE: 1.809397076613095e-11
R2: -2.0883295093199195e-13


### 3. Sử dụng Cross Validation

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=42)

param_grid = {'fit_intercept': [True, False], 'copy_X': [True, False], 'n_jobs': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
model = LinearRegression()
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)

In [36]:
# best params
print(grid.best_params_)

# best model
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None}
MAE: 2.8583331337679483
MSE: 15.242936417631359
R2: 0.8240562394240223


### 4. Dự đoán đến năm 2030