# Import

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import *
# Import additional libraries here if needed

# Đọc dữ liệu

In [2]:
# Read the training and test sets with pandas (paths are relative to the notebook's working directory)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Split each set into the feature matrix X (every column but the last) and the target y (the last column)
X_train = train.iloc[:, :-1]    # DataFrame (holds the 10 training features)
Y_train = train.iloc[:, -1]     # Series    (holds the single training target value)

X_test = test.iloc[:, :-1]      # DataFrame (holds the 10 test features)
Y_test = test.iloc[:, -1]       # Series    (holds the single test target value)
# Students may use a different way of splitting the data if needed

# Cài đặt hàm

In [3]:
# Cài đặt các hàm cần thiết ở đây
class OLSLinearRegression:
    """Ordinary least-squares linear regression without an intercept term.

    The weights are the minimum-norm least-squares solution ``w = pinv(X) @ y``.
    Using the Moore-Penrose pseudo-inverse instead of explicitly inverting
    ``X.T @ X`` gives the same weights when X has full column rank, and still
    produces a solution when features are collinear (where the explicit
    inverse raises ``LinAlgError``).
    """

    def fit(self, X, y):
        """Estimate the weights from the design matrix X and the targets y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            NumPy array or pandas DataFrame of numeric features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Target values.

        Returns
        -------
        OLSLinearRegression
            ``self``, so calls can be chained.
        """
        # Work on plain float arrays so the result does not depend on pandas
        # index alignment or on how pandas dispatches the `@` operator.
        design = np.asarray(X, dtype=float)
        target = np.asarray(y, dtype=float)
        self.w = np.linalg.pinv(design) @ target
        return self

    def get_params(self):
        """Return the fitted weight vector (only available after ``fit``)."""
        return self.w

    def predict(self, X):
        """Return the predictions ``X @ w`` for every row of X.

        A DataFrame input yields a Series carrying the DataFrame's index;
        a NumPy input yields a 1-D NumPy array.
        """
        return X @ self.w.ravel()

def RMSE(test,pred):
    """Return the root-mean-square error between true and predicted values.

    Implemented with NumPy only, so it does not depend on ``sklearn.metrics``
    having been loaded as a side effect of a wildcard import.

    Parameters
    ----------
    test : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth target values.
    pred : array-like of the same shape as ``test``
        Predicted values. A 1-D vector and an (n_samples, 1) column are
        treated as the same shape.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((test - pred) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    actual = np.asarray(test, dtype=float)
    predicted = np.asarray(pred, dtype=float)
    # Promote vectors to single-column matrices so that (n,) and (n, 1) inputs
    # compare element-wise instead of broadcasting to an (n, n) matrix.
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)
    if actual.shape != predicted.shape:
        raise ValueError(
            'RMSE needs inputs of the same shape, got {} and {}'.format(actual.shape, predicted.shape)
        )
    if actual.size == 0:
        raise ValueError('RMSE needs at least one sample')
    return np.sqrt(np.mean((actual - predicted) ** 2))

def CrossValidation_5Fold(X_train_clone, Y_train_clone, n_splits=5, random_state=0):
    """Return the mean validation RMSE of an OLS model over shuffled K folds.

    Parameters
    ----------
    X_train_clone : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix; rows are selected with integer index arrays.
    Y_train_clone : numpy.ndarray of shape (n_samples,)
        Target values aligned with the rows of ``X_train_clone``.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 0
        Seed for the shuffle. A fixed seed makes every call with the same
        number of samples use the same folds, so all candidate
        features/models are scored on one shared shuffle. Pass ``None`` for
        a fresh shuffle on each call.

    Returns
    -------
    float
        Average of the per-fold RMSE values.
    """
    splitter = sklearn.model_selection.KFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    fold_scores = []
    for train_index, test_index in splitter.split(X_train_clone):
        # Fit a fresh model per fold instead of refitting a notebook-level
        # global, so scoring never clobbers a model trained elsewhere.
        fold_model = OLSLinearRegression().fit(
            X_train_clone[train_index], Y_train_clone[train_index]
        )
        fold_predictions = fold_model.predict(X_train_clone[test_index])
        fold_scores.append(RMSE(Y_train_clone[test_index], fold_predictions))
    return sum(fold_scores) / len(fold_scores)

def get_feature_index(paradigm,name):
    """Return the positional index of the first column of ``paradigm`` named ``name``.

    Parameters
    ----------
    paradigm : pandas.DataFrame
        Frame whose columns are searched.
    name : hashable
        Column label to look up.

    Returns
    -------
    int
        Zero-based position of the first matching column.

    Raises
    ------
    KeyError
        If no column has that name. Returning ``None`` here would be
        dangerous: ``array[:, None]`` is valid NumPy indexing and would
        silently add an axis instead of failing.
    """
    for position, column in enumerate(paradigm.columns.values):
        if name == column:
            return position
    raise KeyError('column {!r} not found'.format(name))

def new_paradigm_with_avg_2_best_feature(best_feature, second_feature, best_feature_name, second_feature_name):
    """Build a Series holding the element-wise mean of two feature columns.

    The Series is named ``'Avg <best_feature_name> and <second_feature_name>'``.
    """
    column_label = ' '.join(['Avg', best_feature_name, 'and', second_feature_name])
    averaged = (best_feature + second_feature) / 2
    return pd.Series(averaged, name=column_label)

def new_paradigm_with_sum_of_sqrt_2_best_feature(best_feature, second_feature, best_feature_name, second_feature_name):
    """Build a Series holding ``sqrt(best_feature) + sqrt(second_feature)``.

    The Series is named ``'Sum of sqrt <best_feature_name> and <second_feature_name>'``.
    """
    column_label = ' '.join(['Sum of sqrt', best_feature_name, 'and', second_feature_name])
    best_root = np.sqrt(best_feature)
    second_root = np.sqrt(second_feature)
    return pd.Series(best_root + second_root, name=column_label)

def new_paradigm_with_sum_of_root4_2_best_feature(best_feature, second_feature, best_feature_name, second_feature_name):
    """Build a Series holding the sum of the fourth roots of two feature columns.

    The Series is named ``'Sum of root4 <best_feature_name> and <second_feature_name>'``.
    """
    column_label = ' '.join(['Sum of root4', best_feature_name, 'and', second_feature_name])
    best_root = best_feature ** 0.25
    second_root = second_feature ** 0.25
    return pd.Series(best_root + second_root, name=column_label)

def new_paradigm_with_thinness_age_5_19(train, test):
    """Append a 'Thinness age 5-19' column to copies of ``train`` and ``test``.

    The new column is the mean of the 'Thinness age 10-19' and
    'Thinness age 5-9' columns of the frame it is added to. Values are read
    from the frames passed in (not from notebook-level globals) and keep each
    frame's own index, so rows stay aligned for any index.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the two thinness columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_with_column, test_with_column)``; the inputs are not modified.

    Raises
    ------
    KeyError
        If either thinness column is missing from a frame.
    """
    def _with_combined_thinness(frame):
        combined = (frame['Thinness age 10-19'] + frame['Thinness age 5-9']) / 2
        # assign() returns a new frame and appends the column at the end.
        return frame.assign(**{'Thinness age 5-19': combined})

    return _with_combined_thinness(train), _with_combined_thinness(test)

def Model_Thinnes_Age_5_19(X_train, X_test):
    """Replace the two thinness columns with their combined 'Thinness age 5-19' column.

    Delegates to ``new_paradigm_with_thinness_age_5_19`` to add the combined
    column, then removes 'Thinness age 5-9' and 'Thinness age 10-19' from both
    resulting frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_X_train, new_X_test)``.
    """
    merged_train, merged_test = new_paradigm_with_thinness_age_5_19(X_train, X_test)
    replaced_columns = ['Thinness age 5-9', 'Thinness age 10-19']
    new_X_train = merged_train.drop(columns=replaced_columns)
    new_X_test = merged_test.drop(columns=replaced_columns)
    return new_X_train, new_X_test

def Model_Avg_2_best_feature(X_train, X_test):
    """Replace the two best single features with their element-wise average.

    The two best features are the two entries with the lowest RMSE in the
    notebook-level ``LifeExpectancyRMSE`` list of ``[feature name, RMSE]``
    rows, which must exist before this function is called.

    The combined column is built from the frames' own columns, so it carries
    each frame's index and joins back row-for-row for any index.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that contain both best features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_X_train, new_X_test)`` with the two source columns removed and
        the averaged column appended last; the inputs are not modified.
    """
    ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])  # rank once, best first
    best_feature_name, second_feature_name = ranking[0][0], ranking[1][0]

    def _swap_in_average(frame):
        combined = new_paradigm_with_avg_2_best_feature(
            frame[best_feature_name], frame[second_feature_name],
            best_feature_name, second_feature_name,
        )
        return frame.drop(columns=[best_feature_name, second_feature_name]).join(combined)

    return _swap_in_average(X_train), _swap_in_average(X_test)

def Model_Sum_Of_Sqrt_2_best_feature(X_train, X_test):
    """Replace the two best single features with the sum of their square roots.

    The two best features are the two entries with the lowest RMSE in the
    notebook-level ``LifeExpectancyRMSE`` list of ``[feature name, RMSE]``
    rows, which must exist before this function is called.

    The combined column is built from the frames' own columns, so it carries
    each frame's index and joins back row-for-row for any index.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that contain both best features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_X_train, new_X_test)`` with the two source columns removed and
        the combined column appended last; the inputs are not modified.
    """
    ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])  # rank once, best first
    best_feature_name, second_feature_name = ranking[0][0], ranking[1][0]

    def _swap_in_sqrt_sum(frame):
        combined = new_paradigm_with_sum_of_sqrt_2_best_feature(
            frame[best_feature_name], frame[second_feature_name],
            best_feature_name, second_feature_name,
        )
        return frame.drop(columns=[best_feature_name, second_feature_name]).join(combined)

    return _swap_in_sqrt_sum(X_train), _swap_in_sqrt_sum(X_test)

def Model_Sum_Of_Root4_2_best_feature(X_train, X_test):
    """Replace the two best single features with the sum of their fourth roots.

    The two best features are the two entries with the lowest RMSE in the
    notebook-level ``LifeExpectancyRMSE`` list of ``[feature name, RMSE]``
    rows, which must exist before this function is called.

    The combined column is built from the frames' own columns, so it carries
    each frame's index and joins back row-for-row for any index.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that contain both best features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_X_train, new_X_test)`` with the two source columns removed and
        the combined column appended last; the inputs are not modified.
    """
    ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])  # rank once, best first
    best_feature_name, second_feature_name = ranking[0][0], ranking[1][0]

    def _swap_in_root4_sum(frame):
        combined = new_paradigm_with_sum_of_root4_2_best_feature(
            frame[best_feature_name], frame[second_feature_name],
            best_feature_name, second_feature_name,
        )
        return frame.drop(columns=[best_feature_name, second_feature_name]).join(combined)

    return _swap_in_root4_sum(X_train), _swap_in_root4_sum(X_test)

def Model_Sqrt_2_best_feature(X_train, X_test):
    """Build single-column frames holding sqrt(best) + sqrt(second best feature).

    The two best features are the two lowest-RMSE entries of the notebook-level
    ``LifeExpectancyRMSE`` list. Unlike the other model builders, the returned
    frames contain only the combined column (labelled ``0``, with a default
    integer index); none of the original columns are kept.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_fea, test_fea)``.
    """
    ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])
    top_two_names = (ranking[0][0], ranking[1][0])

    def _sqrt_sum_frame(frame):
        values = frame.copy().to_numpy()
        first, second = (values[:, get_feature_index(frame, column)] for column in top_two_names)
        return pd.DataFrame(np.sqrt(first) + np.sqrt(second))

    train_fea = _sqrt_sum_frame(X_train)
    test_fea = _sqrt_sum_frame(X_test)
    return train_fea, test_fea

def Model_Root4_2_best_feature(X_train, X_test):
    """Return copies of the frames with the two best features replaced by their fourth roots.

    The two best features are the two entries with the lowest RMSE in the
    notebook-level ``LifeExpectancyRMSE`` list of ``[feature name, RMSE]``
    rows, which must exist before this function is called.

    The transformation is applied to new frames rather than written back into
    the arguments, so calling the function again on the same inputs gives the
    same result instead of taking the root a second time.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that contain both best features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_X_train, new_X_test)`` with the same columns in the same order.
    """
    ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])  # rank once, best first
    best_feature_name, second_feature_name = ranking[0][0], ranking[1][0]

    def _with_fourth_roots(frame):
        # assign() overwrites existing columns in place (position preserved)
        # on a new frame, leaving the caller's frame untouched.
        return frame.assign(**{
            column: frame[column] ** (1/4)
            for column in (best_feature_name, second_feature_name)
        })

    return _with_fourth_roots(X_train), _with_fourth_roots(X_test)

def create_new_data(X_train, X_test):
    """Build every candidate feature set for the train and test frames.

    Each model builder receives fresh copies of ``X_train`` and ``X_test``.

    Returns
    -------
    tuple of list
        ``(new_train_data, new_test_data)``: two parallel lists of
        ``[model label, DataFrame]`` pairs, in the same order.
    """
    model_builders = [
        ('Thinnes age 5-19', Model_Thinnes_Age_5_19),
        ('Avg 2 best feature', Model_Avg_2_best_feature),
        ('Sum of sqrt 2 best feature', Model_Sum_Of_Sqrt_2_best_feature),
        ('Root4 2 best feature', Model_Root4_2_best_feature),
        ('Sum of Root4 2 best feature', Model_Sum_Of_Root4_2_best_feature),
        ('Sqrt 2 best feature', Model_Sqrt_2_best_feature),
    ]
    new_train_data, new_test_data = [], []
    for label, build in model_builders:
        train_variant, test_variant = build(X_train.copy(), X_test.copy())
        new_train_data.append([label, train_variant])
        new_test_data.append([label, test_variant])
    return new_train_data, new_test_data

# Yêu cầu 1a: Sử dụng toàn bộ 10 đặc trưng đề bài cung cấp (2 điểm) 

In [4]:
# Code for requirement 1a: fit OLS on all 10 features and print one weight per feature
lr = OLSLinearRegression()
lr.fit(X_train, Y_train)
for i, weight in enumerate(lr.get_params()):
    print('w' + str(i), ' = ', weight)

w0  =  0.01510136273529735
w1  =  0.09021998065778095
w2  =  0.04292181752549854
w3  =  0.13928911689488005
w4  =  -0.5673328270882102
w5  =  -0.00010076511487489814
w6  =  0.7407134377587482
w7  =  0.19093579767394017
w8  =  24.50597359115195
w9  =  2.393516607832661


In [5]:
# Evaluate the all-feature model on the test set with RMSE
test_predictions_a = lr.predict(X_test)
rmse_a = RMSE(Y_test, test_predictions_a)
print('RMSE : ', rmse_a)

RMSE :  7.064046430584209


Công thức hồi quy

$$\text{Life expectancy} = $$
$w_0(\text{Adult Mortality}) + w_1(\text{BMI}) + w_2(\text{Polio}) + w_3(\text{Diphtheria}) + w_4(\text{HIV/AIDS}) + w_5(\text{GDP}) + w_6(\text{Thinness age 10-19}) + w_7(\text{Thinness age 5-9}) + w_8(\text{Income composition of resources}) + w_9(\text{Schooling})$

# Yêu cầu 1b: Xây dựng mô hình sử dụng duy nhất 1 đặc trưng, tìm mô hình cho kết quả tốt nhất (2 điểm)

Lưu ý: khi sử dụng cross-validation, sinh viên cần xáo trộn dữ liệu 1 lần duy nhất và thực hiện trên toàn bộ đặc trưng

In [6]:
# Code for requirement 1b
# Score every single-feature model with 5-fold cross-validation, find the best
# feature, and print the cross-validation results as required
feature_matrix = X_train.copy().to_numpy()
target_vector = Y_train.copy().to_numpy()
LifeExpectancyRMSE = [
    [feature_name, CrossValidation_5Fold(feature_matrix[:, column:column + 1], target_vector)]
    for column, feature_name in enumerate(X_train.columns.values)
]
feature_ranking = sorted(LifeExpectancyRMSE, key=lambda row: row[1])
best_feature_name = feature_ranking[0][0]
best_feature_index = get_feature_index(X_train, best_feature_name)
print('Đặc trưng tốt nhất : ', best_feature_name)
print('Vị trí đặc trưng tốt nhất : ', best_feature_index)
print(pd.DataFrame(LifeExpectancyRMSE, columns= ['Đặc trưng', 'RMSE']))

Đặc trưng tốt nhất :  Schooling
Vị trí đặc trưng tốt nhất :  9
                         Đặc trưng       RMSE
0                  Adult Mortality  46.206149
1                              BMI  27.958758
2                            Polio  17.935279
3                       Diphtheria  16.019921
4                         HIV/AIDS  67.058772
5                              GDP  60.188389
6               Thinness age 10-19  51.866769
7                 Thinness age 5-9  51.754931
8  Income composition of resources  13.210230
9                        Schooling  11.765080


In [7]:
# Retrain the model on the whole training set using only the best feature
best_feature_columns = slice(best_feature_index, best_feature_index + 1)
X_train_best_feature = X_train.to_numpy(copy=True)[:, best_feature_columns]
lr.fit(X_train_best_feature, Y_train.to_numpy(copy=True))
print('w = ')
print(lr.get_params())

w = 
[5.5573994]


In [8]:
# Evaluate the best single-feature model on the test set with RMSE
X_test_best_feature = X_test.to_numpy(copy=True)[:, best_feature_index:best_feature_index + 1]
rmse_b = RMSE(Y_test, lr.predict(X_test_best_feature))
print('RMSE : ', rmse_b)

RMSE :  10.26095039165537


Công thức hồi quy

$$\text{Life expectancy} = w \times \text{Schooling}$$

# Yêu cầu 1c: Sinh viên tự xây dựng mô hình, tìm mô hình cho kết quả tốt nhất (3 điểm)

Lưu ý: khi sử dụng cross-validation, sinh viên cần xáo trộn dữ liệu 1 lần duy nhất và thực hiện trên toàn bộ $m$ mô hình mà sinh viên thiết kế

In [9]:
def CrossFold(X_train_clone, Y_train_clone):
    """Return the mean 5-fold cross-validation RMSE of an OLS model.

    Kept as an alias of ``CrossValidation_5Fold`` so the two functions cannot
    drift apart; see that function for the parameters and return value.
    """
    return CrossValidation_5Fold(X_train_clone, Y_train_clone)

In [10]:
# Code for requirement 1c
# Score every student-designed model with 5-fold cross-validation, find the
# best one, and print the cross-validation results as required
new_train_data, new_test_data = create_new_data(X_train.copy(), X_test.copy())
# Register the unmodified 10-feature baseline in BOTH lists. The later cells
# look up the test frame by the winning model's position in new_train_data, so
# the two lists must stay index-aligned even when the baseline wins.
new_train_data.append(['Default model',X_train])
new_test_data.append(['Default model',X_test])
NewLifeExpectancyRMSE = []
for model_label, model_frame in new_train_data:
    NewLifeExpectancyRMSE.append([model_label, CrossValidation_5Fold(model_frame.to_numpy(), Y_train.copy().to_numpy())])
name = sorted(NewLifeExpectancyRMSE, key=lambda row:(row[1]))[0][0]
print(pd.DataFrame(NewLifeExpectancyRMSE, columns= ['Mô hình', 'RMSE']))
print('Mô hình tốt nhất : ', name)

                       Mô hình      RMSE
0             Thinnes age 5-19  7.914910
1           Avg 2 best feature  8.105618
2   Sum of sqrt 2 best feature  5.213554
3         Root4 2 best feature  3.985899
4  Sum of Root4 2 best feature  4.694251
5          Sqrt 2 best feature  6.180251
6                Default model  7.898711
Mô hình tốt nhất :  Root4 2 best feature


In [11]:
# Retrain the best student-designed model on the whole training set
# Position of the winning model in new_train_data (falls back to 0 when no label matches)
index = next(
    (position for position, entry in enumerate(new_train_data) if entry[0] == name),
    0,
)
X_train_my_best_model = new_train_data[index][1].to_numpy()
lr.fit(X_train_my_best_model, Y_train.to_numpy(copy=True))
for i, weight in enumerate(lr.get_params()):
    print('w' + str(i), ' = ', weight)

w0  =  -0.015185742427712211
w1  =  0.044602692927739336
w2  =  -7.399559271062484e-05
w3  =  0.027189499305987894
w4  =  -0.5640320061653871
w5  =  7.296053152205728e-05
w6  =  0.006811567455989593
w7  =  -0.052223045830113834
w8  =  3.8381257883199464
w9  =  35.210322828085786


In [12]:
# Evaluate the best student-designed model on the test set with RMSE
X_test_my_best_model = new_test_data[index][1].to_numpy()
rmse_c = RMSE(Y_test, lr.predict(X_test_my_best_model))
print('RMSE : ', rmse_c)

RMSE :  3.870607190322188


Công thức hồi quy

$$\text{Life expectancy} = $$
$w_0(\text{Adult Mortality}) + w_1(\text{BMI}) + w_2(\text{Polio}) + w_3(\text{Diphtheria}) + w_4(\text{HIV/AIDS}) + w_5(\text{GDP}) + w_6(\text{Thinness age 10-19}) + w_7(\text{Thinness age 5-9}) + w_8\sqrt[4]{\text{Income composition of resources}} + w_9\sqrt[4]{\text{Schooling}}$