# Đồ án 3: Linear Regression

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import linear_model

In [57]:
def LinearRegression(A, b):
    """Fit an ordinary least-squares model of ``b`` on ``A``.

    Parameters
    ----------
    A : design matrix handed to scikit-learn's ``fit`` unchanged.
    b : target values handed to scikit-learn's ``fit`` unchanged.

    Returns
    -------
    tuple
        ``(coef_, intercept_)`` of the fitted estimator.
    """
    model = linear_model.LinearRegression()
    fitted = model.fit(A, b)
    weights = fitted.coef_
    offset = fitted.intercept_
    return weights, offset

In [58]:
def CrossValidation(A, b, kfold):
    """Return the k-fold cross-validated mean absolute error of a linear fit.

    The rows are split with ``KFold(n_splits=kfold)``. For every fold a model
    is fitted on the training rows with ``LinearRegression`` and the mean
    absolute error on the held-out rows is recorded; the function returns the
    mean of those per-fold errors.

    Parameters
    ----------
    A : array-like, indexed by row along the first axis
        Predictor matrix. Converted with ``np.asarray`` so lists and pandas
        objects are indexed by row position.
    b : array-like
        Target values, one per row of ``A``. Converted with ``np.asarray``.
    kfold : int
        Number of folds passed to ``KFold`` as ``n_splits``.

    Returns
    -------
    float
        Mean over folds of the held-out mean absolute error.
    """
    # KFold yields *positional* row indices. Indexing a DataFrame/Series/list
    # with them directly would select columns, labels, or fail outright, so
    # normalise both inputs to plain arrays first.
    A = np.asarray(A)
    b = np.asarray(b)

    kf = KFold(n_splits=kfold)
    error_list = []

    for train_index, test_index in kf.split(A):
        x, bias = LinearRegression(A[train_index], b[train_index])
        predicted = np.dot(A[test_index], x) + bias
        error_list.append(np.mean(np.absolute(b[test_index] - predicted)))

    return np.mean(error_list)

In [59]:
def chooseBestProperty(A, b, kfold=10):
    """Cross-validate each column of ``A`` on its own and find the best one.

    Parameters
    ----------
    A : array-like, two-dimensional
        Predictor matrix; one candidate predictor per column.
    b : array-like
        Target values, forwarded to ``CrossValidation`` unchanged.
    kfold : int, optional
        Number of folds forwarded to ``CrossValidation``. Defaults to 10,
        the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(result, best)`` where ``result`` is a list holding the
        cross-validation error of each single-column model, in column order,
        and ``best`` is ``np.argmin(result)``.

    Raises
    ------
    ValueError
        If ``A`` has no columns.
    """
    A = np.asarray(A)
    n_columns = A.shape[1]
    if n_columns == 0:
        raise ValueError("A must contain at least one column")

    # Slice with i:i + 1 (not plain i) so every candidate stays two-dimensional.
    result = [CrossValidation(A[:, i:i + 1], b, kfold) for i in range(n_columns)]

    return result, np.argmin(result)

In [60]:
def buildModel(A, b, error_list, label_len, kfold=10):
    """Pick a subset of columns by growing it in order of single-column error.

    Columns are ranked by ascending ``error_list``. For every size ``i`` in
    ``range(2, label_len - 1)`` the ``i`` best-ranked columns are
    cross-validated, and the subset whose error is *closest to* the error of
    the model using every column of ``A`` is reported as the best one.

    Parameters
    ----------
    A : array-like, two-dimensional
        Predictor matrix.
    b : array-like
        Target values, forwarded to ``CrossValidation`` unchanged.
    error_list : sequence of float
        One error per column of ``A``; lower means the column is tried earlier.
    label_len : int
        Upper bound control for the subset sizes: sizes run over
        ``range(2, label_len - 1)``.
    kfold : int, optional
        Number of folds forwarded to ``CrossValidation``. Defaults to 10,
        the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(cv_error_list, prop_list, best)``: the cross-validation error of
        each candidate subset, the column indices of each candidate subset,
        and the position (into both lists) of the selected subset.

    Raises
    ------
    ValueError
        If ``range(2, label_len - 1)`` is empty, i.e. ``label_len <= 3``.
    """
    A = np.asarray(A)
    ranking = np.argsort(error_list)

    # Fail before doing any cross-validation work; previously this case only
    # surfaced as an opaque ValueError from np.argmin on an empty list.
    subset_sizes = range(2, label_len - 1)
    if len(subset_sizes) == 0:
        raise ValueError(
            "label_len must be greater than 3 to produce a candidate subset"
        )

    full_error = CrossValidation(A, b, kfold)

    cv_error_list = []
    gap_to_full = []  # kept separate from the ``error_list`` argument
    prop_list = []

    for size in subset_sizes:
        prop_to_pick = ranking[:size]
        error = CrossValidation(A[:, prop_to_pick], b, kfold)
        cv_error_list.append(error)
        # NOTE(review): selection is by distance to the full-model error, not
        # by lowest error, so a subset that beats the full model by a wide
        # margin can lose to one that is slightly worse. Confirm this is the
        # intended criterion.
        gap_to_full.append(np.absolute(full_error - error))
        prop_list.append(prop_to_pick)

    return cv_error_list, prop_list, np.argmin(gap_to_full)

In [61]:
# Load the dataset; the file is delimited with ';' rather than ','.
df = pd.read_csv('wine.csv', sep=';')

In [62]:
# Column names, used later to turn column positions back into names.
label = df.columns
# Every column except the last one is used as a predictor.
properties = df.iloc[:, : -1].to_numpy()
# The last column is the regression target.
rate = df.iloc[:, -1].to_numpy()

In [63]:
# Fit on all rows with every predictor; the cell displays (coefficients, intercept).
LinearRegression(properties, rate)

(array([ 4.75247531e-02, -1.06874258e+00, -2.68710829e-01,  3.49742662e-02,
        -1.59729560e+00,  3.48788138e-03, -3.79835506e-03, -3.94690810e+01,
        -2.45575908e-01,  7.73840794e-01,  2.69377496e-01]),
 42.91716245147436)

In [64]:
# 10-fold cross-validated mean absolute error of the all-predictor model.
CrossValidation(properties, rate, 10)

0.5094507964775307

In [65]:
# Cross-validate each predictor on its own and report the one with the lowest error.
error_list, best_index = chooseBestProperty(properties, rate)
print(f'Best property is {label[best_index]}')
# Refit on that single column (kept 2-D by slicing) using all rows.
x, b0 = LinearRegression(properties[:, best_index:best_index + 1], rate)
print(x)
print(b0)

Best property is alcohol
[0.37471047]
1.7740758844499194


In [66]:
# Search predictor subsets ranked by the single-predictor errors computed earlier.
# len(label) counts every column of df, including the target column.
cv_error_list, prop_list, best_index_prop = buildModel(properties, rate, error_list, len(label))
print(f'Best properties are {label[prop_list[best_index_prop]]}')
# Refit on the selected columns using all rows, then show the subset's cross-validation error.
x, b0 = LinearRegression(properties[:, prop_list[best_index_prop]], rate)
print(x)
print(b0)
print(cv_error_list[best_index_prop])

Best properties are Index(['alcohol', 'volatile acidity', 'total sulfur dioxide', 'citric acid',
       'sulphates', 'density', 'fixed acidity', 'chlorides',
       'free sulfur dioxide'],
      dtype='object')
[ 2.79228621e-01 -1.08519171e+00 -3.27609434e-03 -2.50067753e-01
  7.55341209e-01 -3.10768383e+01  5.89618605e-02 -1.44151674e+00
  2.85670572e-03]
33.61492533087279
0.5104161230846668
