<a href="https://colab.research.google.com/github/bnnguyen/DESLab_ML_training_2024/blob/main/Deslab_2024_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression và Bootstrapping

In [None]:
#data processing imports
import numpy as np
import pandas as pd

#data visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

#SciKit Learn library for models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# accuracy measuring imports
import statistics
from statistics import mean
import sklearn.metrics as metrics

%matplotlib inline

In [None]:
# Load the Boston housing data straight from its hosted CSV
# (this cell needs network access when it runs).
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell output.
data.head(n=5)

In [None]:
# Summary statistics: list the column names, then show per-column
# descriptive statistics rounded to two decimals.

# Keep the column names around; later cells derive the feature list from them.
allVals = data.columns.tolist()
print(allVals)

data.describe().round(2)

In [None]:
# Dimensions of the loaded frame, shown as a (rows, columns) tuple.
data.shape

In [None]:
# Plot the distribution of the target (medv) to spot skew or outliers
# before fitting the model.
#
# sns.distplot is deprecated, so use histplot with a KDE overlay instead;
# stat='density' and cut=3 reproduce distplot's normalised histogram and
# KDE extent. tight_layout only has an effect once something is drawn,
# so it is called after plotting.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data['medv'], kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
fig.tight_layout()

In [None]:
# Scatter plot of medv against lstat.
#
# Create the figure/axes explicitly and hand the axes to pandas: calling
# plt.figure() and then data.plot() without `ax` makes pandas open a second
# figure, which leaves an empty one behind and ignores the requested figsize.
fig, ax = plt.subplots(figsize=(15, 10))
data.plot(x='lstat', y='medv', style='o', ax=ax)
ax.set_xlabel('lstat')
ax.set_ylabel('medv')
ax.set_title('medv vs lstat')
plt.show()

### Xây dựng mô hình và đánh giá độ hiệu quả

In [None]:
# Build the feature matrix (every column except the target) and the target vector.
#
# Filter the column list in its original order instead of using a set
# difference: set iteration order is arbitrary and can change between kernel
# sessions, which would shuffle the feature columns (and the coefficient
# table built from `vals`) from run to run.
vals = [col for col in allVals if col != 'medv']
Xvals = data[vals].values
Yvals = data['medv'].values

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    Xvals,
    Yvals,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
regressor = LinearRegression().fit(X_train, y_train)

# R^2 on the data the model was fitted to, and on the held-out data,
# to see how well the fit carries over to unseen rows.
r_train_score = regressor.score(X_train, y_train)
r_test_score = regressor.score(X_test, y_test)

# Report both scores rounded to two decimals.
print("LinearReg R^2 Training Score: {}".format(np.round(r_train_score, decimals=2)))
print("LinearReg R^2 Test Score: {}".format(np.round(r_test_score, decimals=2)))

In [None]:
# Show the fitted intercept and a table pairing each feature name with its
# coefficient (both rounded to five decimals).
print("Intercept: {}".format(np.round(regressor.intercept_, 5)))

rounded_coefs = np.round(regressor.coef_, 5)
modelCoeff = [(name, coef) for name, coef in zip(vals, rounded_coefs)]

linearModelvalues = pd.DataFrame(modelCoeff, columns=['Variable', 'Coefficient'])
linearModelvalues

In [None]:
# Predict the held-out targets and line them up against the true values.
y_pred = regressor.predict(X_test)
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Only the first 25 rows are charted so the bars stay readable.
df1 = results.head(25)

# Grouped bar chart of actual vs. predicted values; the grid calls target
# the axes returned by the pandas plot call.
ax = df1.plot.bar(figsize=(10, 8))
ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# Error metrics comparing the held-out targets with the model's predictions.
# The MAE and RMSE lines previously printed only their labels; RMSE is the
# square root of the MSE.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

### Triển khai hồi quy Bootstrap

In [None]:
bootstrap_iter = 1000    # number of bootstrap resamples to draw

# Seeded generator shared by all resample() calls so the bootstrap (and every
# number derived from it) is reproducible across runs.
bootstrap_rng = np.random.RandomState(10)

# Per-iteration test-set error metrics.
MSEtotal = []
RMSEtotal = []
MAEtotal = []

# Per-iteration fitted parameters.
coefs = []
intercept = []

for i in range(bootstrap_iter):
    # Draw a bootstrap sample (with replacement) from the training split and
    # fit a fresh model to it. A separate estimator is used so the model
    # fitted in the earlier cell (`regressor`) is not overwritten.
    X_, y_ = resample(X_train, y_train, random_state=bootstrap_rng)
    boot_regressor = LinearRegression().fit(X_, y_)

    # Score THIS iteration's model on the held-out data. Previously the
    # metrics reused the stale `y_pred` from the original fit, so every
    # iteration recorded identical values.
    y_pred_boot = boot_regressor.predict(X_test)
    mse_boot = metrics.mean_squared_error(y_test, y_pred_boot)

    MSEtotal.append(mse_boot)
    RMSEtotal.append(np.sqrt(mse_boot))
    MAEtotal.append(metrics.mean_absolute_error(y_test, y_pred_boot))

    coefs.append(boot_regressor.coef_)
    intercept.append(boot_regressor.intercept_)

In [None]:
# Average the bootstrap results: one mean per coefficient, plus the mean intercept.
#
# `coefs` holds one coefficient vector per bootstrap iteration, so the mean
# over axis 0 yields the per-feature average directly. This avoids the manual
# nested summation and no longer relies on `regressor.coef_` for the length.
assert len(coefs) == len(intercept) > 0, "run the bootstrap cell before averaging"

average = list(np.mean(coefs, axis=0))
avg_ints = np.mean(intercept)

In [None]:
# Present the bootstrap outcome: the averaged coefficients per variable, the
# averaged intercept, and the mean of each error metric over all iterations.
avgCoeffs = pd.DataFrame({
    'Variables': vals,
    'Average Coefficients': np.round(average, 5),
})
avgInt = pd.DataFrame({
    'Average Intercept': ['avg_int'],
    'Value': np.round([avg_ints], 5),
})

# display() renders each frame in turn, in the order given.
display(avgCoeffs, avgInt)

for metric_label, metric_values in (
    ('Mean Absolute Error:', MAEtotal),
    ('Mean Squared Error:', MSEtotal),
    ('Root Mean Squared Error:', RMSEtotal),
):
    print(metric_label, statistics.mean(metric_values))