In [1]:
# Import libraries.
import pandas as pd
import numpy as np

import statsmodels.api as sm

from sklearn.linear_model import LinearRegression

In [2]:
# Load the training set from CSV and print its column / dtype overview.
wine = pd.read_csv(filepath_or_buffer='../data/wine.csv')
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         25 non-null     int64  
 1   Price        25 non-null     float64
 2   WinterRain   25 non-null     int64  
 3   AGST         25 non-null     float64
 4   HarvestRain  25 non-null     int64  
 5   Age          25 non-null     int64  
 6   FrancePop    25 non-null     float64
dtypes: float64(3), int64(4)
memory usage: 1.5 KB


In [3]:
# Baseline model: regress Price on all five candidate predictors.
features_init = ['AGST', 'HarvestRain', 'Age', 'WinterRain', 'FrancePop']
X_init = wine[features_init]
y_init = wine['Price']


# Build the OLS design matrix on an explicit copy. A plain
# `ols_X_init = X_init` only binds a second name to the same DataFrame, and
# DataFrame.insert works in place, so the intercept column would silently
# end up in X_init as well.
ols_X_init = X_init.copy()
# sm.OLS does not add an intercept on its own, so prepend a column of ones.
ols_X_init.insert(0, 'Constant', np.ones(ols_X_init.shape[0], dtype=int))

# Fit and print the coefficient summary.
ols_init = sm.OLS(y_init, ols_X_init)
ols_results_init = ols_init.fit()
print(ols_results_init.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     18.47
Date:                Sat, 14 Aug 2021   Prob (F-statistic):           1.04e-06
Time:                        14:33:54   Log-Likelihood:                -2.1043
No. Observations:                  25   AIC:                             16.21
Df Residuals:                      19   BIC:                             23.52
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Constant       -0.4504     10.189     -0.044      

In [4]:
# Reduced model: same target, predictor list without FrancePop.
features_sig = ['AGST', 'HarvestRain', 'Age', 'WinterRain']
y_sig = wine['Price']
X_sig = wine[features_sig]


# Design matrix for statsmodels.
# NOTE: ols_X_sig is bound to the very same DataFrame object as X_sig (no
# copy is made), and DataFrame.insert works in place. After this cell X_sig
# therefore also carries 'Constant' as its first column; code further down
# that reads X_sig sees that extra leading column.
ols_X_sig = X_sig
ols_X_sig.insert(loc=0, column='Constant', value=np.ones(len(ols_X_sig), dtype=int))

# Fit and print the coefficient summary.
ols_sig = sm.OLS(endog=y_sig, exog=ols_X_sig)
ols_results_sig = ols_sig.fit()
print(ols_results_sig.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     24.17
Date:                Sat, 14 Aug 2021   Prob (F-statistic):           2.04e-07
Time:                        14:33:54   Log-Likelihood:                -2.1622
No. Observations:                  25   AIC:                             14.32
Df Residuals:                      20   BIC:                             20.42
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Constant       -3.4300      1.766     -1.942      

In [5]:
# Load the test set from CSV and preview its first rows.
wine_test = pd.read_csv(filepath_or_buffer='../data/wine_test.csv')
wine_test.head()

Unnamed: 0,Year,Price,WinterRain,AGST,HarvestRain,Age,FrancePop
0,1979,6.9541,717,16.1667,122,4,54835.832
1,1980,6.4979,578,16.0,74,3,55110.236


In [6]:
# Fit a linear regression on the training data and predict the test prices.
X_test = wine_test[features_sig]
y_test = wine_test['Price']

# Select the training predictors by name instead of `X_sig.iloc[:, 1:]`.
# The positional slice is only correct when X_sig happens to have exactly one
# extra leading column (the OLS intercept inserted earlier); selecting by
# `features_sig` always yields the same columns, in the same order, as X_test.
# LinearRegression fits its own intercept, so no constant column is needed.
model = LinearRegression().fit(X_sig[features_sig], y_sig)

y_pred = model.predict(X_test)
print(f"Actual: {np.round(np.array(y_test), 3)}\nPredict: {np.round(y_pred, 3)}")

Actual: [6.954 6.498]
Predict: [6.769 6.685]


In [7]:
# Test-set R^2 = 1 - SSE / SST.
# SSE: squared error of the predictions on the test set.
# SST: squared error of a baseline that always predicts the mean training
#      Price (y_sig.mean()), not the mean of the test prices.

# Rounded side-by-side table, kept for display / inspection only.
results = pd.DataFrame({'Actual': np.array(y_test), 'Predict': y_pred}).round(3)

# Compute the metric from the unrounded values so that display rounding does
# not leak into the score; only the final result is rounded.
y_true = np.array(y_test)
sse = ((y_true - y_pred)**2).sum()
sst = ((y_true - y_sig.mean())**2).sum()
print(f"R2 Score: {np.round(1 - sse / sst, 3)}")

R2 Score: 0.795
