# Hands On - Predicting The Quality Of Wine - Introduction to Regression Models

# Import Data & Check Structure



## Import

In [None]:
import pandas as pd
wine = pd.read_csv("https://raw.githubusercontent.com/casbdai/datasets/main/wine_regression.csv")

## Checking Structure

In [None]:
wine.____

In [None]:
wine.____

Vintage and age carry the same information. Age is only a mathematical transformation of the vintage year. 

Taking 2021 as "reference year": 2021 - 1952 = 69

Thus, we delete the variable "vintage": 


In [None]:
wine = wine.____
wine.head()

## Separate Features and Targets

In [None]:
X = wine.____
y = wine____

# Workflow for Simple Linear Regression

### 1) Import Model Function

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### 2) Instantiate Model

In [None]:
linreg = LinearRegression()

### 3) Create Test & Training Data

We want to maintain the temporal order of the data set, so we set the additional argument "shuffle" to False. 

Check out what happens if we set it to True!


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, shuffle=False)

In [None]:
X_train.tail()

In [None]:
X_test.head()

#### Selecting the "Harvest Temperature" Variable and Redoing the Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.loc[:,["harvest.temp"]], y, test_size=0.12, shuffle=False)
print(X_train.tail())
print(X_test.head())

### 4) Fit Model to Data

In [None]:
linreg.fit(X_train,y_train)

### 5) Make Predictions on Test Data

In [None]:
y_pred = linreg.predict(X_test)
y_pred

### 6) Evaluate Performance

In [None]:
# RMSE computed as the square root of the MSE: the `squared=` keyword of
# mean_squared_error was removed in scikit-learn 1.6, so this form works on
# old and new releases alike.
mean_squared_error(y_test, y_pred) ** 0.5

## Further investigate results

### Get Regression Coefficient

In [None]:
linreg.coef_

For each additional degree Celsius in the harvest season, the price of wine is increasing by 9.63 units!

### Calculate MAPE

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_test, y_pred)

### Draw Regression Line

In [None]:
import matplotlib.pyplot as plt

# Predict on the training data under a separate name so the test-set
# predictions stored in y_pred (used together with y_test in the evaluation
# cells) are not overwritten.
y_train_pred = linreg.predict(X_train)

# Create the figure before plotting: calling plt.figure() afterwards opens a
# new, empty figure and leaves the actual plot at the default size.
plt.figure(figsize=(20,10))
plt.scatter(X_train, y_train,  color='blue')
plt.plot(X_train, y_train_pred, color='green', linewidth=3)
plt.title("Wine Price by Harvest Temperatur")
plt.xlabel("Harvest Temperatur")
plt.ylabel("Price")

plt.show()

### Understanding Nature of Effects

scikit-learn does not calculate significance levels. If we want that information, we need to redo the analysis with a different software library ("statsmodels"). 

The following function is doing that for us: 

In [None]:
def get_summary_report(X_train, y_train):
    """Fit an ordinary least squares model with statsmodels.

    A constant column is added to the features with ``prepend=False``
    before the model is fitted.

    Args:
        X_train: Feature data used as regressors.
        y_train: Target values to regress on the features.

    Returns:
        The fitted results object returned by ``sm.OLS(...).fit()``;
        call ``.summary()`` on it to get the regression report.
    """
    import statsmodels.api as sm

    design_matrix = sm.add_constant(X_train, prepend=False)
    return sm.OLS(y_train, design_matrix).fit()

Let's execute that function! The regression coefficient is identical to the one estimated by scikit-learn above.

In [None]:
regression_results = get_summary_report(X_train, y_train)
print(regression_results.summary())

# Workflow for Multiple Linear Regression

### 1) Import Model Function

In [None]:
from sklearn.____ import ____
from sklearn.____ import ____

### 2) Instantiate Model

In [None]:
linreg_full = ____

### 3) Create Test and Training Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(____, y____, test_size=0.12, shuffle=____)

### 4) Fit Model to Data


In [None]:
linreg_full.____(____,____)

### 5) Make Predictions on Test Data

In [None]:
y_pred = linreg_full.____(____)
y_pred

### 6) Evaluate Performance

In [None]:
# RMSE computed as the square root of the MSE: the `squared=` keyword of
# mean_squared_error was removed in scikit-learn 1.6. Arguments follow the
# (y_true, y_pred) order of the scikit-learn signature.
mean_squared_error(y_test, ____) ** 0.5

In [None]:
mean_absolute_percentage_error(y_test, ____)

## Further Investigate Results

In [None]:
results = get_summary_report(____, ____)
print(results.summary())

In [None]:
X_train.corr()

purchasing.power and age are very strongly correlated: it is an almost perfect negative correlation (close to -1). Such high correlations can cause problems in many algorithms. All correlations above 0.7 or below -0.7 should be treated with caution.

In [None]:
X_train.plot(kind="scatter", x="purchasing.power",y="age")

## Re-run Model

Drop purchasing.power from the dataset

In [None]:
X = X.drop("purchasing.power", axis = 1)

Rerun model in identical fashion

In [None]:
____, ____, ____, ____ = train_test_split(X, y, test_size=0.12, shuffle=False)
linreg_full.____(____,____)
y_pred = linreg_full.____(____)
y_pred

In [None]:
results = get_summary_report(____, ____)
print(results.summary())

**Now we have much more intuitively understandable results!**

In [None]:
# RMSE computed as the square root of the MSE: the `squared=` keyword of
# mean_squared_error was removed in scikit-learn 1.6. Arguments follow the
# (y_true, y_pred) order of the scikit-learn signature.
print(mean_squared_error(y_test, y_pred) ** 0.5)
print(mean_absolute_percentage_error(y_test, y_pred))

## Actual Vs Predicted Plot

In [None]:
def actual_vs_predicted_plot(y_true, y_pred):
    """Scatter actual against predicted values with a diagonal reference line.

    Points lying on the green diagonal are predictions that equal the
    actual value.

    Args:
        y_true: Actual target values.
        y_pred: Predicted values, one per entry of ``y_true``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    # Smallest and largest value across both inputs; they are the end points
    # of the diagonal so it spans the full range of the data.
    min_value = np.min([np.min(y_true), np.min(y_pred)])
    max_value = np.max([np.max(y_true), np.max(y_pred)])

    fig = plt.figure()
    ax = fig.gca()
    ax.scatter(y_true, y_pred, color="blue")
    ax.plot([min_value, max_value], [min_value, max_value], lw=4, color="green")
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
actual_vs_predicted_plot(y_test, y_pred)

# Regression Tree

The logic is almost identical: we only have to import and instantiate the DecisionTreeRegressor - all else is copy & paste

### 1) Import Model Function

In [None]:
from sklearn.____ import ____
from sklearn.metrics import mean_squared_error

### 2) Instantiate Model

In [None]:
tree = ____(random_state=11)

We add random_state to get identical results everywhere (they can vary a bit for regression trees).

### 3) Create Test and Training Data

In [None]:
X_train, X_test, y_train, y_test = ____(____, ____, ____=0.12, shuffle=False)

### 4) Fit Model to Data

In [None]:
tree.____(____,____)

### 5) Make Predictions on Test Data

In [None]:
y_pred = tree.____(____)

### 6) Evaluate Performance

In [None]:
# RMSE computed as the square root of the MSE: the `squared=` keyword of
# mean_squared_error was removed in scikit-learn 1.6.
mean_squared_error(____, ____) ** 0.5

In [None]:
mean_absolute_percentage_error(____, ____)

The Decision Tree Regressor is so far our ____ model!

### Plot Tree

In [None]:
def plot_tree_regression(treemodel, X_train):
    """Draw a fitted tree model with ``sklearn.tree.plot_tree``.

    Args:
        treemodel: Fitted tree estimator to draw.
        X_train: Data whose ``columns`` supply the feature names shown
            in the tree nodes.
    """
    from sklearn import tree
    import matplotlib.pyplot as plt

    # Open a large figure first; plot_tree draws onto the current figure.
    plt.figure(figsize=(60,20))
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    tree.plot_tree(treemodel, feature_names=list(X_train.columns), filled=True, precision=2)

In [None]:
plot_tree_regression(tree, X_train)