## Exploring the White Wine Dataset

- Which physicochemical property contributes most to high-quality wine? Which contributes least?
- What is the ideal numeric value of that property that will generate the highest quality wine? 


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import QuantileTransformer
from pandas import DataFrame as df

In [2]:
# Read the semicolon-delimited white wine quality CSV.
white = pd.read_csv("winequality_white.csv", sep=';', encoding='UTF8')
# Drop columns in which every value is null.
white = white.dropna(axis='columns', how='all')
# Drop rows containing a null value. dropna() returns a new frame, so the
# result has to be assigned back or the rows are never actually removed.
white = white.dropna()
white.head()

FileNotFoundError: [Errno 2] No such file or directory: 'winequality_white.csv'

In [None]:
# Summary statistics of the raw data, shown rounded to whole numbers.
metadata = white.describe()
metadata.round(decimals=0)

In [None]:
# Examine the distribution of each property, one histogram per column.
# Observations from the plots:
#   - pH appears to be the closest to normally distributed.
#   - Chlorides, free sulfur dioxide and residual sugar appear to be the
#     furthest from normally distributed.
white.hist(figsize=(12,10))

In [None]:
# Explore white wine quality scores (scale of 1-10): count how many wines
# received each score, then list the counts in ascending order of score.
quality_score_summary = white["quality"].value_counts()
quality_score_summary.sort_index(ascending=True)

Removing outliers with the three-sigma rule (rows more than 3 standard deviations from a column's mean are dropped)

In [None]:

def remove_outlier(col_name, df, n_std=3):
    """Return the rows of ``df`` whose ``col_name`` value lies near the column mean.

    A row is kept when its value is strictly within ``n_std`` standard
    deviations of the mean of ``df[col_name]``. The mean and standard
    deviation are computed from the frame that is passed in.

    Parameters
    ----------
    col_name : label
        Column to screen for outliers.
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the rows inside the band; all columns are kept.
    """
    column = df[col_name]
    center = column.mean()
    half_width = n_std * column.std()

    # Strict inequalities on both sides: values exactly on the boundary are dropped.
    inside_band = (column > (center - half_width)) & (column < (center + half_width))
    return df[inside_band]



In [None]:
# df is the white dataset with outliers removed for all columns.
# Start from the full frame and narrow it one column at a time. Each pass has
# to be assigned back to df; otherwise the filtered frame is thrown away and
# df stays identical to white.
df = white
for i in list(white.columns):
    print(i)
    df = remove_outlier(i, df)

df.round(2)

In [None]:
# Summary statistics of df, for comparison with the describe() of the raw data above.
df.describe()

## Multicollinearity Assessment


In [None]:
# Multicollinearity assessed through the Variance Inflation Factor (VIF)

In [None]:
# CHECK FOR MULTICOLLINEARITY with Variance Inflation Factor
# Build the predictor matrix here (every column except the target) so this
# cell does not depend on an X assigned by a cell further down the notebook.
X = df.drop("quality", axis=1)

# One VIF per predictor column, labelled with the column name.
# NOTE(review): no intercept column is added to X before computing the VIFs —
# confirm against the statsmodels documentation that this is intended.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["Physiochemical Variables"] = X.columns

In [None]:
# TODO: reset (or drop) the index when cleaning up the notebook.
# Order the table from lowest to highest VIF and show one decimal place.
vif = vif.sort_values("VIF Factor").round(1)
vif

Multicollinearity assessment through correlation + visual

In [None]:
# Pairwise correlations between the predictors only (quality is left out).
phys_c = df.drop(columns=["quality"])
phys_corr = phys_c.corr()
phys_corr

In [None]:
# Displays the correlation (r, not r^2 like VIF) between each pair of
# independent variables (the physicochemical variables; quality is excluded)
# to provide additional evidence of collinearity.

import seaborn as sns

# plt.subplots returns (figure, axes); draw the heatmap on that axes so the
# requested figure size is the one that is actually used.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(phys_corr, square=True, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title("Correlation among the predictors")

## Residuals
- Residual sugar is the least correlated with the remaining variables. Most of the variables are highly correlated with one another, especially density.
- Residuals appear to be normal (symmetrical), but there is an outlier that may be skewing the model.


In [None]:
# NOTE(review): model, X_train, X_test, y_train, y_test and y are first assigned
# in the "Multiple Regression Analysis" cells further down, so this cell fails
# on a fresh top-to-bottom run. The same plot is drawn again after the fit.

# Residual here is (predicted - actual), plotted against the predicted value.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
plt.scatter(train_pred, train_pred - y_train, c="blue", label="Training Data")
plt.scatter(test_pred, test_pred - y_test, c="orange", label="Testing Data")
plt.legend()

# Zero-residual reference line, spanning the range of the observed target.
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.xlabel("Predicted Values")
plt.ylabel("Residual Values")
plt.title("Residual Plot")


## Multiple Regression Analysis

### Scaling Transformation with QuantileTransformer
- Non-linear transformation that will help balance the independent variables' distributions
- Useful in the presence of outliers

In [None]:
# QuantileTransformer and pandas (as pd) are already imported in the first cell.
# DataFrame must not be re-imported under the name `df` here: `df` holds the
# wine data, and rebinding it to the class breaks this cell and later ones.
trans = QuantileTransformer(n_quantiles=3000, output_distribution='normal')

# Map every column of df onto a normal distribution via its quantiles.
data = trans.fit_transform(df)

# Wrap the transformed values in a frame that keeps df's column names and row
# index, so the histograms in the next cell are labelled.
dataset = pd.DataFrame(data, columns=df.columns, index=df.index)

In [None]:
# Visual confirmation that the distributions have changed: one histogram per
# column of the transformed data.
dataset.hist()
plt.show()

## Multiple Regression Analysis

In [None]:
# Predictors: the first 11 columns of df. Target: quality, as a column vector.
X = df.iloc[:, 0:11]
y = df["quality"].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator; it is fit on the training split below.
model = LinearRegression()

In [None]:
from sklearn.model_selection import train_test_split
# Fixed random_state so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit on the training split, then score the fitted model on both splits.
model.fit(X_train, y_train)
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

In [None]:
# Residual plot for the fitted model: predicted value on the x-axis,
# (predicted - actual) on the y-axis, one colour per split.
for features, target, colour, label in (
    (X_train, y_train, "blue", "Training Data"),
    (X_test, y_test, "orange", "Testing Data"),
):
    predicted = model.predict(features)
    plt.scatter(predicted, predicted - target, c=colour, label=label)

plt.legend()

# Zero-residual reference line, spanning the range of the observed target.
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.xlabel("Predicted Values")
plt.ylabel("Residual Values")
plt.title("Residual Plot")

Scaling input and target variables through the pipeline
https://machinelearningmastery.com/how-to-transform-target-variables-for-regression-with-scikit-learn/

- Input variable scaling is accounted for
- Target scaling will be done through TransformedTargetRegressor
- Incorporate QuantileTransformer?

In [None]:
# Transform the scale of the input variables (the features).

from sklearn.preprocessing import QuantileTransformer

# Inputs are the first 11 columns of the raw data.
X = white.iloc[:, 0:11]
qt = QuantileTransformer(n_quantiles=3000, random_state=42)
x_transformed = qt.fit_transform(X)

# TODO: open question — does random_state matter here?

In [None]:
# Histograms of the quantile-transformed inputs from the previous cell.
# The frame is built with pd.DataFrame because `df` is used in this notebook
# as the name of the wine data, so it cannot double as the constructor. The
# earlier re-transform of `data` produced a value nothing read and was dropped.
dataset = pd.DataFrame(x_transformed, columns=X.columns)
dataset.hist()
plt.show()

In [None]:
from numpy import mean
from numpy import absolute
from numpy import loadtxt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression

from sklearn.compose import TransformedTargetRegressor

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

# Split into inputs (first 11 columns) and output (quality, as a column vector).
X = white[white.columns[0:11]]
y = white["quality"].values.reshape(-1, 1)

# Hold out a test split for the pipeline below. These names were previously
# used without ever being assigned; the split mirrors the one used elsewhere
# in the notebook (same random_state).
train_x, test_x, train_y, test_y = train_test_split(X, y, random_state=42)

# Transform the input variables' scale inside the pipeline, then regress.
pipeline = Pipeline(steps=[('normalize', QuantileTransformer()), ('model', LinearRegression())])

# fit pipeline
pipeline.fit(train_x, train_y)

# make predictions
yhat = pipeline.predict(test_x)

# Second pipeline: power-transform the inputs instead of quantile-transforming.
pipeline = Pipeline(steps=[('power', PowerTransformer()), ('model', LinearRegression())])

# Prepare the model with target scaling: the target is power-transformed
# around the pipeline above. The model is not fit in this cell.
model = TransformedTargetRegressor(regressor=pipeline, transformer=PowerTransformer())


# TODO: evaluation is still disabled; it also needs
# `from sklearn.model_selection import KFold` before it can run.
# # evaluate model
# cv = KFold(n_splits=10, shuffle=True, random_state=1)
# scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# # convert scores to positive
# scores = absolute(scores)

# # summarize the result
# s_mean = mean(scores)
# print('Mean MAE: %.3f' % (s_mean))

# Principal Component Analysis (PCA)

- Re-run when outliers are removed from the dataset
- Re-run with different scaling

In [None]:
from sklearn.decomposition import PCA

# Project the predictors onto 10 principal components.
pca = PCA(n_components=10)
pc = pca.fit_transform(X)

# Shapes before and after the projection.
print(X.shape)
print(pc.shape)

In [None]:
# Give the component frame the same row index as white, so the column-wise
# concat pairs every row with its own quality score even when white's index
# is not a plain 0..n-1 range (for example after rows have been dropped).
pc_df = pd.DataFrame(pc, index=white.index)
pc_df = pd.concat([pc_df, white[['quality']]], axis = 1)
pc_df

In [None]:
# Predictors are the principal-component columns only. pc_df holds the 10
# components followed by quality, so slicing the first 11 columns would put
# the target inside the feature matrix and leak it into the model.
pcX = pc_df.drop(columns="quality")
pcy = pc_df["quality"].values.reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split
# Split the principal-component features and the target; this rebinds
# X_train/X_test/y_train/y_test used by the earlier regression cells.
X_train, X_test, y_train, y_test = train_test_split(pcX, pcy, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fit on the min-max scaled principal components.
# Rebinds `model`, replacing the estimator created in the earlier cells.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the scaled test split and round to the nearest whole number,
# since quality scores are whole numbers.
prediction7 = model.predict(X_test_scaled)
rounded7 = np.rint(prediction7)

# MSE is computed on the rounded predictions; r2 comes from model.score,
# which uses the model's own (unrounded) predictions.
MSE = mean_squared_error(y_test, rounded7)
r2 = model.score(X_test_scaled, y_test)

print(f"MSE: {MSE}, R2: {r2}")

# Ordinal Regressions 
- I selected ordinal regression because the target variable is ordinal, not continuous.

- After trying three types of ordinal regression (logistic, OrdinalRidge and LAD), logistic seems to be the strongest.

- Outliers weren't removed for these analyses
- Used MinMaxScaling, but other scaling types may strengthen the model

** Double check negative values

Threshold-based model (LogisticIT)

In [None]:
from sklearn.model_selection import train_test_split
# Split X and y as last assigned above (not the principal components),
# rebinding the split names used by the PCA section.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [None]:
import mord as m
# Threshold-based ordinal logistic model (per the section heading); fit in the next cell.
classifier = m.LogisticIT()

In [None]:
# Fit the ordinal model on the scaled training split.
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on each split.
# NOTE(review): the section notes flag negative values to double-check —
# confirm which metric mord's score() reports for this model.
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Regression based model

-Ordinal Ridge

In [None]:
from sklearn.model_selection import train_test_split
# Same call (X, y, random_state=42) as the split in the LogisticIT section above.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [None]:
import mord as m
# Regression-based ordinal model (per the section heading), fit on the
# scaled training split. Rebinds `classifier` from the previous section.
classifier = m.OrdinalRidge()
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the OrdinalRidge model's score on each split.
# NOTE(review): confirm which metric score() reports before comparing these
# numbers with the other ordinal models.
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Regression based model

- LAD :
The Least Absolute Deviation model instead minimizes the absolute value of the residuals, i.e. MAE regression.

This provides a more robust solution when outliers are present, but it does have some undesirable properties, most notably that there are some situations where there is no unique solution, and in fact an infinite number of different regression lines are possible.

In [None]:
from sklearn.model_selection import train_test_split
# Same call (X, y, random_state=42) as the splits in the two sections above.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [None]:
import mord as m
# Least Absolute Deviation model (described in the section text), fit on the
# scaled training split. Rebinds `classifier` from the previous section.
classifier = m.LAD()
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the LAD model's score on each split.
# NOTE(review): confirm which metric score() reports before comparing these
# numbers with the other ordinal models.
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")