In [198]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [199]:
wine_red = pd.read_csv('./winequality-red.csv', sep=';')
wine_white = pd.read_csv('./winequality-white.csv', sep=';')

In [200]:
wine_red['type']='red'
wine_white['type']='white'

In [201]:
wine = pd.concat([wine_red, wine_white])

In [202]:
original_rows = wine.shape[0]
original_columns = wine.shape[1]
print(f'Dataset has {original_rows} rows and {original_columns} columns.')

Dataset has 6497 rows and 13 columns.


In [203]:
wine.rename(columns={"residual sugar" : "sugar", "quality": "target"}, inplace=True)
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [204]:
wine.isna().sum()

fixed acidity           1
volatile acidity        0
citric acid             2
sugar                   0
chlorides               1
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      2
sulphates               0
alcohol                 0
target                  0
type                    0
dtype: int64

In [205]:
wine['fixed acidity'] = wine['fixed acidity'].fillna(wine['fixed acidity'].mean())

In [206]:
wine.dropna(axis=0, inplace=True)

In [207]:
wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
sugar                   0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
target                  0
type                    0
dtype: int64

In [208]:
wine['target'].value_counts().sort_index()

target
3      30
4     216
5    2135
6    2835
7    1078
8     193
9       5
Name: count, dtype: int64

In [209]:
wine = wine[(wine['target']>3) & (wine['target']<8)]

In [210]:
updated_rows = wine.shape[0]
updated_columns = wine.shape[1]
print(f'Dataset has {updated_rows} rows and {updated_columns} columns.')
print(f'Dataset has {original_rows-updated_rows} less rows.')

Dataset has 6264 rows and 13 columns.
Dataset has 233 less rows.


In [211]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [212]:
sugar = {}
unique_targets = wine['target'].unique()
for target in unique_targets:
    sugar[target] = {'min' : wine[wine['target']==target]['sugar'].min(),
                     'max': wine[wine['target']==target]['sugar'].max()}
print(sugar)

{5: {'min': 0.6, 'max': 23.5}, 6: {'min': 0.7, 'max': 65.8}, 7: {'min': 0.9, 'max': 19.25}, 4: {'min': 0.7, 'max': 17.55}}


In [213]:
wine.to_csv('wine_combined.csv', index=False)

In [214]:
features = list(wine.columns)
features.remove('target')

X = wine[features]
y = wine['target']

In [215]:
X = pd.get_dummies(X, columns=['type'], dtype=int, prefix='type')

In [216]:
X_scaled = StandardScaler().fit_transform(X)

In [217]:
for col in range(1, X_scaled.shape[1]+1):
    pca = PCA(n_components=col)
    principal_components = pca.fit_transform(X_scaled)
    df_pca = pd.DataFrame(principal_components)
    total = np.sum(pca.explained_variance_ratio_)
    print (f'Components {col} has ratio of {total}')
    if (total>0.90):
        print(f'Using {col} components for PCA')
        break

Components 1 has ratio of 0.36287186994784765


Components 2 has ratio of 0.5575059442519583
Components 3 has ratio of 0.6783992299149503
Components 4 has ratio of 0.7520179450752006
Components 5 has ratio of 0.8083630462978912
Components 6 has ratio of 0.855403590710867
Components 7 has ratio of 0.8973664675696449
Components 8 has ratio of 0.9357289127902951
Using 8 components for PCA


In [218]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(df_pca, y, test_size=0.3, random_state=42)

In [219]:
def train(X_train, y_train):
    X_T = X_train.T
    beta = np.linalg.inv(X_T.dot(X_train)).dot(X_T).dot(y_train)
    return beta

In [220]:
model_without_pca = train(X_train, y_train)
model_with_pca = train(X_train_pca, y_train_pca)

In [221]:
def calculate_rmse(beta, X_test, y_test):
    y_pred = X_test.dot(beta)
    rmse = np.sqrt(np.mean((y_pred-y_test)**2))
    return rmse

In [222]:
rmse_without_pca = calculate_rmse(model_without_pca, X_test, y_test)
rmse_with_pca = calculate_rmse(model_with_pca, X_test_pca, y_test_pca)

In [223]:
print(rmse_without_pca)
print(rmse_with_pca)

0.6424996124899546
5.969499304503357
