# Author
## Noah Call
## A02361280
This notebook expects the data file at "../data/wine-quality-white.csv" (relative to the notebook's working directory). The location can be changed through `data_path` in the first code cell, under "Config & Imports".

## Config & Imports

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Path to the wine-quality CSV that the loading cell reads; being a relative
# path, it is resolved against the working directory the notebook runs in.
data_path = "../data/wine-quality-white.csv"

## 1.

In [6]:
# Read the raw CSV, parsed with ';' as the field separator
df = pd.read_csv(data_path, sep=";")

# Derive the binary label y: 0 when quality <= 5, otherwise 1.
# The "<= 5" mask is negated (rather than testing "> 5") so the rule stays
# exactly "everything that is not <= 5 goes to class 1".
df["y"] = (~df["quality"].le(5)).astype("int64")

# Preview the score next to its derived label
df[["quality", "y"]].head()

Unnamed: 0,quality,y
0,6,1
1,6,1
2,6,1
3,6,1
4,6,1


## 2.

In [7]:
# Share of rows in each target class; normalize=True returns fractions
# that sum to 1 instead of raw counts
class_counts = df["y"].value_counts(normalize=True)

# Look the two classes up by their label (0 / 1), not by position
class_counts_0 = class_counts.loc[0]
class_counts_1 = class_counts.loc[1]

print(f"Class 0 proportion: {class_counts_0}")
print(f"Class 1 proportion: {class_counts_1}")

Class 0 proportion: 0.33483054307880766
Class 1 proportion: 0.6651694569211923


## 3.

In [8]:
# First cut: hold out 20% of all rows as the test set, stratified on y
train_val, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["y"],
    random_state=42,
)

# Second cut: a quarter of the remaining 80% becomes the validation set
# (0.25 * 0.8 = 0.2 of the full data), which leaves 60% for training
train, val = train_test_split(
    train_val,
    test_size=0.25,
    stratify=train_val["y"],
    random_state=42,
)

# Report how many rows ended up in each split
print(f"Train set size: {len(train)}")
print(f"Validation set size: {len(val)}")
print(f"Test set size: {len(test)}")

Train set size: 2938
Validation set size: 980
Test set size: 980


## 4.

In [10]:
# Feature columns: every column except the raw score and the derived label.
# The two are removed by name rather than by position, so 'quality' and 'y'
# can never end up among the features regardless of column order; a missing
# label raises a KeyError instead of silently selecting the wrong columns.
features = df.columns.drop(['quality', 'y'])

# Fit the scaler on the training split only; validation and test rows are
# transformed with the statistics learned from the training rows.
scaler = StandardScaler()
scaler.fit(train[features])

# Scale copies so the unscaled train / val / test frames remain available
train_scaled = train.copy()
val_scaled = val.copy()
test_scaled = test.copy()

# Only the feature columns are overwritten; 'quality' and 'y' pass through unchanged
train_scaled[features] = scaler.transform(train[features])
val_scaled[features] = scaler.transform(val[features])
test_scaled[features] = scaler.transform(test[features])

# Check the result
train_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,y
4755,-0.323928,0.125337,3.317193,-0.814913,-0.856259,-0.814745,-1.048715,-1.33309,-0.444481,-0.104704,1.514126,6,1
4003,-0.084927,-1.383225,0.143173,-0.95317,0.075184,0.281572,-0.470623,-0.963156,-0.779307,1.950671,0.458833,6,1
2351,1.229578,-0.276946,0.059646,1.002178,-0.250821,1.031683,1.494889,1.026485,-0.511447,0.666062,-0.758812,6,1
2434,1.70758,0.024766,-0.525042,2.206988,-0.01796,0.743179,2.674196,2.046301,-1.114133,0.580421,-1.164694,6,1
299,-0.204428,-1.081512,1.396076,-0.834664,-0.111104,-0.46854,-0.355005,-0.566561,-0.243586,-0.361626,-0.190577,6,1
