In [3]:
import pandas as pd
from abc import ABC, abstractmethod
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from warnings import filterwarnings

In [4]:
wine_pro = pd.read_csv("../data/raw/winequalityN.csv", sep=';')
wine_pro.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
wine_pro.drop_duplicates()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [9]:
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density','pH', 'sulphates', 'alcohol']
target = 'quality'

In [10]:
#handling outliers
def remove_outliers_iqr(wine_pro, col):
    Q1 = wine_pro[col].quantile(0.25)
    Q3 = wine_pro[col].quantile(0.75)
    IQR = Q3 - Q1
    # Define bounds: 1.5 * IQR is standard practice
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filter the DataFrame for values within the bounds
    return wine_pro[(wine_pro[col] >= lower_bound) & (wine_pro[col] <= upper_bound)]

# Apply outlier removal to all features (excluding the final target 'quality')
for col in features:
    # We remove outliers iteratively
    wine_cleaned = remove_outliers_iqr(wine_cleaned, col)

final_rows = len(wine_cleaned)
removed_rows = initial_rows - final_rows

print(f"Rows removed due to outliers: {removed_rows}")
print(f"Cleaned Data Shape: {wine_cleaned.shape}")
print("-" * 50)

NameError: name 'wine_cleaned' is not defined

In [None]:
#Target Mapping
def categorize_quality(score):
    if score <= 4:
        return 0 #Low
    elif score <= 6:
        return 1 #medium
    else:
        return 2 #high

wine_pro["quality_category"] = wine_pro["quality"].apply(categorize_quality)
wine_pro["quality_label"] = wine_pro["quality_category"].map({0: "Low", 1: "Medium", 2: "High"})

In [None]:
#feature engineering

In [None]:
X = df_cleaned[FEATURES]
y = df_cleaned['is_good_quality']

In [None]:
# Stratified split to maintain the same proportion of 'Good'/'Bad' quality in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 50)

In [None]:
encoding , missing values

In [None]:
scaler = StandardScaler()

# Fit scaler only on training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for consistency
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

print("Features scaled successfully.")
print("-" * 50)

In [None]:
#models with the strategy pattern
# 1. Define the Strategy Interface
class ModelStrategy(ABC):
    """Abstract class defining the interface for all model strategies."""
    @abstractmethod
    def train(self, X_train, y_train):
        pass

    @abstractmethod
    def predict(self, X):
        pass

In [None]:
# 2. Implement Concrete Strategies
# <<<<<<<<<<<Each model encapsulates its own initialization and logic. >>>>>>>>>>>>>>>>>>>>>>>>

class LogisticRegressionStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = LogisticRegression(**kwargs)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X):
        return self.model.predict(X)

class SVCStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = SVC(**kwargs)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X):
        return self.model.predict(X)

class RandomForestStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = RandomForestClassifier(**kwargs)

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X):
        return self.model.predict(X)

In [None]:
# 3. The Context Class
# This class is configured with a specific strategy and works through the interface.
class ModelTrainer:
    def __init__(self, strategy: ModelStrategy):
        self.strategy = strategy

    def fit(self, X_train, y_train):
        self.strategy.train(X_train, y_train)

    def evaluate(self, X_test, y_test):
        predictions = self.strategy.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        return accuracy

In [None]:
if __name__ == "__main__":
    
    # Load data
    data = load_iris()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the strategies we want to test
    strategies = {
        "Logistic Regression": LogisticRegressionStrategy(max_iter=200),
        "SVC": SVCStrategy(probability=True),
        "Random Forest": RandomForestStrategy(n_estimators=100)
    }

    # Use the Context (ModelTrainer) to run each strategy
    for name, strategy in strategies.items():
        trainer = ModelTrainer(strategy)
        trainer.fit(X_train, y_train)
        accuracy = trainer.evaluate(X_test, y_test)
        print(f"{name} Accuracy: {accuracy:.4f}")