In [1]:
#pip install imblearn

In [2]:
import pandas as pd
from abc import ABC, abstractmethod
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from warnings import filterwarnings

In [3]:
class DataLoader(ABC):

    @abstractmethod
    def load_data(self):
        pass

    def _clean_data(self, df):
        """ENCAPSULATION: A protected method for internal cleaning steps."""
        df.columns = [col.lower().replace(' ', '_') for col in df.columns]

        print("Basic cleaning applied.")
        return df

class WineCleanCSVLoader(DataLoader):

    def __init__(self, filepath, target_col="quality", sep=','):
        self.filepath = filepath
        self.target_col = target_col
        self.sep = sep  # <-- store separator

    def load_data(self):
        df = pd.read_csv(self.filepath, sep=self.sep)
        print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns from {self.filepath}")
        df = self._clean_data(df)
        return df

In [4]:
loader = WineCleanCSVLoader(filepath="../data/processed/winequality_with_quality_category.csv", sep=',', target_col="quality_category")
wine_pro = loader.load_data()
wine_pro.head()

Loaded 1599 rows and 13 columns from ../data/processed/winequality_with_quality_category.csv
Basic cleaning applied.


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,quality_category
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0,1.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5.0,1.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5.0,1.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6.0,1.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0,1.0


In [5]:
# Then split into X and y
X = wine_pro.drop(columns=["quality", "quality_category"])  # or keep "quality" if needed
y = wine_pro["quality_category"]

# Ensure no NaNs
mask = ~y.isna()
X = X[mask]
y = y[mask]

In [6]:
# Stratified split to maintain the same proportion of 'Good'/'Bad' quality in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 50)

X_train shape: (1256, 11)
X_test shape: (315, 11)
--------------------------------------------------


### Building a pipeline

In [7]:
mi_selector = SelectKBest(score_func=mutual_info_classif, k=10)  
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=10)

In [8]:
pipeline = ImbPipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    ("smote", SMOTE(random_state=42)),
    ("feature_selection", mi_selector)
])

In [9]:
#fit the train
X_train_proc, y_train_proc = pipeline.fit_resample(X_train, y_train)

X_test_proc = pipeline.transform(X_test)

AttributeError: This 'Pipeline' has no attribute 'fit_resample'

In [None]:
# Implement Concrete Strategies
# <<<<<<<<<<<Each model encapsulates its own initialization and logic. >>>>>>>>>>>>>>>>>>>>>>>>

class ModelStrategy(ABC):
    @abstractmethod
    def train(self, X, y):
        pass
    
    @abstractmethod
    def predict(self, X):
        pass

# Concrete Strategies
class LogisticRegressionStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = LogisticRegression(multi_class='multinomial', **kwargs)
    
    def train(self, X, y):
        self.model.fit(X, y)
    
    def predict(self, X):
        return self.model.predict(X)

class RandomForestStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = RandomForestClassifier(**kwargs)
    
    def train(self, X, y):
        self.model.fit(X, y)
    
    def predict(self, X):
        return self.model.predict(X)

class SVCStrategy(ModelStrategy):
    def __init__(self, **kwargs):
        self.model = SVC(probability=True, **kwargs)
    
    def train(self, X, y):
        self.model.fit(X, y)
    
    def predict(self, X):
        return self.model.predict(X)

# Similarly, you can define GradientBoostingStrategy, KNNStrategy, etc.


In [None]:
# 3. The Context Class
# This class is configured with a specific strategy and works through the interface.
class ModelTrainer:
    def __init__(self, strategy: ModelStrategy):
        self.strategy = strategy

    def fit(self, X_train, y_train):
        self.strategy.train(X_train, y_train)

    def evaluate(self, X_test, y_test):
        predictions = self.strategy.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        return accuracy

In [None]:
if __name__ == "__main__":
    
    # Load data
    data = load_iris()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the strategies we want to test
    strategies = {
        "Logistic Regression": LogisticRegressionStrategy(max_iter=200),
        "SVC": SVCStrategy(probability=True),
        "Random Forest": RandomForestStrategy(n_estimators=100)
    }

    # Use the Context (ModelTrainer) to run each strategy
    for name, strategy in strategies.items():
        trainer = ModelTrainer(strategy)
        trainer.fit(X_train, y_train)
        accuracy = trainer.evaluate(X_test, y_test)
        print(f"{name} Accuracy: {accuracy:.4f}")