In [27]:
import logging
from abc import ABC, abstractmethod
from typing import Tuple, Union

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


class DataStrategy(ABC):
    """
    Abstract base class for data-handling strategies.

    Concrete strategies implement ``handle_data``. The declared return type
    covers both a single frame/series and the 4-tuple produced by a strategy
    that splits the data into train and test parts.
    """

    @abstractmethod
    def handle_data(
        self, data: pd.DataFrame
    ) -> Union[
        pd.DataFrame,
        pd.Series,
        Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
    ]:
        """
        Process ``data`` and return the result.

        Args:
            data: Input DataFrame to process.

        Returns:
            A DataFrame or Series, or a 4-tuple of
            (DataFrame, DataFrame, Series, Series) for splitting strategies.
        """

class DataPreProcessStrategy(DataStrategy):
    """
    Class for cleaning data
    """

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess data without modifying the caller's DataFrame.

        Drops the 'region', 'children' and 'sex' columns and label-encodes
        the 'smoker' column.

        Args:
            data: Frame containing at least the 'region', 'children', 'sex'
                and 'smoker' columns.

        Returns:
            A new DataFrame without the dropped columns and with 'smoker'
            label-encoded.

        Raises:
            Exception: Any error raised while cleaning is logged and
                re-raised unchanged.
        """
        try:
            # Dropping less correlated columns based on the correlation matrix.
            # drop() returns a new frame, so the assignment below never writes
            # into the DataFrame owned by the caller.
            cleaned = data.drop(['region', 'children', 'sex'], axis=1)

            # 'sex' is dropped above, so only 'smoker' needs encoding.
            cleaned['smoker'] = LabelEncoder().fit_transform(cleaned['smoker'])

            return cleaned
        except Exception as e:
            logging.error(f"Error in cleaning data: {e}")
            raise
        
class DataDivideStrategy(DataStrategy):
    """
    Split the data into train and test sets
    """

    def __init__(self, test_size: float = 0.2, random_state: Union[int, None] = 42):
        """
        Configure the split.

        Args:
            test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            random_state: Value forwarded to ``train_test_split`` as
                ``random_state``.
        """
        self.test_size = test_size
        self.random_state = random_state

    def handle_data(
        self, data: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Split the data into training and test sets and return them.

        Args:
            data: Frame containing the 'charges' target column and the
                'smoker', 'age' and 'bmi' feature columns.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            Exception: Any error raised while splitting is logged and
                re-raised unchanged.
        """
        try:
            y = data['charges']  # Target column
            X = data[['smoker', 'age', 'bmi']]  # Features
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.test_size, random_state=self.random_state
            )
            return X_train, X_test, y_train, y_test
        except Exception as e:
            logging.error(f"Error in splitting data: {e}")
            raise


class DataCleaning:
    """
    Class to clean and split the data

    Holds a DataFrame together with the strategy that will process it.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy):
        """
        Args:
            data: Frame handed to the strategy when ``handle_data`` is called.
            strategy: Strategy object whose ``handle_data`` does the work.
        """
        self.data = data
        self.strategy = strategy

    def handle_data(
        self,
    ) -> Union[
        pd.DataFrame,
        pd.Series,
        Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series],
    ]:
        """
        Run the configured strategy on the stored data.

        Returns:
            Whatever the strategy returns: a DataFrame or Series, or the
            4-tuple produced by a splitting strategy.

        Raises:
            Exception: Any error raised by the strategy is logged and
                re-raised unchanged.
        """
        try:
            return self.strategy.handle_data(self.data)
        except Exception as e:
            logging.error(f"Error in cleaning data: {e}")
            raise

In [3]:
# Three hand-written sample records, laid out column by column.
df_test = pd.DataFrame({
    "age": [19, 18, 28],
    "sex": ["female", "male", "male"],
    "bmi": [27.9, 33.77, 33],
    "children": [0, 1, 3],
    "smoker": ["yes", "no", "no"],
    "region": ["southwest", "southeast", "southeast"],
    "charges": [16884.924, 1725.5523, 4449.462],
})

# Bare expression so the notebook renders the frame.
df_test

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [4]:
# Instantiate the preprocessing strategy for a manual check.
cleaning_test = DataPreProcessStrategy()

In [7]:
# Run the preprocessing strategy on the sample frame.
df_cleaned = cleaning_test.handle_data(df_test)

In [35]:
# Display the preprocessed frame.
df_cleaned

Unnamed: 0,age,bmi,smoker,charges
0,19,27.9,1,16884.924
1,18,33.77,0,1725.5523
2,28,33.0,0,4449.462


In [6]:
# Instantiate the train/test splitting strategy for a manual check.
split_data = DataDivideStrategy()

In [28]:
# handle_data returns (X_train, X_test, y_train, y_test), so a, b, c, d
# receive those four values in that order.
a, b, c, d = split_data.handle_data(df_cleaned)

In [30]:
# Display the training features (first value of the split).
a

Unnamed: 0,smoker,age,bmi
1,0,18,33.77
2,0,28,33.0


In [38]:
# Display the test features (second value of the split).
b

Unnamed: 0,smoker,age,bmi
0,1,19,27.9


In [32]:
# Display the training target values (third value of the split).
c

1    1725.5523
2    4449.4620
Name: charges, dtype: float64

In [33]:
# Display the test target values (fourth value of the split).
d

0    16884.924
Name: charges, dtype: float64