In [25]:
from sklearn.impute import SimpleImputer 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error,r2_score
from sklearn.linear_model import *
from sklearn.preprocessing import PowerTransformer,RobustScaler,StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

#loading the data
class Loading:
    """ 
    Class to load data from a specified path using a given read method.
    
    Attributes:
        path (str): The path to the data file.
        read_method (function): The method used to read the data.
        data (DataFrame): The loaded data.
    """
    def __init__(self, path_,read_method_):
        """ 
        Initializes the Loading class with a file path and read method.

        Args:
            path_ (str): The file path to the data.
            read_method_ (function): The method used to read the data.
        """
        self.path=path_
        self.read_method=read_method_
        self.data=self.getData()
    
    def getData(self):
        """
        Loads the data from the specified path using the read method.

        Returns:
            DataFrame: The loaded data.
        """
        return self.read_method(self.path)

#filling the data
class FillNa:
    """
    Class to fill or drop missing values (NaN) based on the specified strategy.

    Attributes:
        data (DataFrame): The data to fill/drop missing values.
        strategy (str): The strategy to use for filling or dropping missing values.
        filledData (DataFrame): The data after filling/dropping missing values.
    """
    def __init__(self,data,strategy):
        """
        Initializes the FillNa class with data and filling strategy.

        Args:
            data (DataFrame): The data to be filled/dropped.
            strategy (str): The strategy to fill missing values, e.g., 'mean', 'drop'.
        """
        self.data=data
        self.strategy=strategy
        self.filledData=self.apply()

    def apply(self):
        """
        Applies the filling or dropping method based on the strategy.

        Returns:
            DataFrame: Data after filling or dropping NaN values.
        """
        if self.strategy == "drop":
            return self.dropNa()
        else:
            return self.fillingNullValues()
        
    def fillingNullValues(self):
        """
        Fills missing numerical values using SimpleImputer.

        Returns:
            DataFrame: Data with filled missing values.
        """
        data = pd.DataFrame(self.data) 
        numerical_cols = data.select_dtypes(include=[np.number]).columns
        imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
        data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
        return data

    def dropNa(self):
        """
        Drops rows containing missing values.

        Returns:
            DataFrame: Data after dropping rows with NaN values.
        """
        return self.data.dropna()

#onehotencoding the data
class OneHotEncoding:
    """
    Class to apply OneHotEncoding on categorical columns of the data.

    Attributes:
        data (DataFrame): The data to be encoded.
        categoricalColumns (list): The categorical columns to encode.
        oneHotEncodedData (DataFrame): The data after encoding.
    """
    def __init__(self,data,categoricalColumns):
        """
        Initializes the OneHotEncoding class with data and categorical columns.

        Args:
            data (DataFrame): The data to encode.
            categoricalColumns (list): List of columns to apply one-hot encoding.
        """
        self.data=data
        self.categoricalColumns=categoricalColumns
        self.oneHotEncodedData=self.oneHotEncoding()

    def oneHotEncoding(self):
        """
        Performs OneHotEncoding on the specified categorical columns.

        Returns:
            DataFrame: Data after one-hot encoding.
        """
        ohe=OneHotEncoder(handle_unknown='ignore',sparse_output=False).set_output(transform='pandas')
        ohetransform=ohe.fit_transform(self.data[self.categoricalColumns]).astype(int)
        data= pd.concat([self.data,ohetransform],axis=1).drop(columns=self.categoricalColumns)    
        return pd.DataFrame(data)

#finding outliers
class Outlier:
    """
    Class to detect and remove outliers using specified method (IQR).

    Attributes:
        oneHotEncodedData (DataFrame): Data to clean.
        method (str): The method to use for outlier detection.
        cleanedData (DataFrame): The data after outlier removal.
    """
    def __init__(self,oneHotEncodedData,method):
        """
        Initializes the Outlier class with data and outlier detection method.

        Args:
            oneHotEncodedData (DataFrame): The data to detect and remove outliers.
            method (str): The method to use for outlier detection (e.g., 'iqr').
        """
        self.oneHotEncodedData=oneHotEncodedData
        self.method=method
        if(method_outlier_=="iqr"):
            self.cleanedData=self.iqrOutlierDetectionAndRemoval()

    def iqrOutlierDetectionAndRemoval(self):
        """
        Removes outliers using IQR method.

        Returns:
            DataFrame: Data after outlier removal.
        """
        data=pd.DataFrame(self.oneHotEncodedData)
        dffeatureNames=data.columns
        cleaned_df = data.copy()
        for col in dffeatureNames:
                Q1 = cleaned_df[col].quantile(0.25)
                Q3 = cleaned_df[col].quantile(0.75)
                IQR = Q3 - Q1        
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                cleaned_df = cleaned_df[(cleaned_df[col] >= lower_bound) & (cleaned_df[col] <= upper_bound)]

        cleaned_df = cleaned_df.reset_index(drop=True)
        return pd.DataFrame(cleaned_df)
        
#splitting into train and test
class SplittingData:
    """
    Class to split the dataset into features and target variable.

    Attributes:
        data (DataFrame): The data to split.
        targetColumn (str): The target column for prediction.
        X (DataFrame): Feature data.
        y (Series): Target data.
    """
    def __init__(self,data,targetColumn):
        """
        Initializes the SplittingData class with data and target column.

        Args:
            data (DataFrame): The dataset.
            targetColumn (str): The target column name.
        """
        self.data=data
        self.targetColumn=targetColumn
        self.X,self.y=self.splitData()
        
    def splitData(self):
        """
        Splits data into features (X) and target variable (y).

        Returns:
            Tuple[DataFrame, Series]: Feature and target data.
        """
        X = self.data.drop(columns=self.targetColumn)  
        y = self.data[self.targetColumn]
        return X,y

#splitting train and test
class TrainTestSplit:
    """
    Class to split data into training and testing sets.

    Attributes:
        X (DataFrame): Feature data.
        y (Series): Target data.
        testingSize (float): Test size proportion.
        randomState (int): Random seed for reproducibility.
    """
    def __init__(self, X,y,**kwargs):
        """
        Initializes the TrainTestSplit class with data and split parameters.

        Args:
            X (DataFrame): Feature data.
            y (Series): Target data.
            testingSize (float): Proportion of test data.
            randomState (int): Seed for reproducibility.
        """
        self.X=X
        self.y=y
        self.testingSize=kwargs['testingSize']
        self.randomState=kwargs['random_State']
        self.xTrain, self.xTest, self.yTrain, self.yTest = self.trainTestSplit()
        
    def trainTestSplit(self):
        """
        Splits data into training and testing sets based on specified parameters.

        Returns:
            Tuple: xTrain, xTest, yTrain, yTest.
        """
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.testingSize, random_state=self.randomState)
        return X_train, X_test, y_train, y_test

#standardization xtrain
class Standardization:
    """
    Class to apply standardization (PowerTransformation) on training and test data.

    Attributes:
        xTrain (DataFrame): Training data.
        xTest (DataFrame): Testing data.
        method (str): Transformation method.
    """
    def __init__(self, **kwargs): 
        """
        Initializes the Standardization class with data and transformation method.

        Args:
            Xtrain (DataFrame): Training data.
            XTest (DataFrame): Testing data.
            method (str): Transformation method, e.g., 'yeo-johnson'.
        """
        self.xTrain = kwargs['Xtrain']  
        self.xTest = kwargs['XTest']
        self.method = kwargs['method']
        self.pt = None
        self.xTrain_transformed = self.powerTransformer()
        self.xTest_transformed = self.powerTransformTest()
        
    def powerTransformer(self):
        """
        Applies power transformation on training data.

        Returns:
            DataFrame: Transformed training data.
        """
        data = pd.DataFrame(self.xTrain).copy()
        not_one_hot_columns = [
            col for col in data.columns 
            if data[col].dtype in [np.int64, np.float64] and not set(data[col].dropna()) <= {0, 1, 0.0, 1}
        ]
        self.pt = PowerTransformer(method=self.method)
        data[not_one_hot_columns] = self.pt.fit_transform(data[not_one_hot_columns])
        return data

    def powerTransformTest(self):
        """
        Applies power transformation on test data.

        Returns:
            DataFrame: Transformed test data.
        """
        data = pd.DataFrame(self.xTest).copy()
        not_one_hot_columns = [
            col for col in data.columns 
            if data[col].dtype in [np.int64, np.float64] and not set(data[col].dropna()) <= {0, 1, 0.0, 1}
        ]
        data[not_one_hot_columns] = self.pt.transform(data[not_one_hot_columns])
        return data


#model fitting
class ModelFitting():
    """
    Class to fit a specified model on the training data.

    Attributes:
        model (object): The model to fit.
        xtrain (DataFrame): Training data.
        ytrain (Series): Target data.
    """
    def __init__(self, model,xtrain,ytrain): 
        """
        Initializes the ModelFitting class with model and training data.

        Args:
            model (object): The model instance to fit.
            xtrain (DataFrame): Training data.
            ytrain (Series): Target data.
        """
        self.xtrain=xtrain
        self.model=model
        self.ytrain=ytrain   
        self.fittedModel=self.modelFit()

    def modelFit(self):
        """
        Fits the specified model on the training data.

        Returns:
            object: Fitted model.
        """
        fittedModel = self.model.fit(self.xtrain, self.ytrain)
        return fittedModel

#model evaluation
class Evaluation():
    """
    Class to evaluate the fitted model using specified regression metrics.

    Attributes:
        xtest (DataFrame): Test features.
        ytest (Series): Test targets.
        regressionMetrics (function): Regression metric function.
        fittedModel (object): Fitted model.
    """    
    def __init__(self, XTest,ytest,regressionMetrics,fittedModel):
        """
        Initializes the Evaluation class with test data and evaluation metrics.

        Args:
            XTest (DataFrame): Test features.
            ytest (Series): Test targets.
            regressionMetrics (function): Metric function.
            fittedModel (object): Fitted model.
        """
        self.xtest=XTest
        self.ytest=ytest
        self.regressionMetrics=regressionMetrics
        self.fittedModel=fittedModel
        self.score=self.metrics()
    
    def metrics(self):
        """
        Evaluates the model performance using the provided regression metrics.

        Returns:
            float: Model performance score.
        """
        xtest=pd.DataFrame(self.xtest)
        yPred=self.fittedModel.predict(xtest)
        score=self.regressionMetrics(self.ytest,yPred)
        return score


#statmodelsummary
class StatModelOLS:
    """
    Class to generate the summary of an Ordinary Least Squares regression model.

    Attributes:
        X (DataFrame): Feature data.
        y (Series): Target data.
        summary (str): Model summary.
    """
    def __init__(self,xData,ydata):
        """
        Initializes the StatModelOLS class with data.

        Args:
            xData (DataFrame): Feature data.
            ydata (Series): Target data.
        """
        self.X=xData
        self.y=ydata
        self.summary=self.statModel()

    def statModel(self):
        """
        Generates OLS summary for the provided data.

        Returns:
            str: Summary of the OLS model.
        """
        X = sm.add_constant(self.X)
        model = sm.OLS(self.y, X).fit()
        summary = model.summary()
        return summary


In [26]:
read_method_=pd.read_csv
path_=r"C:\Users\Admin\Desktop\Polestar_Work\Day-1_Preprocesing_2\Day-1_Preprocesing\housing\housing.csv"
loaderObject = Loading(path_,read_method_)
data_ = loaderObject.data

data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [27]:
strategy_="drop"
fillNaObject = FillNa(data_, strategy_)
filledData_=fillNaObject.filledData

filledData_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [28]:
categoricalColumns_=["ocean_proximity"]
oneHotEncodedDataObject=OneHotEncoding(filledData_,categoricalColumns_)
oneHotEncodedData_=oneHotEncodedDataObject.oneHotEncodedData

oneHotEncodedData_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20433 non-null  float64
 1   latitude                    20433 non-null  float64
 2   housing_median_age          20433 non-null  float64
 3   total_rooms                 20433 non-null  float64
 4   total_bedrooms              20433 non-null  float64
 5   population                  20433 non-null  float64
 6   households                  20433 non-null  float64
 7   median_income               20433 non-null  float64
 8   median_house_value          20433 non-null  float64
 9   ocean_proximity_<1H OCEAN   20433 non-null  int32  
 10  ocean_proximity_INLAND      20433 non-null  int32  
 11  ocean_proximity_ISLAND      20433 non-null  int32  
 12  ocean_proximity_NEAR BAY    20433 non-null  int32  
 13  ocean_proximity_NEAR OCEAN  20433 no

In [29]:
method_outlier_="iqr"
outlierObject=Outlier(oneHotEncodedData_,method_outlier_)
cleanedData_=outlierObject.cleanedData

cleanedData_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12828 entries, 0 to 12827
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   12828 non-null  float64
 1   latitude                    12828 non-null  float64
 2   housing_median_age          12828 non-null  float64
 3   total_rooms                 12828 non-null  float64
 4   total_bedrooms              12828 non-null  float64
 5   population                  12828 non-null  float64
 6   households                  12828 non-null  float64
 7   median_income               12828 non-null  float64
 8   median_house_value          12828 non-null  float64
 9   ocean_proximity_<1H OCEAN   12828 non-null  int32  
 10  ocean_proximity_INLAND      12828 non-null  int32  
 11  ocean_proximity_ISLAND      12828 non-null  int32  
 12  ocean_proximity_NEAR BAY    12828 non-null  int32  
 13  ocean_proximity_NEAR OCEAN  128

In [30]:
targetColumn_=['median_house_value']
splittingDataObject=SplittingData(cleanedData_,targetColumn_)
XData,yData=splittingDataObject.X,splittingDataObject.y

yData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12828 entries, 0 to 12827
Data columns (total 1 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   median_house_value  12828 non-null  float64
dtypes: float64(1)
memory usage: 100.3 KB


In [31]:
testSize=0.2
randomState=15
trainTestSplitObject=TrainTestSplit(XData, yData, testingSize=testSize, random_State=randomState)

In [32]:
xtrain_=trainTestSplitObject.xTrain
ytrain_=trainTestSplitObject.yTrain
xtest_=trainTestSplitObject.xTest
ytest_=trainTestSplitObject.yTest

In [33]:
method_ = 'yeo-johnson'
standardizationObject = Standardization(Xtrain=xtrain_, XTest=xtest_, method=method_)
standardizedXTrain=standardizationObject.xTrain_transformed
standardizedXTest=standardizationObject.xTest_transformed

standardizedXTrain.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
396,-1.387779e-15,1.915135e-15,-1.135762,0.891847,0.717733,0.9323,0.74297,0.081575,0,1,0,0,0
4583,6.938894e-16,-1.249001e-15,0.509345,1.719231,1.299347,1.785643,1.37364,1.141202,1,0,0,0,0
5959,6.938894e-16,-3.469447e-16,0.343859,1.324757,1.099068,1.002968,0.660716,0.066445,0,1,0,0,0
5691,4.718448e-16,-1.304512e-15,0.426777,0.414215,0.610153,0.865661,0.723692,0.579544,1,0,0,0,0
9081,8.881784e-16,-1.06859e-15,0.176921,0.742869,1.377273,2.300767,1.507033,-0.867263,0,1,0,0,0


In [34]:
model=Ridge()
modelFitObject = ModelFitting(model, standardizedXTrain, ytrain_)
fittedModel=modelFitObject.fittedModel
fittedModel

In [35]:
regressionMetrics_=mean_absolute_percentage_error 
evaluationObject=Evaluation(standardizedXTest,ytest_,regressionMetrics_,fittedModel)
evaluationObject.score

0.2707364256368932

In [36]:
statModelOLSObject=StatModelOLS(XData,yData)
print(statModelOLSObject.summary)

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.643
Method:                 Least Squares   F-statistic:                     2572.
Date:                Fri, 08 Nov 2024   Prob (F-statistic):               0.00
Time:                        17:37:28   Log-Likelihood:            -1.5757e+05
No. Observations:               12828   AIC:                         3.152e+05
Df Residuals:                   12818   BIC:                         3.152e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               