In [None]:
from sklearn.impute import SimpleImputer 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error,r2_score
from sklearn.linear_model import *
from sklearn.preprocessing import PowerTransformer,RobustScaler,StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

#loading the data
class Loading:
    '''Reads a dataset by handing a location to a caller-supplied reader.

    The reader is invoked once, while the object is being constructed, and
    whatever it returns is kept on ``data``.
    '''
    def __init__(self, path_, read_method_):
        self.path, self.read_method = path_, read_method_
        # Eager load: the dataset is available as soon as the object exists.
        self.data = self.getData()

    def getData(self):
        '''Calls the stored read method on the stored path and returns the result.'''
        reader = self.read_method
        return reader(self.path)

#filling the data
class FillNa:
    '''Handles missing values (NaN) by dropping rows or imputing numeric columns.

    Parameters
    ----------
    data
        The dataset; anything ``pd.DataFrame`` accepts.
    strategy
        ``"drop"`` removes every row that contains a missing value. Any other
        value is forwarded to ``SimpleImputer`` as its ``strategy`` argument.

    The result is computed at construction time and stored on ``filledData``.
    '''
    def __init__(self, data, strategy):
        self.data = data
        self.strategy = strategy
        self.filledData = self.apply()

    def apply(self):
        '''Dispatches to row dropping or imputation depending on the strategy.'''
        if self.strategy == "drop":
            return self.dropNa()
        return self.fillingNullValues()

    def fillingNullValues(self):
        '''Returns a copy of the data with numeric columns imputed by SimpleImputer.

        Non-numeric columns are returned untouched.
        '''
        # Work on a copy: depending on the pandas version, pd.DataFrame(df)
        # may share data with the input, and the caller's frame must not be
        # modified by the column assignment below.
        data = pd.DataFrame(self.data).copy()
        numerical_cols = data.select_dtypes(include=[np.number]).columns
        if len(numerical_cols) == 0:
            # Nothing to impute; fitting an imputer on zero columns would fail.
            return data
        imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
        data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
        return data

    def dropNa(self):
        '''Returns a new frame without the rows that contain missing values.'''
        # Wrapped in pd.DataFrame so both strategies accept the same inputs.
        return pd.DataFrame(self.data).dropna()

#onehotencoding the data
class OneHotEncoding:
    '''Replaces the given categorical columns with one-hot indicator columns.

    The encoded frame is built at construction time and stored on
    ``oneHotEncodedData``.
    '''
    def __init__(self, data, categoricalColumns):
        self.data = data
        self.categoricalColumns = categoricalColumns
        self.oneHotEncodedData = self.oneHotEncoding()

    def oneHotEncoding(self):
        '''Encodes the categorical columns and returns the combined frame.

        The indicator columns are cast to ``int`` and appended to the data;
        the original categorical columns are then dropped.
        '''
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        # Ask the encoder for a DataFrame so it can be concatenated directly.
        encoder.set_output(transform='pandas')
        categorical = self.data[self.categoricalColumns]
        indicators = encoder.fit_transform(categorical).astype(int)
        combined = pd.concat([self.data, indicators], axis=1)
        combined = combined.drop(columns=self.categoricalColumns)
        return pd.DataFrame(combined)

#finding outliers
class Outlier:
    '''Detects and removes outlier rows using the specified method.

    Parameters
    ----------
    oneHotEncodedData
        The dataset; anything ``pd.DataFrame`` accepts.
    method
        Outlier method to apply. Only ``"iqr"`` is supported.

    The filtered frame is computed at construction time and stored on
    ``cleanedData``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported method name.
    '''
    def __init__(self, oneHotEncodedData, method):
        self.oneHotEncodedData = oneHotEncodedData
        self.method = method
        # Decide on the argument passed in, not on a module-level variable.
        if self.method == "iqr":
            self.cleanedData = self.iqrOutlierDetectionAndRemoval()
        else:
            raise ValueError(
                f"Unsupported outlier method {method!r}; expected 'iqr'"
            )

    @staticmethod
    def _isBinaryIndicator(series):
        '''Returns True when the non-missing values of the column are only 0/1.'''
        values = set(series.dropna().unique())
        return len(values) > 0 and values <= {0, 1}

    def iqrOutlierDetectionAndRemoval(self):
        '''Removes rows falling outside 1.5 * IQR of any continuous numeric column.

        Columns are processed in order, and each column's quartiles are taken
        from the rows that survived the previous columns. Non-numeric columns
        and 0/1 indicator columns are not checked. Rows with a missing value
        in a checked column are dropped, because comparisons against NaN are
        False. The returned frame has a fresh 0..n-1 index.
        '''
        cleaned_df = pd.DataFrame(self.oneHotEncodedData)
        for col in cleaned_df.columns:
            series = cleaned_df[col]
            if not pd.api.types.is_numeric_dtype(series):
                # Quantiles are undefined for text/categorical columns.
                continue
            if self._isBinaryIndicator(series):
                # For a one-hot column the IQR is usually 0, which would
                # delete every row belonging to a minority category.
                continue
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            cleaned_df = cleaned_df[(series >= lower_bound) & (series <= upper_bound)]

        return cleaned_df.reset_index(drop=True)
        
#splitting into features and target
class SplittingData:
    '''Separates a dataset into its feature columns and its target column(s).

    The two parts are produced at construction time and stored on ``X``
    (features) and ``y`` (target).
    '''
    def __init__(self, data, targetColumn):
        self.data = data
        self.targetColumn = targetColumn
        features, target = self.splitData()
        self.X = features
        self.y = target

    def splitData(self):
        '''Returns ``(X, y)``: the data without the target, and the target itself.'''
        target = self.data[self.targetColumn]
        features = self.data.drop(columns=self.targetColumn)
        return features, target

#splitting train and test
class TrainTestSplit:
    '''Partitions features and target into training and testing subsets.

    Required keyword arguments:
    ``testingSize`` (forwarded as ``test_size``) and ``random_State``
    (forwarded as ``random_state``). The four resulting pieces are stored on
    ``xTrain``, ``xTest``, ``yTrain`` and ``yTest``.
    '''
    def __init__(self, X, y, **kwargs):
        self.X, self.y = X, y
        self.testingSize = kwargs['testingSize']
        self.randomState = kwargs['random_State']
        pieces = self.trainTestSplit()
        self.xTrain, self.xTest, self.yTrain, self.yTest = pieces

    def trainTestSplit(self):
        '''Returns ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``.'''
        pieces = train_test_split(
            self.X,
            self.y,
            test_size=self.testingSize,
            random_state=self.randomState,
        )
        return tuple(pieces)

#power-transforming xtrain and xtest
class Standardization:
    '''Applies a PowerTransformer to the continuous columns of train and test data.

    Required keyword arguments: ``Xtrain``, ``XTest`` and ``method`` (passed
    to ``PowerTransformer`` as ``method``).

    The transformer is fitted on the training data only. The columns chosen
    there are stored on ``transformedColumns`` and reused for the test data,
    so both frames are transformed on exactly the same columns. Columns that
    are not int64/float64, or whose non-missing values are only 0/1 (one-hot
    indicators), are left unchanged.

    Results are stored on ``xTrain_transformed`` and ``xTest_transformed``;
    the fitted transformer is stored on ``pt`` (``None`` when there was no
    column to transform).
    '''
    def __init__(self, **kwargs):
        self.xTrain = kwargs['Xtrain']
        self.xTest = kwargs['XTest']
        self.method = kwargs['method']
        self.pt = None
        self.transformedColumns = []
        self.xTrain_transformed = self.powerTransformer()
        self.xTest_transformed = self.powerTransformTest()

    @staticmethod
    def _nonBinaryNumericColumns(data):
        '''Lists the int64/float64 columns that hold more than just 0/1 values.'''
        return [
            col for col in data.columns
            if data[col].dtype in [np.int64, np.float64]
            and not set(data[col].dropna()) <= {0, 1}
        ]

    def powerTransformer(self):
        '''Fits the transformer on the training data and returns the transformed copy.'''
        data = pd.DataFrame(self.xTrain).copy()
        self.transformedColumns = self._nonBinaryNumericColumns(data)
        if not self.transformedColumns:
            # Fitting on zero columns would raise; there is nothing to do.
            self.pt = None
            return data
        self.pt = PowerTransformer(method=self.method)
        data[self.transformedColumns] = self.pt.fit_transform(data[self.transformedColumns])
        return data

    def powerTransformTest(self):
        '''Transforms the test data with the transformer fitted on the training data.'''
        data = pd.DataFrame(self.xTest).copy()
        if self.pt is None or not self.transformedColumns:
            return data
        # Reuse the training-time column selection: re-deriving it from the
        # test data could pick a different set than the transformer was
        # fitted on.
        columns = self.transformedColumns
        data[columns] = self.pt.transform(data[columns])
        return data


#model fitting
class ModelFitting():
    '''Trains a given model on the training data.

    Whatever the model's ``fit`` call returns is stored on ``fittedModel``.
    '''
    def __init__(self, model, xtrain, ytrain):
        self.model = model
        self.xtrain, self.ytrain = xtrain, ytrain
        self.fittedModel = self.modelFit()

    def modelFit(self):
        '''Calls ``fit`` on the model with the training data and returns the result.'''
        return self.model.fit(self.xtrain, self.ytrain)

#model evaluation
class Evaluation():
    '''Scores a fitted model on test data with a caller-supplied metric.

    The metric is called as ``regressionMetrics(ytest, predictions)`` and its
    result is stored on ``score``.
    '''
    def __init__(self, XTest, ytest, regressionMetrics, fittedModel):
        self.fittedModel = fittedModel
        self.regressionMetrics = regressionMetrics
        self.xtest, self.ytest = XTest, ytest
        self.score = self.metrics()

    def metrics(self):
        '''Predicts on the test features and returns the metric's value.'''
        # The features are wrapped in a DataFrame before being handed to the model.
        features = pd.DataFrame(self.xtest)
        predictions = self.fittedModel.predict(features)
        return self.regressionMetrics(self.ytest, predictions)


#statsmodels OLS summary
class StatModelOLS:
    '''Fits an Ordinary Least Squares regression and keeps its summary.

    The summary is produced at construction time and stored on ``summary``.
    '''
    def __init__(self, xData, ydata):
        self.X, self.y = xData, ydata
        self.summary = self.statModel()

    def statModel(self):
        '''Fits OLS of ``y`` on ``X`` plus a constant column and returns the summary.'''
        design = sm.add_constant(self.X)
        results = sm.OLS(self.y, design).fit()
        return results.summary()


# ---- End-to-end run: load -> clean -> encode -> filter -> split -> fit -> score ----

# Load the raw CSV.
read_method_=pd.read_csv
path_=r"C:\Users\Admin\Desktop\Polestar_Work\Day-1_Preprocesing_2\Day-1_Preprocesing\housing\housing.csv"
loaderObject = Loading(path_,read_method_)
data_ = loaderObject.data


# Missing values: "drop" removes incomplete rows instead of imputing them.
strategy_="drop"
fillNaObject = FillNa(data_, strategy_)
filledData_=fillNaObject.filledData


# One-hot encode the categorical column.
categoricalColumns_=["ocean_proximity"]
oneHotEncodedDataObject=OneHotEncoding(filledData_,categoricalColumns_)
oneHotEncodedData_=oneHotEncodedDataObject.oneHotEncodedData


# Remove outlier rows with the IQR rule.
method_outlier_="iqr"
outlierObject=Outlier(oneHotEncodedData_,method_outlier_)
cleanedData_=outlierObject.cleanedData

# Separate the features from the target.
targetColumn_=['median_house_value']
splittingDataObject=SplittingData(cleanedData_,targetColumn_)
XData,yData=splittingDataObject.X,splittingDataObject.y


# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
testSize=0.2
randomState=15
trainTestSplitObject=TrainTestSplit(XData, yData, testingSize=testSize, random_State=randomState)

xtrain_=trainTestSplitObject.xTrain
ytrain_=trainTestSplitObject.yTrain
xtest_=trainTestSplitObject.xTest
ytest_=trainTestSplitObject.yTest



# Power-transform the features; the transformer is fitted on the training split.
method_ = 'yeo-johnson'
standardizationObject = Standardization(Xtrain=xtrain_, XTest=xtest_, method=method_)
standardizedXTrain=standardizationObject.xTrain_transformed
standardizedXTest=standardizationObject.xTest_transformed


# Fit a ridge regression on the transformed training data.
model=Ridge()
modelFitObject = ModelFitting(model, standardizedXTrain, ytrain_)
fittedModel=modelFitObject.fittedModel


# Score on the held-out split. The value is printed explicitly: a bare
# expression is only displayed by a notebook when it is the cell's last line.
regressionMetrics_=mean_absolute_percentage_error 
evaluationObject=Evaluation(standardizedXTest,ytest_,regressionMetrics_,fittedModel)
print(f"{regressionMetrics_.__name__}: {evaluationObject.score}")

# OLS summary on the full (untransformed) cleaned data.
statModelOLSObject=StatModelOLS(XData,yData)
print(statModelOLSObject.summary)




                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.643
Method:                 Least Squares   F-statistic:                     2572.
Date:                Fri, 08 Nov 2024   Prob (F-statistic):               0.00
Time:                        16:04:30   Log-Likelihood:            -1.5757e+05
No. Observations:               12828   AIC:                         3.152e+05
Df Residuals:                   12818   BIC:                         3.152e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               