In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# all libraries for data manipulation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# all models we are going to use
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

#all performance measuring
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score

# all fine-tuning utilities
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [52]:
# Load the housing dataset into a DataFrame.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine that has this exact directory layout.
df = pd.read_csv(filepath_or_buffer="D:\\ML project\\data\\archive\\housing.csv")

In [53]:
# Configured with a test size and a random state; the label column name is passed to transform().
class trainTestSplitter(TransformerMixin, BaseEstimator):
    """Split a DataFrame into train/test features and labels.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Forwarded to ``train_test_split`` as ``random_state``.
    """

    def __init__(self, test_size, random_state):
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        ``X`` and ``y`` are accepted (and ignored) so that ``fit()``,
        ``fit(X)`` and ``fit(X, y)`` are all valid calls.
        """
        return self

    def transform(self, df, label):
        """Separate ``label`` from ``df`` and split both into train/test parts.

        Parameters
        ----------
        df
            DataFrame holding both the features and the label column(s).
        label
            Column name (or list of names) to use as the target.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        features = df.drop(columns=label)
        target = df[label]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            target,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return X_train, X_test, y_train, y_test


In [54]:
class DataProcessor(BaseEstimator, TransformerMixin):
    """Impute, scale and one-hot encode a mixed-type DataFrame.

    What ``fit`` does:

    1. Splits the columns into a numeric group (``number`` dtypes) and a
       categorical group (``object`` dtype).
    2. Numeric columns: ``SimpleImputer(strategy=imputer_strategy)`` followed
       by ``MinMaxScaler``.
    3. Categorical columns: ``SimpleImputer(strategy="most_frequent")``
       followed by ``OneHotEncoder``.

    Parameters
    ----------
    imputer_strategy
        Strategy passed to the numeric ``SimpleImputer``.
    """

    def __init__(self, imputer_strategy):
        self.imputer_strategy = imputer_strategy
        # Everything below is populated by fit().
        self.num_df = None
        self.cat_df = None
        self.num_pipeline = None
        self.cat_pipeline = None
        self.full_pipeline = None

    def fit(self, df, y=None):
        """Build the column-wise preprocessing pipeline and fit it on ``df``.

        ``y`` is accepted and ignored so the processor can be called as
        ``fit(X, y)``.
        """
        # Kept as attributes for backward compatibility; only the column
        # names are needed to configure the ColumnTransformer.
        self.num_df = df.select_dtypes(include=["number"])
        self.cat_df = df.select_dtypes(include=["object"])

        self.num_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy=self.imputer_strategy)),
            ("scaler", MinMaxScaler()),
        ])
        self.cat_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # Ignore categories that were not present at fit time instead of
            # raising, so data with a rare unseen level can still be encoded.
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ])

        self.full_pipeline = ColumnTransformer([
            ("num", self.num_pipeline, self.num_df.columns.tolist()),
            ("cat", self.cat_pipeline, self.cat_df.columns.tolist()),
        ])

        self.full_pipeline.fit(df)
        return self

    def transform(self, df):
        """Apply the fitted pipeline to ``df`` and return the processed data.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.full_pipeline is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError("DataProcessor.transform() called before fit().")
        return self.full_pipeline.transform(df)
        

In [55]:
class PredictingTransformer(BaseEstimator, TransformerMixin):
    """Train one of four regressors and predict on train and test data.

    Supported ``model_name`` values:

    * ``"linear"`` -> ``LinearRegression()``
    * ``"tree"``   -> ``DecisionTreeRegressor``
    * ``"forest"`` -> ``RandomForestRegressor``
    * ``"svm"``    -> ``SVR(kernel='rbf', C=1.0, epsilon=0.2)``

    Parameters
    ----------
    model_name
        One of the names listed above.
    random_state
        Optional seed forwarded to the tree and forest models. The default
        ``None`` leaves them unseeded.
    """

    _MODEL_NAMES = ("linear", "tree", "forest", "svm")

    def __init__(self, model_name, random_state=None):
        self.model_name = model_name
        self.random_state = random_state
        # Populated by fit().
        self.model = None
        self.train_data = None
        self.train_label = None

    def _build_model(self):
        """Return a fresh, unfitted estimator for ``self.model_name``."""
        if self.model_name == "linear":
            return LinearRegression()
        if self.model_name == "tree":
            return DecisionTreeRegressor(random_state=self.random_state)
        if self.model_name == "forest":
            return RandomForestRegressor(random_state=self.random_state)
        if self.model_name == "svm":
            return SVR(kernel="rbf", C=1.0, epsilon=0.2)
        raise ValueError(
            f"Unknown model_name {self.model_name!r}; "
            f"expected one of {self._MODEL_NAMES}."
        )

    def fit(self, train_data, train_label):
        """Create the model named by ``model_name`` and fit it.

        The training data and labels are kept on the instance because
        ``transform`` also predicts on the training data.

        Raises
        ------
        ValueError
            If ``model_name`` is not one of the supported names.
        """
        # Build first, so an invalid name fails before any state is changed.
        model = self._build_model()
        self.train_data = train_data
        self.train_label = train_label
        self.model = model
        self.model.fit(train_data, train_label)
        return self

    def transform(self, test_data):
        """Predict on the stored training data and on ``test_data``.

        Returns
        -------
        tuple
            ``(train_pred, test_pred)``.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "PredictingTransformer.transform() called before fit()."
            )
        train_pred = self.model.predict(self.train_data)
        test_pred = self.model.predict(test_data)
        return train_pred, test_pred
        

In [56]:
class FineTunerTransformer(BaseEstimator, TransformerMixin):
    """Evaluate a model and tune it with a grid search.

    1. ``fit`` computes the MSE between ``labels`` and ``prediction`` and runs
       cross-validation (``neg_mean_squared_error``) on the given data.
    2. ``display_scores`` prints the cross-validation results as RMSE.
    3. ``transform`` runs the ``GridSearchCV`` configured in ``fit``.

    Parameters
    ----------
    model
        Estimator to cross-validate and tune.
    param
        Parameter grid for ``GridSearchCV`` (dict or list of dicts). An empty
        or ``None`` grid is treated as one candidate that uses the model's
        current parameters.
    cv_finetune
        ``cv`` argument for the grid search.
    prediction
        Predictions compared against ``labels`` to compute ``mse_model``.
    labels
        Target values used for the MSE, the cross-validation and the search.
    cv
        ``cv`` argument for ``cross_val_score`` (default 10).
    """

    def __init__(self, model, param, cv_finetune, prediction, labels, cv=10):
        self.param = param
        self.cv_finetune = cv_finetune
        self.cv = cv
        self.model = model
        self.prediction = prediction
        self.labels = labels
        # Populated by fit() / transform().
        self.gridsearch = None
        self.best_params = None
        self.cross_val = None
        self.mse_model = None

    def fit(self, df, y=None):
        """Compute the MSE and cross-validation scores and set up the grid search.

        ``y`` is accepted and ignored; the targets come from ``self.labels``.
        """
        self.mse_model = mse(self.labels, self.prediction)
        self.cross_val = cross_val_score(
            self.model,
            df,
            self.labels,
            scoring="neg_mean_squared_error",
            cv=self.cv,
        )
        # An empty grid has no candidates and makes GridSearchCV.fit fail;
        # a single empty dict means "evaluate the model as configured".
        param_grid = self.param if self.param else [{}]
        self.gridsearch = GridSearchCV(
            self.model,
            param_grid,
            cv=self.cv_finetune,
            scoring="neg_mean_squared_error",
        )
        return self

    def transform(self, df):
        """Run the grid search on ``df`` and return the fitted ``GridSearchCV``.

        Also stores the winning parameters in ``self.best_params``.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.gridsearch is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "FineTunerTransformer.transform() called before fit()."
            )
        self.gridsearch.fit(df, self.labels)
        self.best_params = self.gridsearch.best_params_
        return self.gridsearch

    def display_scores(self):
        """Print the per-fold cross-validation RMSE, its mean and its std.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.cross_val is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "FineTunerTransformer.display_scores() called before fit()."
            )
        # Scores are negative MSE, so negate before taking the square root.
        rmse = np.sqrt(-self.cross_val)
        print("scores:", rmse)
        print("Mean:", rmse.mean())
        print("std deviation:", rmse.std())

In [72]:
class CompleteTransformer(BaseEstimator, TransformerMixin):
    """End-to-end workflow: split, preprocess, train, predict and fine-tune.

    Parameters
    ----------
    df
        Full DataFrame containing features and the target column.
    target
        Name of the target column.
    finetune_params
        Parameter grid handed to the fine tuner. ``None`` (the default) is
        treated as an empty list.
    imputer_strategy
        Strategy for the numeric imputer in ``DataProcessor``.
    splitting
        Test-set size passed to the splitter.
    random_state
        Seed passed to the splitter.
    model_name
        Model name passed to ``PredictingTransformer``.
    cv
        Number of folds for cross-validation.
    finetune_cv
        Number of folds for the grid search.
    """

    def __init__(self, df, target, finetune_params=None, imputer_strategy="median", splitting=0.2, random_state=42, model_name="linear", cv=10, finetune_cv=5):
        self.df = df
        self.target = target
        self.splitting = splitting
        self.random_state = random_state
        self.imputer_strategy = imputer_strategy
        self.model_name = model_name
        self.finetune_params = finetune_params
        self.cv = cv
        self.finetune_cv = finetune_cv
        # Populated by fit() / transform().
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.processor = None
        self.trained_processed_data = None
        self.test_processed_data = None
        self.predictor = None
        self.fine_tuner = None
        self.train_pred = None
        self.test_pred = None
        self.grid_search = None

    def fit(self, X=None, y=None):
        """Split ``self.df``, preprocess both parts and train the model.

        ``X`` and ``y`` are accepted and ignored; the data comes from the
        DataFrame given to the constructor.
        """
        # splitting data
        spliter = trainTestSplitter(self.splitting, self.random_state)
        spliter.fit()
        self.x_train, self.x_test, self.y_train, self.y_test = spliter.transform(
            self.df, self.target
        )
        print("data splitting done \n")

        # Fit the preprocessing on the training split only, then reuse the
        # same fitted processor for the test split. Fitting a second
        # processor on the test data would scale and encode it differently
        # from the features the model is trained on.
        self.processor = DataProcessor(self.imputer_strategy)
        self.processor.fit(self.x_train)
        self.trained_processed_data = self.processor.transform(self.x_train)
        print("trained data processed\n")
        self.test_processed_data = self.processor.transform(self.x_test)
        print("test data processed\n")

        # model initialization
        self.predictor = PredictingTransformer(self.model_name)
        self.predictor.fit(self.trained_processed_data, self.y_train)
        print("model trained\n")
        return self

    def transform(self, X=None):
        """Predict, then cross-validate and grid-search on the training data.

        Parameters
        ----------
        X
            Already-processed features to predict on. Defaults to the
            processed test split produced by ``fit``.

        Returns
        -------
        tuple
            ``(train_pred, test_pred, grid_search, model)``.

        Raises
        ------
        NotFittedError
            If called before ``fit``.
        """
        if self.predictor is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError("CompleteTransformer.transform() called before fit().")
        if X is None:
            X = self.test_processed_data

        self.train_pred, self.test_pred = self.predictor.transform(X)
        print("prediction is done\n")

        param_grid = self.finetune_params if self.finetune_params is not None else []
        self.fine_tuner = FineTunerTransformer(
            self.predictor.model,
            param_grid,
            self.finetune_cv,
            self.train_pred,
            self.y_train,
            self.cv,
        )
        self.fine_tuner.fit(self.trained_processed_data)
        self.grid_search = self.fine_tuner.transform(self.trained_processed_data)

        print("grid search done\n")

        return self.train_pred, self.test_pred, self.grid_search, self.predictor.model
        
        

In [73]:
# Two candidate grids for the random forest: one with the default bootstrap
# setting and one with bootstrap switched off.
param = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
        'bootstrap': [False],
    },
]

# Run the whole workflow with a random forest on the housing target.
cmp_transformer = CompleteTransformer(
    df,
    "median_house_value",
    finetune_params=param,
    model_name="forest",
)
cmp_transformer.fit()
(train_pred, test_pred, grid_search_results, model) = cmp_transformer.transform(
    cmp_transformer.test_processed_data
)

data splitting done 

trained data processed

test data processed

model trained

prediction is done

grid search done



In [74]:
# Predictions of the trained model on the processed training split.
train_pred

array([106756.  , 371009.  , 182343.  , ..., 228744.  , 277835.  ,
       323586.01])

In [75]:
# Predictions of the trained model on the processed test data passed to transform().
test_pred

array([ 51692.  ,  68499.  , 349039.08, ..., 496607.95,  98922.  ,
       226006.  ])

In [76]:
# The fitted estimator held by the pipeline's predictor (fourth value returned by transform()).
model

In [77]:
# Raw per-candidate timing and score arrays from the grid search.
# NOTE(review): this dumps a very large dict; wrapping it in pd.DataFrame(...)
# would give a more readable table.
grid_search_results.cv_results_

{'mean_fit_time': array([0.10377316, 0.3198441 , 0.95503287, 0.16129651, 0.56915684,
        1.59654593, 0.21056805, 0.70526733, 2.09877219, 0.26789093,
        0.89143462, 2.75104904, 0.12770867, 0.42886281, 0.16751828,
        0.56497564, 0.21248035, 0.68852773]),
 'std_fit_time': array([0.00820495, 0.00909144, 0.02786666, 0.00583726, 0.02987822,
        0.07728068, 0.00192327, 0.00483902, 0.02196209, 0.00567018,
        0.00646133, 0.051231  , 0.00304291, 0.00552398, 0.00292086,
        0.00869537, 0.00737878, 0.01384481]),
 'mean_score_time': array([0.00380054, 0.0084796 , 0.02410669, 0.00370216, 0.01049271,
        0.02522278, 0.00340061, 0.00848618, 0.0227066 , 0.00300097,
        0.00820155, 0.02512569, 0.0037137 , 0.01110392, 0.00368843,
        0.01078105, 0.00390282, 0.01060481]),
 'std_score_time': array([1.16567270e-03, 6.86852851e-04, 9.17037751e-04, 3.98700683e-04,
        5.47217575e-04, 2.69108000e-03, 4.89940455e-04, 5.23105958e-04,
        3.99662611e-04, 3.69356475e-

In [78]:
# Print the cross-validation RMSE per fold, plus its mean and standard deviation
# (computed on the processed training data).
cmp_transformer.fine_tuner.display_scores()

scores: [46590.70395524 50674.18176837 47688.1990024  50128.10048237
 50046.9946549  46575.1661624  45839.43151203 51123.50396358
 49464.00058943 49799.6365286 ]
Mean: 48792.99186193128
std deviation: 1830.3468308188721


In [79]:
# The estimator selected by the grid search.
grid_search_results.best_estimator_

In [81]:
# RMSE on the training set: mse_model is the MSE between the training labels
# and the model's predictions on the training data, so this is an in-sample
# error and is not comparable to a held-out score.
np.sqrt(cmp_transformer.fine_tuner.mse_model)

18228.226293899097