In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



In [2]:
# Load the Melbourne housing data and discard the columns that are not used
# as model inputs, in one chained expression (no in-place mutation).
data = (
    pd.read_csv('data/melb_data.csv')
    .drop(columns=['Address', 'SellerG', 'CouncilArea'])
)

In [3]:
# Separate the target (Price) from the feature columns, then hold out 20% of
# the rows for validation. The fixed seed keeps the split repeatable.
y = data['Price']
X = data.drop(columns='Price')
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [None]:
# NOTE(review): `transformers` is presumably the project-local module that defines
# these two custom steps (not the Hugging Face package of the same name) — confirm.
from transformers import DateTransformer, SubUrbMeanEncoder

# Mean price per suburb, computed from the TRAINING rows only. Averaging over the
# whole `data` frame would fold validation prices into an input feature (target
# leakage) and make the validation predictions look better than they are.
train_suburb_mean = y_train.groupby(X_train['Suburb']).mean()

# Keep one entry for every suburb present in the dataset, so the lookup has the
# same keys as before. Suburbs with no training rows fall back to the overall
# training mean price instead of being missing.
all_suburbs = pd.Index(sorted(X['Suburb'].dropna().unique()), name='Suburb')
suburb_mean_price = train_suburb_mean.reindex(all_suburbs).fillna(y_train.mean())

pipeline = Pipeline([
    # Custom step applied to the full frame before the column-wise preprocessing.
    ('date_transformer', DateTransformer()),
    ('preprocessor', ColumnTransformer(
        transformers=[
            # Custom encoder for Suburb, parameterised with the per-suburb mean prices.
            ('suburb', SubUrbMeanEncoder(suburb_mean_price), ['Suburb']),

            # One-hot encode the categorical columns. handle_unknown='ignore' encodes
            # a category that was not seen during fit as all zeros instead of raising
            # at predict time.
            ('onehot_type', OneHotEncoder(handle_unknown='ignore'), ['Type']),
            ('onehot_method', OneHotEncoder(handle_unknown='ignore'), ['Method']),
            ('onehot_region', OneHotEncoder(handle_unknown='ignore'), ['Regionname']),

            # Missing Car values are filled with the constant 0.
            ('car_imputer', SimpleImputer(strategy='constant', fill_value=0), ['Car']),

            # Missing BuildingArea / YearBuilt values are filled with the column mean
            # learned during fit.
            ('building_year_imputer', SimpleImputer(strategy='mean'), ['BuildingArea', 'YearBuilt']),
        ],
        # Every column not listed above is forwarded to the model unchanged.
        remainder='passthrough',
    )),
    # Seeded so that re-running the notebook reproduces the same forest, matching
    # the seeded train/validation split.
    ('model', RandomForestRegressor(random_state=0))
])

pipeline.fit(X_train, y_train)


In [None]:
# Predict prices for the held-out validation rows with the fitted pipeline.
y_pred = pipeline.predict(X_valid)

In [None]:
# Actual and predicted validation prices side by side, indexed like y_valid.
results_df = y_valid.to_frame('Actual').assign(Predicted=y_pred)

In [None]:
import pickle

# Persist the fitted pipeline and the validation features to disk, in that order.
for pkl_path, artifact in (
    ('model_pipeline.pkl', pipeline),
    ('my_variable.pkl', X_valid),
):
    with open(pkl_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)