### **DATA COLLECTION**

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from win32com.client import constants

data = pd.read_csv('melb_data.csv')

y = data.Price
X = data.drop(['Price'] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#cardinality means no. of unique values in column
cat_cols = [cname for cname in X_train.columns
            if X_train[cname].nunique() < 10 and
            X_train[cname].dtype == 'object']

#number colums
num_cols = [cname for cname in X_train.columns
            if X_train[cname].dtype in ['int64', 'float64']]

my_cols = cat_cols + num_cols
X_train = X_train[my_cols].copy()
X_test = X_test[my_cols].copy()

In [34]:
X_train.head()


Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12796,h,S,Eastern Metropolitan,4,14.2,3149.0,4.0,2.0,2.0,695.0,160.0,1970.0,-37.86127,145.14271,13366.0
9642,h,S,Eastern Metropolitan,3,14.2,3149.0,3.0,1.0,2.0,810.0,,,-37.86838,145.14664,13366.0
3207,u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,1.0,82.0,,,-37.818,145.0268,11308.0
1698,u,S,Northern Metropolitan,2,3.2,3054.0,2.0,1.0,1.0,0.0,76.0,1975.0,-37.7902,144.97,3106.0
761,h,S,Southern Metropolitan,4,13.0,3204.0,4.0,2.0,1.0,292.0,,,-37.9148,145.0243,6795.0


### **DATA PREPROCESSING**

In [35]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from  sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#preprocessing for numericals data
numeric_transformer = SimpleImputer(strategy='constant')

#preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#preprocessing for all
preprocessor = ColumnTransformer(
    [ ('num', numeric_transformer, num_cols),
      ('cat', categorical_transformer, my_cols),]
)

### ***MODEL SELECTION***

In [36]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10,random_state=42)

### ***EVALUATION***

In [37]:
from sklearn.metrics import mean_absolute_error

#bundling first pp and model
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)
                              ])
#fit the train data
my_pipeline.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

#evaluate
score = mean_absolute_error(y_test, preds)
print("MAE IS ", score)




MAE IS  170393.65325233186


In [None]:
from sklearn.tree import DecisionTreeRegressor
