In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load ./melb_data.csv into a DataFrame and print its first five rows as a
# quick sanity check of the columns that were read.
data = pd.read_csv(filepath_or_buffer='./melb_data.csv')
print(data.head(n=5))

       Suburb           Address  Rooms Type      Price Method SellerG  \
0  Abbotsford      85 Turner St      2    h  1480000.0      S  Biggin   
1  Abbotsford   25 Bloomburg St      2    h  1035000.0      S  Biggin   
2  Abbotsford      5 Charles St      3    h  1465000.0     SP  Biggin   
3  Abbotsford  40 Federation La      3    h   850000.0     PI  Biggin   
4  Abbotsford       55a Park St      4    h  1600000.0     VB  Nelson   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
1  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
2  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   
3  4/03/2017       2.5    3067.0  ...       2.0  1.0      94.0           NaN   
4  4/06/2016       2.5    3067.0  ...       1.0  2.0     120.0         142.0   

   YearBuilt  CouncilArea Lattitude  Longtitude             Regionname  \
0     

In [3]:
# 'Price' is the prediction target, so rows where it is missing are removed
# from `data` (in place). The remaining frame is then separated into the
# feature table X (everything except 'Price') and the target series Y.
data.dropna(subset=['Price'], axis='index', inplace=True)
X = data.drop(columns=['Price'])
Y = data['Price']

In [4]:
# Split features and target into training and validation parts.
# test_size=0.3 holds out 30% of the rows for validation; random_state=42
# fixes the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split

(X_train_full, X_val_full,
 Y_train_full, Y_val_full) = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Names of the training columns whose dtype is int64 or float64, in the
# frame's column order. These are the features handled as numeric later on.
num_col = [
    name
    for name, kind in X_train_full.dtypes.items()
    if kind in ('int64', 'float64')
]
print(num_col)

['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']


In [6]:
# Object-dtype columns are treated as categorical. A categorical column is
# "good" only when every value found in the validation split also occurs in
# the training split; every other object column is "bad" and is dropped from
# both splits.
obj_col_b = [
    name for name, kind in X_train_full.dtypes.items() if kind == 'object'
]
good_col = [
    name
    for name in obj_col_b
    if set(X_val_full[name]) <= set(X_train_full[name])
]
bad_col = list(set(obj_col_b).difference(good_col))
X_train_full = X_train_full.drop(columns=bad_col)
X_val_full = X_val_full.drop(columns=bad_col)

# From the object columns that survived, keep only those with fewer than 10
# distinct values in the training split.
obj_col_a = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype == 'object'
    and X_train_full[name].nunique() < 10
]
print(obj_col_a)





['Type', 'Method', 'Regionname']


In [7]:
# Final feature list: the numeric columns followed by the selected
# categorical columns. Both splits are narrowed to exactly these columns and
# copied, so X_train / X_val are separate objects from the *_full frames.
my_col = [*num_col, *obj_col_a]
X_train = X_train_full.loc[:, my_col].copy()
X_val = X_val_full.loc[:, my_col].copy()

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: replace missing values with a constant. No fill_value is
# passed, so the imputer's default is used (0 for numeric data, per the
# scikit-learn SimpleImputer documentation).
num_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent value of
# the column, then one-hot encode. handle_unknown='ignore' keeps transform
# from raising on a category that was not seen during fit.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns (selected by name) to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_col),
        ('cat', cat_transformer, obj_col_a),
    ])

# random_state is fixed so the fitted forest is reproducible between runs.
model = RandomForestRegressor(n_estimators=500, random_state=42)

# Bundle preprocessing and the model so both are fitted and applied together.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the model on the training split.
my_pipeline.fit(X_train, Y_train_full)

# Predict on X_val, the validation frame restricted to the same columns the
# pipeline was fitted on. The original passed X_val_full here, which left
# X_val unused and depended on the ColumnTransformer ignoring the extra
# columns of the wider frame.
preds = my_pipeline.predict(X_val)

# Evaluate the model: mean absolute error between the true validation prices
# and the predictions.
score = mean_absolute_error(Y_val_full, preds)
print('MAE:', score)



MAE: 167499.55196055514
