In [18]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error



from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

import warnings
# Blanket filter: every warning issued through the `warnings` module is
# silenced for the rest of the session, regardless of category or origin.
warnings.filterwarnings('ignore')

In [19]:
# Load the dataset from the working directory and preview the first rows.
df = pd.read_csv('data_version_4.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,House Number,City,Parking Spaces,Bedrooms,Bathrooms,Servant Quarters,Kitchens,Store Rooms,price,Purpose,Location,Age Possession,area,price_per_sqft,colony,province,society,property Type,area_room_ratio
0,3,52061409,Rawalpindi,1.0,3.0,3.0,0.0,1.0,0.0,1.5,For Sale,"Bahria Town Rawalpindi, Rawalpindi, Punjab,",Relatively New,1361.0,11021.0,Bahria Town Phase 8,Punjab,Bahria Town Rawalpindi,Houses,453.666667
1,4,52010487,Rawalpindi,3.0,6.0,6.0,1.0,2.0,1.0,4.75,For Sale,"Bahria Town Rawalpindi, Rawalpindi, Punjab,",New Property,3812.0,12461.0,Bahria Greens,Punjab,Bahria Town Rawalpindi,Houses,635.333333
2,5,42644685,Rawalpindi,2.0,3.0,3.0,1.0,1.0,1.0,2.25,For Sale,"Askari 14, Rawalpindi, Punjab,",Moderately Old,2722.0,8266.0,Askari 14,Punjab,Askari 14,Houses,907.333333
3,6,52099738,Rawalpindi,3.0,2.0,2.0,0.0,1.0,1.0,1.15,For Sale,"Bahria Town Rawalpindi, Rawalpindi, Punjab,",Relatively New,1171.0,9821.0,Bahria Town,Punjab,Bahria Town Rawalpindi,Flats,585.5
4,7,51956291,Rawalpindi,1.0,4.0,5.0,0.0,0.0,0.0,1.75,For Sale,"Defence Road, Rawalpindi, Punjab,",Undefined,1361.0,12858.0,Defence Road,Punjab,Defence Road,Houses,340.25


In [20]:
# Remove the columns that are not used as model inputs, in a single
# non-mutating call (the result is rebound to `df` instead of using inplace=True).
# NOTE(review): 'Unnamed: 0' looks like a saved index and 'price_per_sqft'
# appears to be derived from price/area in the preview rows — confirm.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'House Number',
        'society',
        'price_per_sqft',
        'Location',
        'area_room_ratio',
        'Purpose',
    ]
)

In [21]:
# Preview the frame after the column drops.
df.head()

Unnamed: 0,City,Parking Spaces,Bedrooms,Bathrooms,Servant Quarters,Kitchens,Store Rooms,price,Age Possession,area,colony,province,property Type
0,Rawalpindi,1.0,3.0,3.0,0.0,1.0,0.0,1.5,Relatively New,1361.0,Bahria Town Phase 8,Punjab,Houses
1,Rawalpindi,3.0,6.0,6.0,1.0,2.0,1.0,4.75,New Property,3812.0,Bahria Greens,Punjab,Houses
2,Rawalpindi,2.0,3.0,3.0,1.0,1.0,1.0,2.25,Moderately Old,2722.0,Askari 14,Punjab,Houses
3,Rawalpindi,3.0,2.0,2.0,0.0,1.0,1.0,1.15,Relatively New,1171.0,Bahria Town,Punjab,Flats
4,Rawalpindi,1.0,4.0,5.0,0.0,0.0,0.0,1.75,Undefined,1361.0,Defence Road,Punjab,Houses


In [22]:
# Count rows per property type to see how the categories are distributed.
df['property Type'].value_counts()

Unnamed: 0_level_0,count
property Type,Unnamed: 1_level_1
Houses,677
Flats,70
Upper,8
Lower,4


In [23]:
# Fold the rare 'Upper' and 'Lower' categories into 'Houses'.
# Assign the result back to the column: calling replace(..., inplace=True) on
# df['property Type'] is chained assignment, which operates on a temporary
# Series and does not update `df` under pandas Copy-on-Write.
df['property Type'] = df['property Type'].replace({'Upper': "Houses", 'Lower': "Houses"})

In [24]:
# Re-check the category counts after the replacement of 'Upper' and 'Lower'.
df['property Type'].value_counts()

Unnamed: 0_level_0,count
property Type,Unnamed: 1_level_1
Houses,689
Flats,70


In [25]:
# Preview the frame again after the property-type replacement.
df.head()

Unnamed: 0,City,Parking Spaces,Bedrooms,Bathrooms,Servant Quarters,Kitchens,Store Rooms,price,Age Possession,area,colony,province,property Type
0,Rawalpindi,1.0,3.0,3.0,0.0,1.0,0.0,1.5,Relatively New,1361.0,Bahria Town Phase 8,Punjab,Houses
1,Rawalpindi,3.0,6.0,6.0,1.0,2.0,1.0,4.75,New Property,3812.0,Bahria Greens,Punjab,Houses
2,Rawalpindi,2.0,3.0,3.0,1.0,1.0,1.0,2.25,Moderately Old,2722.0,Askari 14,Punjab,Houses
3,Rawalpindi,3.0,2.0,2.0,0.0,1.0,1.0,1.15,Relatively New,1171.0,Bahria Town,Punjab,Flats
4,Rawalpindi,1.0,4.0,5.0,0.0,0.0,0.0,1.75,Undefined,1361.0,Defence Road,Punjab,Houses


In [26]:
# NOTE(review): this list is not referenced by the other cells shown here —
# confirm it is still needed.
columns_to_encode = ['property Type']

In [27]:
# List the remaining columns before splitting features and target.
df.columns

Index(['City', 'Parking Spaces', 'Bedrooms', 'Bathrooms', 'Servant Quarters',
       'Kitchens', 'Store Rooms', 'price', 'Age Possession', 'area', 'colony',
       'province', 'property Type'],
      dtype='object')

In [28]:
# Separate the target column from the feature matrix (every other column).
y = df['price']
X = df.drop('price', axis=1)

In [29]:
# Model log1p(price) instead of the raw price; scorer() maps predictions and
# true values back with np.expm1 before computing the mean absolute error.
y_transformed = np.log1p(y)

In [30]:
!pip install category_encoders



In [31]:
# Preprocessing applied in front of every regressor.
preprocessor = ColumnTransformer(
    remainder='passthrough',  # columns not listed below are passed through unchanged
    transformers=[
        # Numeric counts and area: standardise.
        (
            'num',
            StandardScaler(),
            [
                'Parking Spaces',
                'Bedrooms',
                'Bathrooms',
                'Servant Quarters',
                'Kitchens',
                'Store Rooms',
                'area',
            ],
        ),
        # Property type: integer codes, with categories unseen at fit time mapped to -1.
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ['property Type'],
        ),
        # Remaining categoricals: dense one-hot columns with the first level of
        # each feature dropped; categories unseen at fit time are ignored.
        (
            'cat1',
            OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
            ['Age Possession', 'colony', 'province', 'City'],
        ),
    ],
)

In [32]:
def scorer(model_name, model):
    """Evaluate one regressor and return ``[model_name, mean_cv_r2, holdout_mae]``.

    The model is placed behind the notebook-level ``preprocessor`` in a
    pipeline and scored against the notebook-level ``X`` / ``y_transformed``:

    * mean R^2 over a shuffled 10-fold cross-validation, computed on the
      transformed target;
    * mean absolute error on a 20% hold-out split, computed after both the
      predictions and the true values are mapped back with ``np.expm1``.

    Parameters
    ----------
    model_name
        Label stored as the first element of the returned list.
    model
        Estimator used as the final ``'regressor'`` step of the pipeline.

    Returns
    -------
    list
        ``[model_name, mean cross-validated R^2, hold-out MAE]``.
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R^2 on the transformed target.
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(estimator, X, y_transformed, cv=folds, scoring='r2').mean()

    # Hold-out evaluation: fit on 80% of the rows, predict the remaining 20%.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    estimator.fit(X_train, y_train)

    # Undo the log1p transform on both sides before measuring the error.
    predicted = np.expm1(estimator.predict(X_test))
    actual = np.expm1(y_test)
    holdout_mae = mean_absolute_error(actual, predicted)

    return [model_name, mean_r2, holdout_mae]

In [33]:
# Candidate regressors keyed by the label shown in the results table.
# Estimators that accept a `random_state` are seeded with 42 — the same seed
# used for the KFold and train/test split — so the comparison is repeatable
# from run to run. All other hyperparameters are left at their defaults.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [34]:
# Score every candidate model; each scorer() call contributes one
# [name, mean R^2, MAE] row.
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [35]:
# Tabulate the per-model scores and display them ordered by MAE, lowest first.
model_df = pd.DataFrame(data=model_output, columns=['name', 'r2', 'mae'])
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
10,xgboost,0.856343,0.979519
7,gradient boosting,0.85378,1.013791
5,random forest,0.842855,1.048142
6,extra trees,0.840007,1.076316
1,svr,0.818289,1.107681
9,mlp,0.761551,1.126349
2,ridge,0.789831,1.153873
8,adaboost,0.77768,1.173699
4,decision tree,0.754777,1.263721
0,linear_reg,0.684045,1.316138
