# 1 Imports

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# 2 Data

In [2]:
# Load the housing dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="Housing.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


# 3.1 Categorical Data Distribution

In [3]:
# Target: the sale price column.
y = df['price']
# Features: every remaining column.
x = df.drop(columns=['price'])

In [4]:
# Encode the categorical features as integers, then standardise every feature.
#
# The cell rebuilds its working frame from `df` on every run. It previously
# read `x` as a DataFrame and then overwrote `x` with the scaled array, so
# running the cell a second time failed on `select_dtypes`.
x_encoded = df.drop(columns=['price'])
cat_features = x_encoded.select_dtypes(include=['object', 'category']).columns

# Keep each fitted encoder, keyed by column name, so the integer codes can be
# mapped back to the original labels (or applied to new rows) later.
label_encoders = {}
for column in cat_features:
    label_encoder = LabelEncoder()
    x_encoded[column] = label_encoder.fit_transform(x_encoded[column])
    label_encoders[column] = label_encoder

all_columns = x_encoded.columns
numeric_transformer = StandardScaler()

# A single scaling step applied to every (now numeric) column.
preprocessor = ColumnTransformer(
    [
        ("StandardScaler", numeric_transformer, all_columns),
    ]
)
# NOTE(review): the scaler is fit on every row here, before the train/test
# split done in a later cell, so statistics of the test rows influence the
# scaled training features. Fitting on the training split only would avoid it.
x = preprocessor.fit_transform(x_encoded)

In [5]:
# Show the column labels selected above as categorical (object/category dtype).
cat_features

Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [6]:
# List the distinct raw labels of each categorical column, read from the
# original frame `df`.
for col_name in cat_features:
    print(f"Unique values in {col_name}: {df[col_name].unique()}")

Unique values in mainroad: ['yes' 'no']
Unique values in guestroom: ['no' 'yes']
Unique values in basement: ['no' 'yes']
Unique values in hotwaterheating: ['no' 'yes']
Unique values in airconditioning: ['yes' 'no']
Unique values in prefarea: ['yes' 'no']
Unique values in furnishingstatus: ['furnished' 'semi-furnished' 'unfurnished']


In [7]:
# Report the dimensions of the preprocessed feature matrix.
transformed_shape = x.shape
print("Shape of transformed data:", transformed_shape)
print("Number of columns in transformed data:", transformed_shape[1])


Shape of transformed data: (545, 12)
Number of columns in transformed data: 12


# 4 Data Splitting

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Seed shared by every estimator that accepts `random_state`, so the scores
# printed below are the same on every Restart & Run All. Same value as the
# `random_state` passed to `train_test_split`.
RANDOM_STATE = 42

# (display name, unfitted estimator) pairs. Apart from the seed, every model
# uses its default hyper-parameters.
models = [
    ("XGBoost", XGBRegressor(random_state=RANDOM_STATE)),
    ("Decision Tree", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestRegressor(random_state=RANDOM_STATE)),
    ("AdaBoost", AdaBoostRegressor(random_state=RANDOM_STATE)),
    ("Linear Regression", LinearRegression()),
    ("Lasso", Lasso()),
    ("Ridge", Ridge())
]

# Fit each model on the training split and score it on the held-out split.
# Each entry of `results` is (name, mean squared error, R-squared).
results = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append((name, mse, r2))

for name, mse, r2 in results:
    print(f"{name} - Mean Squared Error: {mse:.2f}, R-squared: {r2:.2f}")


XGBoost - Mean Squared Error: 2032404618961.44, R-squared: 0.60
Decision Tree - Mean Squared Error: 3050871940366.97, R-squared: 0.40
Random Forest - Mean Squared Error: 1934170185641.80, R-squared: 0.62
AdaBoost - Mean Squared Error: 2325152562302.96, R-squared: 0.54
Linear Regression - Mean Squared Error: 1771751116594.04, R-squared: 0.65
Lasso - Mean Squared Error: 1771752846723.99, R-squared: 0.65
Ridge - Mean Squared Error: 1772261478130.60, R-squared: 0.65
