# preprocessing
- find null values
- replace with feature mean
- find outliers (especially m2)
- enumerate categorical features
- drop title col
- drop id col
- convert all prices to TRY (Turkish Lira)
- drop lat, lon
- convert date values to a single, uniform format
- drop type (because all values are "Flat")
- drop currency
- remove outlier prices (25,000 TL, 8,500,000 TL)

In [1]:
import polars as pl
import matplotlib.pyplot as plt

In [2]:
# Load the listings; the literal strings "Unknown" and "None" are read as nulls.
df = pl.read_csv("real_estate_data.csv", null_values=["Unknown", "None"])
# Preview the first rows.
df.head()

title,Id,price,currency,loc city,loc county,loc dist,lat,lon,date,type,m2,rooms,age,floor,tfloor,heat,bath,furn,status,resid,due,loan,saler,exc
str,i64,i64,str,str,str,str,f64,f64,str,str,i64,str,str,str,str,str,str,bool,str,str,i64,bool,str,str
"""SAHİBİNDEN-%10…",323589399,225000,"""Turkish Lira""","""İstanbul""","""Sancaktepe""","""Sarıgazi Mah.""",41.003838,29.213019,"""28 Temmuz 2016…","""Flat""",85,"""2+1""","""5-10""","""High Entrance""","""5""","""Combi""","""1""",False,"""Tenant""","""False.1""",0.0,True,"""Owner""","""True.1"""
"""İSTANBULUN MER…",323599659,360000,"""Turkish Lira""","""İstanbul""","""Üsküdar""","""Ünalan Mah.""",41.005051,29.062934,"""28 Temmuz 2016…","""Flat""",87,"""2+1""","""0""","""Ground Floor""","""9""","""Combi""","""1""",False,"""Empty""","""True""",,True,"""Owner""","""False"""
"""Merkezde 4+1 Ş…",323536105,335000,"""Turkish Lira""","""İstanbul""","""Arnavutköy""","""Hastane Mah.""",41.15349,28.619143,"""27 Temmuz 2016…","""Flat""",160,"""4+1""","""0""","""4""","""5""","""Combi""","""1""",False,"""Empty""","""True""",,True,"""Owner""","""False"""
"""kelepir büyük …",323709392,235000,"""Turkish Lira""","""Antalya""","""Muratpaşa""","""Yenigün Mah.""",36.902681,30.721658,"""28 Temmuz 2016…","""Flat""",165,"""4+1""","""5-10""","""Rise 4""","""4""","""Floor Calorie""","""2""",False,"""Tenant""","""False""",25.0,True,"""Owner""","""True"""
"""ACİL SAHİBİNDE…",323730617,229000,"""Turkish Lira""","""Manisa""","""Yunusemre""","""Muradiye""",38.638822,27.346087,"""28 Temmuz 2016…","""Flat""",208,"""5+1""","""0""","""Rise 4""","""5""","""Combi""","""2""",False,"""Tenant""","""False""",,True,"""Owner""","""False"""


| Column | Description |
| ---  | --- |
| TITLE      | title of the ad |
| ID         | identification number of the ad |
| PRICE      | price of the flat |
| CURRENCY   | currency of the price     |
| LOCCITY    | city of the building |
| LOCCOUNTY  | county of the building |
| LOCDIST    | district of the building |
| LAT        | latitude of the location of the building |
| LON        | longitude of the location of the building |
| DATE      | ad release date |
| TYPE      | type of the ad |
| M2        | size of the flat in square meters (m²) |
| ROOMS     | number of rooms in the flat |
| AGE       | age of the building |
| FLOOR     | floor number of the flat |
| TFLOOR    | total number of floors in the building |
| HEAT      | heating type of the building |
| BATH      | number of bathrooms in the flat |
| FURN      | whether the flat is furnished or not |
| STATUS    | occupied by owner, occupied by tenant, or empty |
| RESID     | whether the building is in a residence or not |
| DUE       | monthly dues of the building |
| LOAN      | whether the flat is eligible for a loan or not |
| SALER     | seller of the flat: owner, real estate office, or construction company |
| EXC       | whether an exchange is possible or not |

## Enumerate Function


In [3]:
def set_rank(feature: str, df: pl.DataFrame) -> pl.DataFrame:
    """Add a dense-rank encoding of ``feature`` as the column ``"<feature> rank"``.

    Args:
        feature: Name of the column to encode.
        df: Frame that contains ``feature``.

    Returns:
        ``df`` itself when the rank column is already present, otherwise a
        new frame with the rank column appended.
    """
    rank_column = f"{feature} rank"
    # Guard clause: keeps the cell safe to re-run without duplicating work.
    if rank_column in df.columns:
        return df
    dense_rank = pl.col(feature).rank("dense").alias(rank_column)
    return df.with_columns(dense_rank)

In [4]:
# Number of null values in each column.
df.null_count()

title,Id,price,currency,loc city,loc county,loc dist,lat,lon,date,type,m2,rooms,age,floor,tfloor,heat,bath,furn,status,resid,due,loan,saler,exc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,80,80,0,0,0,2,0,0,0,574,73,615,4,20,21252,675,0,0


In [5]:
# Summary statistics of the raw frame (count, nulls, mean, std, quantiles, min/max).
df.describe()

describe,title,Id,price,currency,loc city,loc county,loc dist,lat,lon,date,type,m2,rooms,age,floor,tfloor,heat,bath,furn,status,resid,due,loan,saler,exc
str,str,f64,f64,str,str,str,str,f64,f64,str,str,f64,str,str,str,str,str,str,str,str,str,f64,str,str,str
"""count""","""33272""",33272.0,33272.0,"""33272""","""33272""","""33272""","""33272""",33192.0,33192.0,"""33272""","""33272""",33272.0,"""33270""","""33272""","""33272""","""33272""","""32698""","""33199""","""32657""","""33268""","""33252""",12020.0,"""32597""","""33272""","""33272"""
"""null_count""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""",80.0,80.0,"""0""","""0""",0.0,"""2""","""0""","""0""","""0""","""574""","""73""","""615""","""4""","""20""",21252.0,"""675""","""0""","""0"""
"""mean""",,436580000.0,392147.060862,,,,,39.75082,30.532574,,,671.963543,,,,,,,,,,113.376705,,,
"""std""",,55570000.0,622345.248211,,,,,1.531269,3.248306,,,17384.054832,,,,,,,,,,1101.26955,,,
"""min""","""! ! ! FIRSAT D…",9394397.0,17000.0,"""British Pound""","""Adana""","""Acıpayam""","""1 Nolu Beşirli…",36.078714,25.915707,"""01 Ağustos 201…","""Flat""",0.0,"""1+0 (Studio)""","""0""","""1""","""1""","""Air Conditioni…","""1""","""False""","""Empty""","""False""",0.0,"""False""","""Bank""","""False"""
"""25%""",,422607676.0,170000.0,,,,,38.39765,28.721244,,,95.0,,,,,,,,,,15.0,,,
"""50%""",,464277408.0,250000.0,,,,,40.433758,29.121923,,,120.0,,,,,,,,,,30.0,,,
"""75%""",,472774972.0,380000.0,,,,,41.011123,32.511549,,,160.0,,,,,,,,,,90.0,,,
"""max""","""【ＫＯÇＡＫ】'tan ⭐️…",477549169.0,14000000.0,"""US Dollar""","""Şırnak""","""Şişli""","""Şıralık Mah.""",42.035103,44.048744,"""31 Temmuz 2017…","""Flat""",2500000.0,"""9+5""","""5-10""","""Villa Type""","""9""","""VRV""","""6+""","""True""","""Tenant""","""True""",88000.0,"""True""","""Real Estate Of…","""True.1"""


## Enumerate features

In [6]:
# List the column names of the raw frame.
df.columns

['title',
 'Id',
 'price',
 'currency',
 'loc city',
 'loc county',
 'loc dist',
 'lat',
 'lon',
 'date',
 'type',
 'm2',
 'rooms',
 'age',
 'floor',
 'tfloor',
 'heat',
 'bath',
 'furn',
 'status',
 'resid',
 'due',
 'loan',
 'saler',
 'exc']

## Drop unnecessary features (lat, lon, type, title, Id, due, status) from the dataframe

In [7]:
# Columns the notebook discards before modelling; all of them exist in `df`.
dropped_columns = ["lat", "lon", "type", "title", "Id", "due", "status"]
df_updated = df.drop(dropped_columns)
df_updated.columns

['price',
 'currency',
 'loc city',
 'loc county',
 'loc dist',
 'date',
 'm2',
 'rooms',
 'age',
 'floor',
 'tfloor',
 'heat',
 'bath',
 'furn',
 'resid',
 'loan',
 'saler',
 'exc']

## Merge all `loc *` columns into a single `location` column

In [8]:
# Collapse the three location columns into one "City-County-District" key.
# The guard keeps the cell re-runnable: once "loc city" has been consumed the
# merge is skipped.
location_parts = ["loc city", "loc county", "loc dist"]
if "loc city" in df_updated.columns:
    df_updated = df_updated.select(
        # Native string concatenation instead of a per-row Python lambda.
        # A null part yields a null location rather than the text "None".
        pl.concat_str(location_parts, separator="-").alias("location"),
        pl.exclude(location_parts),
    )
df_updated
    

location,price,currency,date,m2,rooms,age,floor,tfloor,heat,bath,furn,resid,loan,saler,exc
str,i64,str,str,i64,str,str,str,str,str,str,bool,str,bool,str,str
"""İstanbul-Sanca…",225000,"""Turkish Lira""","""28 Temmuz 2016…",85,"""2+1""","""5-10""","""High Entrance""","""5""","""Combi""","""1""",false,"""False.1""",true,"""Owner""","""True.1"""
"""İstanbul-Üsküd…",360000,"""Turkish Lira""","""28 Temmuz 2016…",87,"""2+1""","""0""","""Ground Floor""","""9""","""Combi""","""1""",false,"""True""",true,"""Owner""","""False"""
"""İstanbul-Arnav…",335000,"""Turkish Lira""","""27 Temmuz 2016…",160,"""4+1""","""0""","""4""","""5""","""Combi""","""1""",false,"""True""",true,"""Owner""","""False"""
"""Antalya-Muratp…",235000,"""Turkish Lira""","""28 Temmuz 2016…",165,"""4+1""","""5-10""","""Rise 4""","""4""","""Floor Calorie""","""2""",false,"""False""",true,"""Owner""","""True"""
"""Manisa-Yunusem…",229000,"""Turkish Lira""","""28 Temmuz 2016…",208,"""5+1""","""0""","""Rise 4""","""5""","""Combi""","""2""",false,"""False""",true,"""Owner""","""False"""
"""Hatay-Belen-Fa…",300000,"""Turkish Lira""","""28 Temmuz 2016…",220,"""4+2""","""0""","""6""","""6""","""Combi""","""2""",false,"""True""",true,"""Owner""","""False"""
"""Ankara-Çankaya…",370000,"""Turkish Lira""","""28 Temmuz 2016…",100,"""3+1""","""21-25""","""3""","""3""","""Combi""","""1""",false,"""False""",true,"""Owner""","""False"""
"""Nevşehir-Merke…",315000,"""Turkish Lira""","""28 Temmuz 2016…",276,"""5+2""","""0""","""Rise 4""","""4""","""Combi""","""4""",false,"""True""",false,"""Owner""","""True"""
"""İstanbul-Eseny…",1650000,"""Turkish Lira""","""28 Temmuz 2016…",210,"""4+1""","""0""","""22""","""30+""","""Central""","""3""",false,"""True""",false,"""Owner""","""False"""
"""Aydın-Didim-Ça…",220000,"""Turkish Lira""","""28 Temmuz 2016…",80,"""2+1""","""3""","""2""","""3""","""Air Conditioni…","""1""",true,"""False""",,"""Owner""","""False"""


In [9]:
# Number of listings per merged location key.
# NOTE(review): newer Polars releases deprecate GroupBy.count() in favour of
# .len() -- confirm against the installed version.
df_updated.group_by("location").count()

location,count
str,u32
"""Çanakkale-Merk…",17
"""Uşak-Merkez-At…",5
"""Hatay-Belen-Sa…",2
"""Konya-Selçuklu…",22
"""Kilis-Merkez-A…",3
"""Kayseri-Pınarb…",1
"""Ankara-Etimesg…",16
"""Trabzon-Ortahi…",2
"""Antalya-Kepez-…",5
"""Düzce-Merkez-Ş…",4



## Eliminate currency feature 


In [10]:
# Number of listings per currency; used to see which currencies need converting.
# NOTE(review): newer Polars releases deprecate GroupBy.count() in favour of
# .len() -- confirm against the installed version.
df_updated.group_by("currency").count()

currency,count
str,u32
"""Turkish Lira""",32004
"""Euro""",167
"""US Dollar""",1076
"""British Pound""",25


In [11]:
# Indicative exchange rates of the Central Bank of the Republic of Türkiye
# (TCMB), as set on 02.01.2017 at 15:30.
# https://www.tcmb.gov.tr/kurlar/kurlar_tr.html


def _mid_rate(buy, sell):
    """Return the midpoint between a buying and a selling rate."""
    return (buy + sell) / 2


dolar_buy, dolar_sell = 3.5338, 3.5402
euro_buy, euro_sell = 3.7086, 3.7153
brit_buy, brit_sell = 4.3488, 4.3715

dolar = _mid_rate(dolar_buy, dolar_sell)
euro = _mid_rate(euro_buy, euro_sell)
brit = _mid_rate(brit_buy, brit_sell)

# Multiplier that converts one unit of each currency into Turkish Lira.
currency_dict = {
    "Euro": euro,
    "US Dollar": dolar,
    "British Pound": brit,
    "Turkish Lira": 1.0,
}

In [12]:
# Replace the (currency, price) pair with a single lira-denominated price.
# Skipped on re-run, once "currency" has already been consumed.
if "currency" in df_updated.columns:

    def _price_in_lira(row):
        """Multiply the row's price by the lira rate of its currency."""
        return currency_dict[row["currency"]] * row["price"]

    lira_price = pl.struct(["currency", "price"]).map_elements(_price_in_lira).alias("price_tr")
    other_columns = pl.col("*").exclude("currency", "price")
    df_updated = df_updated.select(lira_price, other_columns)
df_updated

price_tr,location,date,m2,rooms,age,floor,tfloor,heat,bath,furn,resid,loan,saler,exc
f64,str,str,i64,str,str,str,str,str,str,bool,str,bool,str,str
225000.0,"""İstanbul-Sanca…","""28 Temmuz 2016…",85,"""2+1""","""5-10""","""High Entrance""","""5""","""Combi""","""1""",false,"""False.1""",true,"""Owner""","""True.1"""
360000.0,"""İstanbul-Üsküd…","""28 Temmuz 2016…",87,"""2+1""","""0""","""Ground Floor""","""9""","""Combi""","""1""",false,"""True""",true,"""Owner""","""False"""
335000.0,"""İstanbul-Arnav…","""27 Temmuz 2016…",160,"""4+1""","""0""","""4""","""5""","""Combi""","""1""",false,"""True""",true,"""Owner""","""False"""
235000.0,"""Antalya-Muratp…","""28 Temmuz 2016…",165,"""4+1""","""5-10""","""Rise 4""","""4""","""Floor Calorie""","""2""",false,"""False""",true,"""Owner""","""True"""
229000.0,"""Manisa-Yunusem…","""28 Temmuz 2016…",208,"""5+1""","""0""","""Rise 4""","""5""","""Combi""","""2""",false,"""False""",true,"""Owner""","""False"""
300000.0,"""Hatay-Belen-Fa…","""28 Temmuz 2016…",220,"""4+2""","""0""","""6""","""6""","""Combi""","""2""",false,"""True""",true,"""Owner""","""False"""
370000.0,"""Ankara-Çankaya…","""28 Temmuz 2016…",100,"""3+1""","""21-25""","""3""","""3""","""Combi""","""1""",false,"""False""",true,"""Owner""","""False"""
315000.0,"""Nevşehir-Merke…","""28 Temmuz 2016…",276,"""5+2""","""0""","""Rise 4""","""4""","""Combi""","""4""",false,"""True""",false,"""Owner""","""True"""
1.65e6,"""İstanbul-Eseny…","""28 Temmuz 2016…",210,"""4+1""","""0""","""22""","""30+""","""Central""","""3""",false,"""True""",false,"""Owner""","""False"""
220000.0,"""Aydın-Didim-Ça…","""28 Temmuz 2016…",80,"""2+1""","""3""","""2""","""3""","""Air Conditioni…","""1""",true,"""False""",,"""Owner""","""False"""


In [13]:
# Inspect ten randomly drawn rows.
df_updated.sample(10)

price_tr,location,date,m2,rooms,age,floor,tfloor,heat,bath,furn,resid,loan,saler,exc
f64,str,str,i64,str,str,str,str,str,str,bool,str,bool,str,str
255000.0,"""İstanbul-Malte…","""2017-09-03""",90,"""2+1""","""0""","""High Entrance""","""4""","""Combi""","""1""",False,"""False""",True,"""Real Estate Of…","""False"""
175000.0,"""Manisa-Yunusem…","""2017-08-28""",75,"""1+1""","""0""","""3""","""3""","""Combi""","""1""",False,"""True""",True,"""Real Estate Of…","""False"""
200000.0,"""İstanbul-Bahçe…","""2017-08-28""",85,"""2+1""","""5-10""","""High Entrance""","""5""","""Combi""","""1""",False,"""False""",True,"""Real Estate Of…","""False"""
135000.0,"""Şanlıurfa-Hali…","""2017-08-23""",170,"""3+1""","""2""","""Rise 1""","""1""","""Stove""","""1""",False,"""False""",True,"""Owner""","""True"""
190000.0,"""İstanbul-Sanca…","""2017-09-03""",110,"""3+1""","""0""","""1""","""5""","""Combi""","""2""",False,"""False""",True,"""Real Estate Of…","""False"""
260000.0,"""İstanbul-Tuzla…","""2017-08-20""",85,"""2+1""","""0""","""4""","""4""","""Combi""","""1""",,"""False""",True,"""Owner""","""False"""
140000.0,"""Antalya-Serik-…","""2017-08-28""",61,"""1+1""","""5-10""","""1""","""1""","""Combi""","""1""",False,"""True""",True,"""Bank""","""False"""
230000.0,"""Kocaeli-Çayıro…","""2017-08-21""",120,"""3+1""","""5-10""","""5""","""6""","""Combi""","""1""",False,"""False""",True,"""Bank""","""False"""
390000.0,"""İstanbul-Ümran…","""2017-08-23""",95,"""2+1""","""5-10""","""2""","""5""","""Combi""","""2""",False,"""True""",True,"""Real Estate Of…","""True"""
257000.0,"""Kocaeli-Derinc…","""2017-08-26""",180,"""3+1""","""3""","""Penthouse""","""5""","""Combi""","""2""",False,"""False""",True,"""Owner""","""True"""


In [14]:
# Columns remaining after the location merge and the price conversion.
df_updated.columns

['price_tr',
 'location',
 'date',
 'm2',
 'rooms',
 'age',
 'floor',
 'tfloor',
 'heat',
 'bath',
 'furn',
 'resid',
 'loan',
 'saler',
 'exc']

## Update dates with timestamp

In [15]:
# Turkish month names mapped to their two-digit month numbers.
aylar = {
    "Ocak": "01",
    "Şubat": "02",
    "Mart": "03",
    "Nisan": "04",
    "Mayıs": "05",
    "Haziran": "06",
    "Temmuz": "07",
    "Ağustos": "08",
    "Eylül": "09",
    "Ekim": "10",
    "Kasım": "11",
    "Aralık": "12",
}


def transform_date(date):
    """Rewrite a Turkish "day month year" string as "year-month-day".

    The whitespace-separated parts are reversed and joined with "-", then
    every Turkish month name is replaced by its number. Values that are not
    strings are returned unchanged.
    """
    if not isinstance(date, str):
        return date

    converted = "-".join(reversed(date.split()))
    for month_name, month_number in aylar.items():
        converted = converted.replace(month_name, month_number)
    return converted

def transform_date_to_ms(date_str) -> int:
    """Convert a date to whole seconds since the Unix epoch.

    Despite the ``_ms`` suffix the result is in seconds, not milliseconds;
    the unit is kept so existing values stay comparable.

    Args:
        date_str: Either a date string or a value that is already numeric.
            ``YYYY-MM-DD`` strings are parsed strictly in that order. Any
            other string falls back to ``dateutil`` with ``dayfirst=True``.

    Returns:
        The floored epoch timestamp. Dates without a time zone are taken as
        UTC so the result does not depend on the machine's local zone.
        Non-string input is returned as ``int(date_str)``.
    """
    import math
    from datetime import datetime, timezone

    if not isinstance(date_str, str):
        return int(date_str)

    try:
        # Parse ISO dates explicitly: dateutil with dayfirst=True reads
        # "2017-09-03" as 9 March 2017 (year-day-month) whenever the day
        # is <= 12, silently swapping day and month.
        parsed = datetime.strptime(date_str.strip(), "%Y-%m-%d")
    except ValueError:
        from dateutil import parser

        parsed = parser.parse(date_str, dayfirst=True)

    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return math.floor(parsed.timestamp())

# Two element-wise passes over "date": first normalise the text with
# transform_date, then turn it into an epoch number with transform_date_to_ms.
date_as_epoch = pl.col("date").map_elements(transform_date).map_elements(transform_date_to_ms)
df_updated = df_updated.with_columns(date_as_epoch)
df_updated


## Bath transformation

In [None]:
# Summary statistics of the frame in its current state.
df_updated.describe()

## Enumeration Process

In [None]:
# Snapshot the string-typed columns first, then append a "<name> rank"
# column for each of them via set_rank.
string_columns = [name for name, dtype in df_updated.schema.items() if dtype == pl.String]
for name in string_columns:
    df_updated = set_rank(name, df_updated)
df_updated

## Create correlation matrix

In [None]:

# Correlation matrix over the non-string columns, computed on rows that have no nulls.
df_updated.drop_nulls().select(pl.exclude(pl.String)).corr()


`rooms` has a 0.57 correlation with `bath`, so we can fill the null `rooms` values using the `bath` feature.

`heat` has a 0.077738 correlation with `tfloor` and a 0.04684 correlation with `floor`.


In [None]:
def _fill_with_group_mode(column: str, group: str) -> pl.Expr:
    """Fill nulls in ``column`` with the most frequent value of its ``group``.

    Built from native expressions evaluated per window, so it does not depend
    on how a Python UDF is invoked inside ``.over()``. Ties between several
    modes are resolved by taking the smallest one, which keeps the result
    deterministic. A group with no non-null value stays null.
    """
    values = pl.col(column)
    group_mode = values.drop_nulls().mode().min()
    return values.fill_null(group_mode).over(group)


# Applied one after another on purpose: the last step groups by the
# "rooms rank" column that the first step has already filled.
for column, group in [
    ("rooms rank", "bath rank"),
    ("heat rank", "tfloor rank"),
    ("bath rank", "rooms rank"),
]:
    df_updated = df_updated.with_columns(_fill_with_group_mode(column, group))
df_updated.describe()

In [None]:
# Names to leave out of the frame; pl.exclude simply ignores any that are absent.
unused_columns = ("furn rank", "resid rank", "loan rank", "furn", "loan")
df_updated = df_updated.select(pl.exclude(*unused_columns))

# Summary of the non-string columns that remain.
non_string_columns = df_updated.select(pl.exclude(pl.String))
non_string_columns.describe()

## Drop Outliers

In [None]:
# Tukey fences on the lira price: keep only rows strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
l = "price_tr"
price = pl.col(l)

Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1
LOWER, UPPER = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

inside_fences = (price > LOWER) & (price < UPPER)
df_updated = df_updated.filter(inside_fences)
df_updated.describe()

## Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the non-string columns for modelling.
df_model = df_updated.select(pl.exclude(pl.String))

# Target (kept as a one-column frame) and predictors, converted to pandas.
target_column = "price_tr"
y = df_model.select(target_column).to_pandas()
X = df_model.select(pl.exclude(target_column)).to_pandas()

# 80 % training / 20 % test split with a fixed seed.
split = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
X_train.describe()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits. X_train / X_test are rebound to the scaled
# arrays.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Number of samples

In [None]:
# Sanity check: features and targets of each split must have the same length.
print("Number of samples in X_train:", len(X_train))
print("Number of samples in y_train:", len(y_train))
print("Number of samples in X_test:", len(X_test))
# The label previously said "y_valid" although the value printed is y_test.
print("Number of samples in y_test:", len(y_test))

### Collect the scores of all trained models

In [None]:
# One (train_score, test_score, model_name) tuple is appended per model below.
pred_results = []

### A general function that prints all important metrics

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def _split_metrics(y_true, y_pred, features, model):
    """Return the ordered ``(label, value)`` metric pairs for one data split."""
    mse = mean_squared_error(y_true, y_pred)
    return [
        ("Mean Squared Error (MSE)", mse),
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` keyword is deprecated and removed in recent scikit-learn.
        ("Root Mean Squared Error (RMSE)", mse ** 0.5),
        ("Mean Absolute Error (MAE)", mean_absolute_error(y_true, y_pred)),
        ("R-squared (R2)", r2_score(y_true, y_pred)),
        # Whatever model.score() returns for this split.
        ("Accuracy", model.score(features, y_true)),
    ]


def print_model_stats(y_train_pred, y_test_pred, model):
    """Print regression metrics for both splits and plot actual vs. predicted.

    The targets and feature matrices are not parameters: the function reads
    the notebook-level ``y_train``, ``y_test``, ``X_train`` and ``X_test``.
    They must therefore be the splits the predictions were made on, in the
    form ``model`` was fitted on.

    Args:
        y_train_pred: Predictions for the training split.
        y_test_pred: Predictions for the test split.
        model: Fitted estimator; its ``score`` method supplies the value
            printed as "Accuracy".
    """
    # Compute everything first so a failure prints nothing half-finished.
    train_metrics = _split_metrics(y_train, y_train_pred, X_train, model)
    test_metrics = _split_metrics(y_test, y_test_pred, X_test, model)

    print("Metrics for Training Set:")
    for label, value in train_metrics:
        print(f"{label}: {value}")

    print("\n")

    print("Metrics for Test Set:")
    for label, value in test_metrics:
        print(f"{label}: {value}")

    # Visualize both splits on one actual-vs-predicted scatter plot.
    plt.scatter(y_train, y_train_pred, label='Training Set')
    plt.scatter(y_test, y_test_pred, label='Test Set', alpha=0.5)  # alpha keeps overlapping points visible
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.title('Actual vs. Predicted Values')
    plt.show()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predictions and score() values for both splits.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
y_train_score, y_test_score = model.score(X_train, y_train), model.score(X_test, y_test)

# Record the scores for the final comparison chart, then print the full report.
pred_results.append((y_train_score, y_test_score, "Linear Regression"))
print_model_stats(y_train_pred, y_test_pred, model)

## Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded like the split and the random forests in this notebook, so that
# re-running the cell reproduces the same tree and the same scores.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

# Predictions for both splits.
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

# score() values recorded for the final comparison chart.
y_train_score = tree.score(X_train, y_train)
y_test_score = tree.score(X_test, y_test)

pred_results.append((y_train_score, y_test_score, "DT"))

print_model_stats(y_train_pred, y_test_pred, tree)

## Decision Tree Regression with Cross Validation

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

tree = DecisionTreeRegressor(random_state=42)

# Cross-validate on the training split only. Previously the train and test
# splits were stacked and the final tree was fitted on both, so the test
# score was measured on rows the model had already seen.
cv_scores = cross_val_score(tree, X_train, y_train, cv=5)
print(f"Cross-validation scores (5 folds): {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f} (std {cv_scores.std():.4f})\n")

# Fit the final model on the training split; the test split stays unseen.
tree.fit(X_train, y_train)

# Make predictions on the training and test sets.
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

# score() values recorded for the final comparison chart.
y_train_score = tree.score(X_train, y_train)
y_test_score = tree.score(X_test, y_test)

pred_results.append((y_train_score, y_test_score, "DT CV"))

print_model_stats(y_train_pred, y_test_pred, tree)

## Decision Tree with Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their possible values
# Hyperparameters to search and their candidate values.
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tree = DecisionTreeRegressor()

# 5-fold grid search, ranked by negative mean squared error, using all cores.
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# Search on the training data only.
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search.
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}\n')

# Build the final tree from the parameters the search selected. Previously
# hard-coded values were used (max_depth=15 is not even in the grid), so the
# search result was ignored.
best_tree = DecisionTreeRegressor(**best_params)

# Train the model on the entire training set.
best_tree.fit(X_train, y_train)

# Predictions for both splits.
y_train_pred = best_tree.predict(X_train)
y_test_pred = best_tree.predict(X_test)

y_train_score = best_tree.score(X_train, y_train)
y_test_score = best_tree.score(X_test, y_test)

pred_results.append((y_train_score, y_test_score, "DT Grid Search"))

# Report on the tuned tree. Previously `model` (the linear regression from
# an earlier cell) was passed here, so the "Accuracy" lines described the
# wrong estimator.
print_model_stats(y_train_pred, y_test_pred, best_tree)


## Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Re-split the original X / y (80 % train, 20 % test, fixed seed); this
# rebinds X_train / X_test / y_train / y_test for the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a 100-tree random forest and train it on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and score() values for both splits.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
y_train_score, y_test_score = model.score(X_train, y_train), model.score(X_test, y_test)

# Record the scores for the final comparison chart, then print the full report.
pred_results.append((y_train_score, y_test_score, "RF"))
print_model_stats(y_train_pred, y_test_pred, model)

## Random Forest with Scaler

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. Tree ensembles split on thresholds and do not
# need scaling; it is done here only to compare against the unscaled forest.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the RandomForestRegressor.
rf_model = RandomForestRegressor(random_state=42)

# Fit the model to the scaled training data.
rf_model.fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred = rf_model.predict(X_train_scaled)
y_test_pred = rf_model.predict(X_test_scaled)

# Score on the scaled matrices the forest was trained on. Previously the
# unscaled X_train / X_test were passed, so the recorded scores did not
# describe this model.
y_train_score = rf_model.score(X_train_scaled, y_train)
y_test_score = rf_model.score(X_test_scaled, y_test)

pred_results.append((y_train_score, y_test_score, "RF Scaled"))

# print_model_stats scores the model against the notebook-level X_train /
# X_test, so point them at the scaled matrices (as the Feature Scaling cell
# does) before reporting.
X_train, X_test = X_train_scaled, X_test_scaled
print_model_stats(y_train_pred, y_test_pred, rf_model)

## Scatter Plots

In [None]:
# One actual-vs-predicted figure per split, using the most recent predictions:
# the training set in blue, then the test set in red.
for actual, predicted, colour, split_name in (
    (y_train, y_train_pred, 'blue', 'Training Set'),
    (y_test, y_test_pred, 'red', 'Test Set'),
):
    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predicted, color=colour, label=f'Actual vs. Predicted ({split_name})')
    plt.title(f'Actual vs. Predicted Values - {split_name}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.show()

## Visualize scores of algorithms

In [None]:
# Leave out the most recently appended entry of pred_results (the
# "RF Scaled" run when the cells are executed in order).
new_pred_results = pred_results[:-1]

if not new_pred_results:
    # zip(*[]) cannot be unpacked into three names; say so instead of crashing.
    print("No model scores to plot.")
else:
    # Split the (train_score, test_score, name) tuples into parallel sequences.
    train_scores, test_scores, algorithms = zip(*new_pred_results)

    # Draw the two series side by side. Previously both were drawn at the
    # same x positions, so bars and value labels overlapped.
    bar_width = 0.4
    positions = list(range(len(algorithms)))
    train_x = [p - bar_width / 2 for p in positions]
    test_x = [p + bar_width / 2 for p in positions]

    plt.figure(figsize=(10, 5))
    plt.bar(train_x, train_scores, width=bar_width, color='blue', alpha=0.7, label='Training Score')
    plt.bar(test_x, test_scores, width=bar_width, color='orange', alpha=0.7, label='Test Score')

    # Value label above every bar.
    for x, score in zip(train_x + test_x, train_scores + test_scores):
        plt.text(x, score + 0.01, f'{score:.2f}', ha='center', va='bottom')

    # One tick per algorithm, centred under its pair of bars.
    plt.xticks(positions, algorithms)

    # Headroom above 1.0 so labels of near-perfect scores stay inside the axes.
    plt.ylim(0, 1.1)
    plt.title('Training and Test Scores of Regression Algorithms')
    plt.xlabel('Algorithms')
    plt.ylabel('Scores')
    plt.legend()
    plt.show()