# preprocessing
- find null values
- replace with feature mean
- find outliers (especially m2)
- enumerate categorical features
- drop title col
- drop id col
- convert all prices to TRY
- drop lat lon
- convert date values to a common format
- drop type (because all values are "flat")
- drop currency
- remove outlier prices (25000 TL, 8500000 TL)

In [None]:
import polars as pl
import matplotlib.pyplot as plt

In [None]:
# Load the listings; the literal strings "Unknown" and "None" are read as nulls.
df = pl.read_csv("real_estate_data.csv", null_values=["Unknown", "None"])
# Preview the first rows (notebook cell output).
df.head()

| Feature | Description |
| --- | --- |
| TITLE      | title of the ad |
| ID         | identification number of the ad |
| PRICE      | price of the flat |
| CURRENCY   | currency of the price     |
| LOCCITY    | city of the building |
| LOCOUNTY   | county of the building |
| LOCDIST    | district of the building |
| LAT        | latitude of the location of the building |
| LON        | longitude of the location of the building |
| DATE      | ad release date |
| TYPE      | type of the ad |
| M2        | size of the flat in square meters |
| ROOMS     | Rooms in flat |
| AGE       | age of the building |
| FLOOR     | floor number of the flat |
| TFLOOR    | number of floors in building |
| HEAT      | heating type of the building |
| BATH      | number of bathrooms in the building |
| FURN      | flat is furnished or not |
| STATUS    | occupied by owner, lessee or empty |
| RESID     | building is in residence or not |
| DUE       | monthly dues of the building |
| LOAN      | flat is available for loan or not |
| SALER     | seller of the flat is owner, real estate office or construction company |
| EXC       | exchange is possible or not |

## Enumerate Function


In [None]:
def set_rank(feature: str, df: pl.DataFrame) -> pl.DataFrame:
    """Return `df` with a dense-rank column for `feature` appended.

    The new column is named ``"<feature> rank"``.  When a column with that
    name is already present, `df` is returned unchanged.
    """
    rank_column = f"{feature} rank"
    if rank_column in df.columns:
        return df
    dense_rank = pl.col(feature).rank("dense")
    return df.with_columns(dense_rank.alias(rank_column))

In [None]:
# Number of nulls in every column.
df.null_count()

In [None]:
# Summary statistics of the raw frame.
df.describe()

## Enumerate features ()

In [None]:
# Column names as loaded from the CSV.
df.columns

## Drop unnecessary Features lat - lon - type - title - id - due and Status features from dataframe

In [None]:
# Keep every column except the ones listed; names that are not present in the
# frame are simply not matched.
# NOTE(review): "Id" is the only capitalised name in this list - confirm it
# matches the CSV header, otherwise the id column stays in the frame.
df_updated = df.select(
    pl.all().exclude("lat", "lon", "type", "title", "Id", "due", "status")
)
df_updated.columns

## Merge all loc * to location column

In [None]:
# Guarded so the cell can be re-run after the three source columns are gone.
if "loc city" in df_updated.columns:
    df_updated = df_updated.select(
        # Row-wise Python callback: "<city>-<county>-<district>".
        pl.struct(["loc city", "loc county", "loc dist"])
        .map_elements(
            lambda row: "-".join(
                str(row[part]) for part in ("loc city", "loc county", "loc dist")
            )
        )
        .alias("location"),
        pl.exclude(["loc city", "loc county", "loc dist"]),
    )
df_updated
    

In [None]:
# Number of rows per merged location value.
df_updated.group_by("location").count()


## Eliminate currency feature 


In [None]:
# Number of rows per currency; shows which currencies need a conversion rate.
df_updated.group_by("currency").count()

In [None]:
# Indicative exchange rates of the Central Bank of the Republic of Turkey,
# set at 15:30 on 02.01.2017.
# https://www.tcmb.gov.tr/kurlar/kurlar_tr.html

# Each currency uses the midpoint of its buy and sell rate.
dolar_buy = 3.5338
dolar_sell = 3.5402
dolar = (dolar_buy + dolar_sell) / 2

euro_buy = 3.7086
euro_sell = 3.7153
euro = (euro_buy + euro_sell) / 2

brit_buy = 4.3488
brit_sell = 4.3715
brit = (brit_buy + brit_sell) / 2


# Multiplier applied to a price in the given currency to express it in Turkish Lira.
currency_dict = {"Euro": euro, "US Dollar": dolar, "British Pound": brit, "Turkish Lira": 1.0}

In [None]:
# Replace (currency, price) with a single Lira-denominated "price_tr" column.
# Guarded so the cell can be re-run once "currency" has been removed.
if "currency" in df_updated.columns:
    df_updated = df_updated.select(
        pl.struct(["currency", "price"])
        .map_elements(lambda row: row["price"] * currency_dict[row["currency"]])
        .alias("price_tr"),
        pl.all().exclude("currency", "price"),
    )
df_updated

In [None]:
# Spot-check ten random rows after the price conversion.
df_updated.sample(10)

In [None]:
# Columns remaining after the transformations so far.
df_updated.columns

## Update dates with timestamp

In [None]:
# Turkish month names mapped to their two-digit month numbers.
aylar = {
    "Ocak": "01",
    "Şubat": "02",
    "Mart": "03",
    "Nisan": "04",
    "Mayıs": "05",
    "Haziran": "06",
    "Temmuz": "07",
    "Ağustos": "08",
    "Eylül": "09",
    "Ekim": "10",
    "Kasım": "11",
    "Aralık": "12",
}


def transform_date(date):
    """Rewrite a whitespace-separated date string in reversed, dash-joined form.

    The whitespace-separated parts are reversed and joined with "-", then
    every Turkish month name from `aylar` is replaced by its number, e.g.
    "2 Ocak 2017" becomes "2017-01-2".  Non-string values are returned as is.
    """
    if not isinstance(date, str):
        return date

    reordered = "-".join(reversed(date.split()))
    for month_name, month_number in aylar.items():
        reordered = reordered.replace(month_name, month_number)
    return reordered

def transform_date_to_ms(date_str) -> int:
    """Convert a date value to a whole-second Unix timestamp.

    Despite the ``_ms`` suffix (kept so existing callers keep working), the
    result is in seconds, rounded down.

    Strings are parsed with ``dateutil``.  Strings that start with a
    four-digit year, such as the ``"2017-01-2"`` produced by
    ``transform_date``, are read as year-month-day; every other string keeps
    the day-first reading (``"02.01.2017"`` is 2 January).  Parsed values
    without time-zone information are taken as UTC.  Non-string values are
    passed to ``int``.

    Raises whatever ``dateutil.parser.parse`` raises for an unparseable
    string, and whatever ``int`` raises for an unconvertible value.
    """
    import math
    from datetime import timezone

    from dateutil import parser

    if not isinstance(date_str, str):
        return int(date_str)

    # dateutil also applies ``dayfirst=True`` to the two fields that follow a
    # leading year, so "2017-01-2" used to be read as 1 February whenever the
    # day was <= 12.  Only request day-first parsing when no year leads.
    year_first = date_str.strip()[:4].isdigit()
    parsed = parser.parse(date_str, dayfirst=not year_first, yearfirst=year_first)

    # A naive datetime would be interpreted in the machine's local zone, which
    # makes the resulting feature differ from one machine to the next.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return math.floor(parsed.timestamp())

# Two row-wise passes over "date": first normalise the text form, then turn
# it into an integer timestamp.
df_updated = df_updated.with_columns(
    pl.col("date")
    .map_elements(transform_date)
    .map_elements(transform_date_to_ms)
)
df_updated


## Bath transformation

In [None]:
# Summary statistics after the date conversion.
df_updated.describe()

## Enumeration Process

In [None]:
# Give every string column a numeric "<name> rank" companion column.  The
# schema is read once up front, so columns added inside the loop are not
# visited themselves.
for column, d_type in df_updated.schema.items():
    if d_type == pl.String:
        df_updated = set_rank(column, df_updated)
df_updated

## Create correlation matrix

In [None]:

# Correlation matrix of the non-string columns, computed on rows without any null.
df_updated.drop_nulls().select(pl.exclude(pl.String)).corr()


Rooms has a 0.57 correlation with bath, so null rooms values can be filled using the bath feature.

Heat has a 0.077738 correlation with tfloor and a 0.04684 correlation with floor.


In [None]:
# Fill nulls in a rank column with the mode of that column inside each group
# of the column named in `.over(...)`.
# NOTE(review): the lambda calls Series methods (`fill_null`, `mode`) on `s`, so
# it relies on map_elements handing it one Series per window rather than single
# values - confirm this holds for the installed polars version.
# NOTE(review): `s.mode()` can hold more than one value when a group has several
# modes; verify that `fill_null` accepts that.
# rooms: filled within each "bath rank" group.
df_updated = df_updated.with_columns(pl.col("rooms rank").map_elements(lambda s: s.fill_null(s.mode())).over("bath rank")) 
# heat: filled within each "tfloor rank" group.
df_updated = df_updated.with_columns(pl.col("heat rank").map_elements(lambda s: s.fill_null(s.mode())).over("tfloor rank")) 
# bath: filled within each "rooms rank" group (rooms was filled above).
df_updated = df_updated.with_columns(pl.col("bath rank").map_elements(lambda s: s.fill_null(s.mode())).over("rooms rank")) 
df_updated.describe()

In [None]:
# Drop the furn / resid / loan features (rank columns and the listed originals).
df_updated = df_updated.select(
    pl.all().exclude("furn rank", "resid rank", "loan rank", "furn", "loan")
)
# Summary of the remaining non-string columns.
df_updated.select(pl.all().exclude(pl.String)).describe()

## Drop Outliers

In [None]:
# Column whose outliers are removed.
l = "price_tr"

# Quartiles and inter-quartile range, kept as lazy polars expressions.
Q1 = pl.col(l).quantile(0.25)
Q3 = pl.col(l).quantile(0.75)
IQR = Q3 - Q1

# 1.5 * IQR fences around the quartiles.
LOWER = Q1 - 1.5 * IQR
UPPER = Q3 + 1.5 * IQR

# Keep only rows strictly inside the fences (both bounds exclusive).
df_updated = df_updated.filter(pl.col(l).is_between(LOWER, UPPER, closed="none"))
df_updated.describe()

## split data to train and test

In [None]:
from sklearn.model_selection import train_test_split

# Only the non-string columns are used for modelling.
df_model = df_updated.select(pl.all().exclude(pl.String))

# Predictors are everything except the target; the target is "price_tr".
X = df_model.select(pl.exclude("price_tr")).to_pandas()
y = df_model.select("price_tr").to_pandas()

# 80 / 20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=42
)
X_train.describe()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Number of samples

In [None]:
# Row counts of each split; X and y of the same split should match.
print("Number of samples in X_train:", len(X_train))
print("Number of samples in y_train:", len(y_train))
print("Number of samples in X_test:", len(X_test))
# The label previously said "y_valid" although the value printed is len(y_test).
print("Number of samples in y_test:", len(y_test))

### A general function that prints all important metrics

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def _print_split_metrics(title, y_true, y_pred, score):
    """Print MSE, RMSE, MAE, R2 and the model score for one data split."""
    import math

    mse = mean_squared_error(y_true, y_pred)
    # RMSE is taken as the square root of the MSE here instead of passing
    # ``squared=False``, which recent scikit-learn releases deprecate/remove.
    # For the single-target data used in this notebook both give the same value.
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(title)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2): {r2}")
    print(f"Accuracy: {score}")


def print_model_stats(y_train_pred, y_test_pred, model):
    """Print regression metrics for the training and the test split.

    Args:
        y_train_pred: predictions for the training features.
        y_test_pred: predictions for the test features.
        model: fitted estimator; its ``score`` method supplies the
            "Accuracy" line (for scikit-learn regressors that is R2).

    The true targets and the features used for ``model.score`` are NOT
    parameters: they are read from the module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.  Those must therefore hold the same split, in
    the same (scaled or unscaled) form, that `model` was trained on.
    """
    _print_split_metrics(
        "Metrics for Training Set:",
        y_train,
        y_train_pred,
        model.score(X_train, y_train),
    )

    print("\n")

    _print_split_metrics(
        "Metrics for Test Set:",
        y_test,
        y_test_pred,
        model.score(X_test, y_test),
    )

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the (scaled) training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for both splits, then the shared metrics report.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
print_model_stats(y_train_pred, y_test_pred, model)

## Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default settings (no depth limit, no fixed seed).
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

# Predictions for both splits, then the shared metrics report.
y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)
print_model_stats(y_train_pred, y_test_pred, tree)

## Decision Tree with Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their possible values
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tree = DecisionTreeRegressor()

# Create a GridSearchCV object (5-fold CV, lowest mean squared error wins)
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}\n')

# Create a new decision tree regressor with the hyperparameters the search
# actually selected.  These were previously hard-coded (including
# max_depth=15, a value that is not even part of param_grid), so the search
# result was ignored.
best_tree = DecisionTreeRegressor(**best_params)

# Train the model on the entire training set
best_tree.fit(X_train, y_train)

# Make predictions on the training and the test set
y_train_pred = best_tree.predict(X_train)
y_test_pred = best_tree.predict(X_test)

# Report the tuned tree itself; passing `model` here scored the earlier
# LinearRegression instead.
print_model_stats(y_train_pred, y_test_pred, best_tree)


## Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Re-create the split from the unscaled X / y (same seed and test share as
# before); this rebinds the module-level X_train / X_test to unscaled data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest of 100 trees with a fixed seed, trained on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions for the training and the test split.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

print_model_stats(y_train_pred, y_test_pred, model)

## Random Forest with Scaler

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# print_model_stats calls model.score on the module-level X_train / X_test.
# Point them at the scaled features so the model is scored on the same
# representation it is trained on (it was previously scored on unscaled data).
X_train, X_test = X_train_scaled, X_test_scaled

# Initialize the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Fit the model to the training data
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the training and the test set
y_train_pred = rf_model.predict(X_train_scaled)
y_test_pred = rf_model.predict(X_test_scaled)

print_model_stats(y_train_pred, y_test_pred, rf_model)

## Scatter Plots

In [None]:
# One actual-vs-predicted scatter plot per split: training first, then test.
# Uses whichever y_train_pred / y_test_pred were computed last.
for actual, predicted, colour, split_name in (
    (y_train, y_train_pred, 'blue', 'Training Set'),
    (y_test, y_test_pred, 'red', 'Test Set'),
):
    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predicted, color=colour, label=f'Actual vs. Predicted ({split_name})')
    plt.title(f'Actual vs. Predicted Values - {split_name}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.show()