# preprocessing
- find null values
- replace with feature mean
- find outliers (especially m2)
- enumerate categorical features
- drop title col
- drop id col
- convert all prices to TRY (Turkish lira)
- drop lat lon
- convert date values to a common format
- drop type (because all values are "flat")
- drop currency
- remove outlier prices (25000 TL, 8500000TL)

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Path of the raw listings file; read it into the working DataFrame `df`.
FILENAME = "real_estate_data.csv"
df = pd.read_csv(FILENAME)
# Bare expression: the notebook renders the loaded frame.
df

In [None]:
# Rows whose 'bath' value is missing
bath_is_missing = df['bath'].isnull()
null_bath_rows = df.loc[bath_is_missing]
# NOTE(review): isnull() yields booleans, so count() reports the number of rows
# per column rather than the number of nulls; the value is not displayed either.
null_bath_rows.isnull().count()
# Rows whose 'rooms' value is the "Unknown" placeholder (this is what the cell shows)
df.loc[df["rooms"] == "Unknown"]

# Set rooms property

In [None]:
# Convert 'm2' column to numeric if it's not already (unparseable values become NaN)
df['m2'] = pd.to_numeric(df['m2'], errors='coerce')

# Fill each "Unknown" rooms value with the most common rooms value among listings
# that have the same m2. Only rows with a known rooms value may act as donors;
# otherwise the "Unknown" placeholder itself could win the vote and nothing changes.
known_rooms = df[df["rooms"] != "Unknown"]

for i in df.index[df["rooms"] == "Unknown"]:
    # `i` is an index label, so use label-based access
    m2 = df.at[i, "m2"]
    mode = known_rooms.loc[known_rooms["m2"] == m2, "rooms"].mode()
    if not mode.empty:  # no donor with this m2 (or m2 is NaN): leave the row as-is
        df.at[i, "rooms"] = mode.iloc[0]  # ties: take the first mode value

# Set residence property

In [None]:
# Overwrite 'resid' with a boolean flag: True only where the original value was
# 'unknown' AND the price exceeds 150000.
# NOTE(review): this discards the original 'resid' labels — confirm that is intended.
resid_is_unknown = df['resid'].eq('unknown')
price_above_threshold = df['price'].gt(150000)
df['resid'] = resid_is_unknown & price_above_threshold

In [None]:
# Treat the literal string 'None' as a missing value.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
new_df = df.replace('None', np.nan)

# Remove rows with any null values
df_cleaned = new_df.dropna()

# Save the cleaned DataFrame to a new CSV file (without the index column)
df_cleaned.to_csv('cleaned_data.csv', index=False)

# Display the cleaned DataFrame
df_cleaned



In [None]:

# Rebind `df` to the contents of 'cleaned_data.csv' read back from disk.
df = pd.read_csv('cleaned_data.csv')



In [None]:
# Number of null cells per column.
missing_values_summary = df.isnull().sum()
# Bare expression: the notebook renders the summary.
missing_values_summary


# DROP FEATURES

## Drop lat - lon - type - title - id - due and Status features from dataframe

In [None]:
# Columns to remove; each one is dropped only if it is actually present,
# so the cell can be re-run safely.
columns_to_remove = ['lat', 'lon', 'type', 'title', 'Id', 'due', 'Status', 'resid']
present_columns = [name for name in columns_to_remove if name in df.columns]
df.drop(columns=present_columns, inplace=True)

df

## Convert all prices to their Turkish lira equivalent and drop the currency feature

In [None]:

if "currency" in df.columns:
    # Multiplier applied to 'price' for each foreign currency label.
    # NOTE(review): these are fixed, hard-coded rates — confirm they match the
    # period the listings come from.
    rate_by_currency = {'Euro': 4, 'US Dollar': 3.5, 'British Pound': 4.5}
    for currency_label, rate in rate_by_currency.items():
        df.loc[df['currency'] == currency_label, 'price'] *= rate

    # The column is no longer needed once the prices have been scaled
    df.drop(columns="currency", inplace=True)
df["price"]




## Outlier detection using the interquartile range (IQR)

In [None]:
# Quartiles of the price distribution and the resulting interquartile range
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# Prices further than 1.5 * IQR outside the quartiles are treated as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_condition = (df['price'] < lower_fence) | (df['price'] > upper_fence)

# (original author's note, translated) a situation encountered only with Turkish lira
# Collect the outlier rows, then remove them from df in place
outliers = df[outlier_condition]
df.drop(outliers.index, inplace=True)
df


## Date transformation

In [None]:
# df.loc()

# for tr, en in turkish_months.values():
#     df.loc[df['date'].str.contains(tr), 'price']

# Turkish month names mapped to their two-digit month numbers
aylar = {
    "Ocak": "01", "Şubat": "02", "Mart": "03", "Nisan": "04",
    "Mayıs": "05", "Haziran": "06", "Temmuz": "07", "Ağustos": "08",
    "Eylül": "09", "Ekim": "10", "Kasım": "11", "Aralık": "12",
}


def transform_date(date):
    """Rewrite a whitespace-separated date string in reversed, dash-joined order.

    The parts of the string are reversed and joined with '-', and every Turkish
    month name found in `aylar` is replaced by its two-digit number, e.g.
    "15 Ocak 2023" -> "2023-01-15".

    Non-string values are returned unchanged.
    """
    if not isinstance(date, str):
        return date

    reordered = '-'.join(reversed(date.split()))
    for month_name, month_code in aylar.items():
        reordered = reordered.replace(month_name, month_code)
    return reordered

def transform_date_to_ms(date_str) -> int:
    """Convert a date string to a whole-number Unix timestamp.

    Despite the function name, the value is in *seconds* (the floor of
    ``datetime.timestamp()``), not milliseconds. Parsed datetimes carry no
    timezone unless the string contains one, so the result depends on the
    machine's local timezone.

    Strings in year-first ``YYYY-MM-DD`` form (what ``transform_date`` produces)
    are parsed explicitly. Passing them to dateutil with ``dayfirst=True`` would
    swap day and month whenever the day is <= 12 (e.g. "2023-01-05" would become
    1 May 2023). Any other string falls back to dateutil with ``dayfirst=True``,
    as before.

    Args:
        date_str: a date string, or a value that ``int()`` accepts.

    Returns:
        The timestamp as an int; non-string input is returned as ``int(date_str)``.

    Raises:
        ValueError: if the string cannot be parsed, or a non-string value cannot
            be converted by ``int()`` (e.g. NaN).
    """
    import math
    from datetime import datetime

    if not isinstance(date_str, str):
        return int(date_str)

    try:
        # strptime accepts unpadded day/month, so "2023-01-5" parses too
        parsed = datetime.strptime(date_str.strip(), "%Y-%m-%d")
    except ValueError:
        # Not year-first ISO-like: keep the original day-first interpretation
        from dateutil import parser

        parsed = parser.parse(date_str, dayfirst=True)

    return math.floor(parsed.timestamp())


# Transform the 'date' column: first rewrite each string with transform_date,
# then convert the result to an integer timestamp with transform_date_to_ms
df['date'] = df['date'].apply(transform_date).apply(transform_date_to_ms)
df['date']

## Bath transformation

In [None]:
def transform_bath(bath):
    """Convert a bathroom-count string such as "2" or "6+" to a float.

    Any "+" characters are removed before conversion, so "6+" becomes 6.0.
    Non-string values are returned unchanged.
    """
    if not isinstance(bath, str):
        return bath
    # replace() is a no-op when there is no "+", so one expression covers both cases
    return float(bath.replace("+", ""))

# Run transform_bath on every value of the 'bath' column and store the result back.
df["bath"] = df["bath"].apply(transform_bath)
# Bare expression: the notebook renders the updated frame.
df

In [None]:
# Show how often each distinct 'rooms' value occurs.
df["rooms"].value_counts()

In [None]:
# Frequency of each 'rooms' value (same expression as the preceding cell).
df["rooms"].value_counts()

# Enumeration Process

In [None]:
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder

def enumerate_column(column: str, df: DataFrame):
    """Replace `df[column]` in place with integer codes from a fresh LabelEncoder.

    Args:
        column: name of the column to encode.
        df: DataFrame that is modified in place.

    Returns:
        None.
    """
    encoded_values = LabelEncoder().fit_transform(df[column])
    df[column] = encoded_values

# Categorical columns earmarked for label encoding.
# The stray empty-string entry was removed: it is not a column name and would
# fail as soon as the list is passed to enumerate_column.
columns_to_enumerate = ["loc city", "loc county", "loc dist", "rooms", "age", "floor", "heat"]
# Bare expression: the notebook renders the dtype of every column.
df.dtypes


## Date converted from string to epoch seconds

In [None]:
# enumerate_column("loc city", df)

In [None]:
# Make sure 'price' is numeric; values that cannot be parsed become NaN
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Create a box plot for the 'price' column.
# The former `orient='v'` argument was dropped: with only `x` given, seaborn
# ignores a vertical orientation (and warns), so the rendered plot is unchanged.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.boxplot(x='price', data=df)
plt.title('Box Plot for Price')
plt.show()

In [None]:
# Group the column names by the dtype they compare equal to, and report the counts
column_dtypes = df.dtypes

# Columns stored with the generic object dtype
obj = column_dtypes == 'object'
object_cols = obj[obj].index.tolist()
print("Categorical variables:", len(object_cols))

# Columns whose dtype compares equal to 'int'
int_ = column_dtypes == 'int'
num_cols = int_[int_].index.tolist()
print("Integer variables:", len(num_cols))

# Columns whose dtype compares equal to 'float'
fl = column_dtypes == 'float'
fl_cols = fl[fl].index.tolist()
print("Float variables:", len(fl_cols))

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE: no copy is made here — the loop below overwrites the columns of `df` in place.

# Apply label encoder to each column listed in object_cols.
# Values are cast to str first; the same encoder object is re-fitted per column,
# so after the loop it only holds the classes of the last column.
label_encoder = LabelEncoder()
for col in object_cols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))




In [None]:
# Pairwise correlations between the columns of df, drawn as an annotated heatmap
correlation_matrix = df.corr()

plt.figure(figsize=(12, 6))
sns.heatmap(
    correlation_matrix,
    cmap='BrBG',
    fmt='.2f',
    linewidths=2,
    annot=True,
)

In [None]:
# plt.figure(figsize=(18, 36))
# plt.title('Categorical Features: Distribution')
# plt.xticks(rotation=90)
# index = 1

# for col in object_cols:
# 	y = df[col].value_counts()
# 	plt.subplot(11, 4, index)
# 	plt.xticks(rotation=90)
# 	sns.barplot(x=list(y.index), y=y)
# 	index += 1

In [None]:
# Split

In [None]:
from sklearn.model_selection import train_test_split

# 'price' is the prediction target; every other column is a predictor
y = df['price']
X = df.drop(columns='price')

# 80 % of the rows go to training, 20 % to testing; the seed is fixed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)
X_train.describe()


In [None]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a 100-tree random forest regressor and return its mean absolute error.

    The model is trained on (X_train, y_train) with a fixed random_state of 0
    and evaluated against y_valid using its predictions for X_valid.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

# Names of the training columns that contain at least one null value
has_missing = X_train.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()

# Remove those columns from both the training and the test predictors
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_test.drop(columns=cols_with_missing)

print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_test))


In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training predictors only, then apply
# that same scaling to both splits. X_train / X_test are rebound to the
# transformed output.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Sanity check: each predictor split must have as many rows as its target split.
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),  # was labelled "y_valid", although the value printed is y_test
):
    print(f"Number of samples in {split_name}:", len(split))


In [None]:
from sklearn.ensemble import RandomForestClassifier

def RandomForest(X_train,y_train, X_test, y_test):
    """Train a RandomForestClassifier on scaled features and report test metrics.

    Prints accuracy and F1 on the test split and draws the confusion matrix as
    a heatmap titled 'Test Data'. Returns None.

    The scaler is created locally and fitted on the training split only; the
    test split is transformed with those same parameters. Previously the shared
    global scaler `sc` was re-fitted on the test split, which scaled the two
    splits inconsistently and overwrote the state of `sc` for later cells.

    NOTE(review): f1_score is called with its default averaging, so this helper
    presumably expects a binary class-label target — confirm against callers.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # reuse training statistics, no re-fit

    clf = RandomForestClassifier()
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    print("accuracy: ",accuracy_score(y_test, y_pred))
    print("f1_score: ",f1_score(y_test, y_pred, zero_division=1))
    sns.heatmap(
        confusion_matrix(y_test, y_pred), annot=True, fmt='.5g', cmap="YlGn"
    ).set_title('Test Data')

In [None]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the test split and measure the squared error of those predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# model.score() is R-squared for a regressor; the printed label calls it "Accuracy"
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')


This can only mean one thing: *linear regression is ineffective for this dataset*.
# NOTE(review): y_train here is the 'price' target produced by the train/test split,
# while RandomForest() fits a RandomForestClassifier and reports accuracy / F1 —
# confirm that a classifier is really intended for this target.
RandomForest(X_train,y_train, X_test,  y_test)


In [None]:
# decision tree regression

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Fit a decision tree regressor on the training split.
# NOTE: binding the name `tree` here shadows the `tree` module imported from
# sklearn earlier in this file.
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

# Predict the test split and measure the squared error of those predictions
y_pred = tree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# tree.score() is R-squared for a regressor; the printed label calls it "Accuracy"
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
# # five element from df
# X_train.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Rebuild predictors and target from `df`: 'price' is the target,
# every other column is a feature
y = df['price']
X = df.drop(columns='price')

# Hold out 20 % of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a random forest regressor on the scaled training data
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out rows and report MSE together with its square root
y_pred = rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Works on the DataFrame `df` prepared by the earlier cells.

# Label-encode the listed columns in place. One encoder object is re-fitted for
# each column in turn, so afterwards it only holds the classes of the last one.
label_encoder = LabelEncoder()
columns_to_encode = [
    'loc city', 'loc county', 'loc dist', 'heat', 'bath',
    'furn', 'status', 'loan', 'saler', 'exc',
]
for column_name in columns_to_encode:
    df[column_name] = label_encoder.fit_transform(df[column_name])

# 'price' is the target; all remaining columns are the features
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20 % of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a 100-tree random forest regressor on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions for both splits
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Mean squared error on the training split
mse_train = mean_squared_error(y_train, y_train_pred)
print(f'Training MSE: {mse_train}')

# Mean squared error on the test split
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Test MSE: {mse_test}')
# The fitted model can score new rows via model.predict(new_data), where
# new_data has the same columns as X_train


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Train the model
model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Calculate regression metrics for training set.
# RMSE is taken as the square root of MSE rather than via
# mean_squared_error(..., squared=False): that keyword is deprecated and
# rejected by recent scikit-learn releases.
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Calculate regression metrics for test set
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# model.score() on the TRAINING split (R-squared for a regressor)
train_accuracy = model.score(X_train, y_train)

# Print the results
print("Metrics for Training Set:")
print(f"Mean Squared Error (MSE): {mse_train}")
print(f"Root Mean Squared Error (RMSE): {rmse_train}")
print(f"Mean Absolute Error (MAE): {mae_train}")
print(f"R-squared (R2): {r2_train}")

print("\nMetrics for Test Set:")
print(f"Mean Squared Error (MSE): {mse_test}")
print(f"Root Mean Squared Error (RMSE): {rmse_test}")
print(f"Mean Absolute Error (MAE): {mae_test}")
print(f"R-squared (R2): {r2_test}")
# NOTE: the value below is the training-split score, although it is printed
# after the test-set metrics
print(f"Accuracy: {train_accuracy}")

In [None]:
# model.score() evaluated on the test split; the printed label calls it "Accuracy".
test_accuracy = model.score(X_test, y_test) 
print(f"Accuracy: {test_accuracy}")

In [None]:
# One actual-vs-predicted scatter plot per split: training first, then test.
# Each entry: (actual values, predicted values, marker colour, legend label, title)
scatter_specs = [
    (
        y_train,
        y_train_pred,
        'blue',
        'Actual vs. Predicted (Training Set)',
        'Actual vs. Predicted Values - Training Set',
    ),
    (
        y_test,
        y_test_pred,
        'red',
        'Actual vs. Predicted (Test Set)',
        'Actual vs. Predicted Values - Test Set',
    ),
]

for actual_values, predicted_values, marker_colour, legend_label, plot_title in scatter_specs:
    plt.figure(figsize=(10, 6))
    plt.scatter(actual_values, predicted_values, color=marker_colour, label=legend_label)
    plt.title(plot_title)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.show()