In [36]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset
FILE_PATH = "data.csv"
houses = pd.read_csv(filepath_or_buffer=FILE_PATH)
houses.shape

In [None]:
# Preview the dataset: column dtypes and non-null counts, then five random rows
houses.info()
houses.sample(n=5)

In [None]:
# Data cleaning: count the missing values in each column
houses.isna().sum()

In [40]:
# Remove duplicate listings.
# 'Unnamed: 0' is left out of the comparison. NOTE(review): it is presumably a
# per-row index written by an earlier to_csv call (confirm against the data);
# a value that is unique per row would stop any two rows from ever comparing
# equal, so no duplicate would be removed while it takes part in the check.
comparison_columns = [col for col in houses.columns if col != 'Unnamed: 0']
# `or None` falls back to "compare all columns" if nothing else is left.
houses = houses.drop_duplicates(subset=comparison_columns or None)

In [None]:
# Re-count missing values per column now that duplicates have been removed
houses.isna().sum()

In [None]:
# Show the current dimensions and the first rows.
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly instead of being evaluated and discarded.
print(houses.shape)
houses.head()

In [43]:
# Drop the 'Unnamed: 0' column. errors='ignore' keeps this cell re-runnable:
# once the column is gone, a second run is a no-op instead of a KeyError.
houses = houses.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Missing values per column after dropping 'Unnamed: 0'
houses.isna().sum()

In [None]:
# Convert 'sold_at' and 'limit_price' from text to numbers.
def _parse_localized_number(column):
    """Parse text that uses '.' as thousands separator and ',' as decimal separator.

    Every '.' is removed and every ',' becomes '.', then the result is passed to
    ``pd.to_numeric``; values that still cannot be parsed become NaN.

    A column that is already numeric is returned unchanged. Without this guard,
    re-running the cell would render 150000.0 as "150000.0", strip the dot and
    read the value back as 1500000.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    normalized = (
        column.astype(str)
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False)
    )
    return pd.to_numeric(normalized, errors='coerce')


for price_column in ('sold_at', 'limit_price'):
    houses[price_column] = _parse_localized_number(houses[price_column])

houses.head()

In [None]:
# Turn 'status' into a categorical and keep only its integer category codes.
status_as_category = houses['status'].astype('category')
houses['status'] = status_as_category.cat.codes
houses.head()

In [None]:
# Named entity recognition (NER) uses spaCy's German pipeline "de_core_news_sm"
import spacy
nlp = spacy.load(name="de_core_news_sm")

# Function to extract named entities from text (e.g., location, property type)
def extract_entities(text):
    """Return the named entities spaCy finds in ``text``.

    Args:
        text: One description. Anything that is not a non-blank string
            (None, NaN from a missing cell, "") is treated as "no description".

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, empty when there is
        no usable text.
    """
    # Missing cells arrive as float NaN, which the spaCy pipeline cannot process.
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)  # Process the text using SpaCy's NLP pipeline
    return [(ent.text, ent.label_) for ent in doc.ents]

# Extract entities from every description in 'info', then drop the raw text.
# The guard keeps the cell re-runnable: once 'info' has been dropped, a second
# run would otherwise fail with a KeyError.
if 'info' in houses.columns:
    houses['house_features'] = houses['info'].apply(extract_entities)
    houses = houses.drop(columns=['info'])

# Configure Pandas to display full content of columns without truncation
pd.set_option('display.max_colwidth', None)

# Display the extracted features for a random sample of 5 rows
print(houses['house_features'].sample(5))


In [None]:
# Inspect ten random rows of the prepared frame
houses.sample(n=10)

In [49]:
# Name of the column to predict and the columns used as model inputs
target = 'sold_at'
features = ['status', 'limit_price', 'house_features']

# Target series and feature frame
y = houses[target]
x = houses[features]

In [50]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Flatten each row's list of (text, label) entity tuples into one string so the
# column can be one-hot encoded.
def _entities_to_text(entities):
    """Return the entities of one row as a single comma-separated string.

    A list is rendered as ``str(item)`` for every item, joined by ``', '``;
    any other value (e.g. an already flattened string) is passed through ``str``.
    """
    if isinstance(entities, list):
        return ', '.join(str(entity) for entity in entities)
    return str(entities)


# .assign builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
X_train = X_train.assign(
    house_features=X_train['house_features'].apply(_entities_to_text)
)
X_test = X_test.assign(
    house_features=X_test['house_features'].apply(_entities_to_text)
)

In [None]:
# Spot-check one random training row after flattening house_features
print(X_train.sample(n=1))

In [None]:
# Perform one-hot encoding for house_features.
# drop_first is applied to the training frame only. The two frames are encoded
# separately, so drop_first on X_test would remove the first category *of the
# test split*; if that category is a regular training column, the later
# column alignment would recreate it as all zeros and the affected test rows
# would lose their feature. Keeping every test dummy lets the alignment decide
# which columns survive.
X_train = pd.get_dummies(X_train, columns=['house_features'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['house_features'])

print(X_train.sample(1))

In [55]:
# Give X_test exactly the training columns: dummy columns the test split never
# produced are created as all-zero, test-only dummy columns are discarded.
# (X_train already has these columns, so it is left as it is.)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# fill_value only populates the newly created columns, so a genuinely missing
# feature value (e.g. a limit_price that could not be parsed) stays NaN instead
# of silently becoming a price of 0. Such rows cannot be scored meaningfully,
# so they are removed, and y_test is kept on the same rows.
complete_test_rows = X_test.notna().all(axis=1)
X_test = X_test.loc[complete_test_rows]
y_test = y_test.loc[complete_test_rows]


In [None]:
# Missing values per training feature column
X_train.isna().sum()

In [None]:
# Number of missing training targets
y_train.isna().sum()

In [58]:
# Keep only the training rows where every feature AND the target are present.
# Both objects are filtered with one shared mask so they stay row-aligned;
# calling dropna() on each independently can remove different rows from each
# and leave X_train and y_train with mismatched lengths.
complete_train_rows = X_train.notna().all(axis=1) & y_train.notna()
X_train = X_train.loc[complete_train_rows]
y_train = y_train.loc[complete_train_rows]

In [59]:
# Check for remaining missing targets. The count is printed because a bare
# expression is only rendered when it is the last statement of a cell.
print("Missing values in y_train:", y_train.isnull().sum())

# Save X_train to disk as a pickle file
import pickle
with open('X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

In [None]:
# Fit a 100-tree random forest regressor (seeded for repeatable results)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Build a one-row template holding a neutral default for every model feature:
# False for the one-hot 'house_features_*' columns, 0 for everything else.
feature_row = {
    col: False if col.startswith("house_features_") else 0
    for col in X_train.columns
}

# Values supplied for this prediction
user_input = {

    "limit_price": 150000,
    # Activate specific house features based on input
    # NOTE(review): the house_features_* dummies come from a single categorical
    # column, so a training row has at most one of them set — confirm that
    # activating two at once is intended.
    "house_features_('Alleinlage Schönerstädt', 'LOC')": True,
    "house_features_('attraktives Mehrfamilienhaus', 'PER'), ('Paul-Gruner-Straße', 'LOC'), ('Chemnitz', 'LOC')": True,
}

# Apply the values whose key is a real model feature. Keys the model does not
# know are reported instead of being skipped silently, so a mistyped feature
# name cannot go unnoticed.
ignored_keys = []
for key, value in user_input.items():
    if key in feature_row:
        feature_row[key] = value
    else:
        ignored_keys.append(key)
if ignored_keys:
    print(f"Ignored inputs that are not model features: {ignored_keys}")

# One-row frame whose columns follow the training column order
default_input = pd.DataFrame({col: [value] for col, value in feature_row.items()})

# Predict house price using the trained model
predicted_price = rf_model.predict(default_input)

# Display the result
print(f"Predicted House Price: {predicted_price[0]}")

In [None]:
# Predict the sale price for every row of the held-out test set
y_pred = rf_model.predict(X=X_test)
print(y_pred)

In [None]:
# Evaluation metrics on the held-out test set
from sklearn.metrics import mean_squared_error, r2_score

# The metric functions reject NaN targets, so only test rows with a known sale
# price are scored. y_pred holds one prediction per row of y_test, in the same
# order, so the boolean mask can be applied to both positionally.
has_target = y_test.notna().to_numpy()
y_true_scored = y_test.to_numpy()[has_target]
y_pred_scored = np.asarray(y_pred)[has_target]
if not has_target.all():
    print(f"Skipped {int((~has_target).sum())} test rows without a known sale price.")

print("Mean Squared Error:", mean_squared_error(y_true_scored, y_pred_scored))
print("R-squared:", r2_score(y_true_scored, y_pred_scored))

In [64]:
# Save the model. The context manager closes the file handle even if pickling
# fails; a bare open(...) passed straight to pickle.dump is never closed
# explicitly.
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)