# Airbnb Berlin – Preisanalyse
Dieses Notebook analysiert die Airbnb-Angebote in Berlin und erstellt ein Vorhersagemodell für die Preise.
Exploratory Data Analysis (EDA):

In [None]:
# (If needed) point Jupyter at YOUR project directory:
import os

# Replace the placeholder with the path to your project root. If the path does
# not exist (e.g. the placeholder was left untouched), the current working
# directory is kept instead of aborting the notebook with FileNotFoundError.
PROJECT_DIR = "Path/to/your/project directory"

if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print(f"Verzeichnis {PROJECT_DIR!r} nicht gefunden - Arbeitsverzeichnis bleibt unverändert.")

# Check the working directory (alternatively run `pwd` in a cell):
print("Arbeitsverzeichnis gesetzt auf:", os.getcwd())

In [None]:
# Import packages:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_cleaning import clean_airbnb_data
from src.modeling import train_models

In [None]:
# Read the raw listings CSV (path is relative to the working directory):
listings_path = 'data/listings.csv'
df = pd.read_csv(listings_path)

# Report the table dimensions as (rows, columns):
print(df.shape)

# Preview the first rows:
df.head()

In [None]:
# Run the project's cleaning routine on the raw listings:
df_clean = df.pipe(clean_airbnb_data)

# Dimensions after cleaning, followed by a preview of the first rows:
print(df_clean.shape)
df_clean.head()

In [None]:
# First visualization: histogram of the listing prices.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(
    df_clean['price'],
    bins=50,   # divide the range of the data into 50 equal intervals
    kde=True,  # overlay a smooth density curve
    ax=ax,
)
ax.set_title('Verteilung der Preise')
plt.show()

In [None]:
# Feature engineering: select the predictors plus the target column and
# discard every row that has a missing value in any of them.
features = [
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
]
model_columns = [*features, 'price']
df_model = df_clean[model_columns].dropna()

# One-hot encoding turns the categorical "room_type" column into indicator
# columns; drop_first=True leaves out the first category to avoid redundancy.
df_model = pd.get_dummies(df_model, columns=['room_type'], drop_first=True)

# Separate the target (y) from the feature matrix (X):
y = df_model['price']
X = df_model.drop(columns='price')

# Preview the table that goes into modelling:
df_model.head()


In [None]:
# Train the models ("LinearRegression", "RandomForest", "GradientBoosting" and
# "XGBoost") and print one report section per entry of the returned mapping.
results = train_models(X, y)
for model_name, report in results.items():
    print(f"\n{model_name}:")
    for label, entry in report.items():
        # Plain int/float entries are rounded to two decimals; everything else
        # is printed with its default string form.
        shown = f"{entry:.2f}" if isinstance(entry, (int, float)) else f"{entry}"
        print(f"  {label}: {shown}")