In [None]:
import geopandas as gpd
import rasterio
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.sklearn import log_model

# Read in the two datasets: neighborhood land prices and the 100 m census grid
neighborhoods = gpd.read_file('Land_Prices_Neighborhood_Köln.gpkg')
grid = gpd.read_file('Zensus_Köln_Grid_100m.gpkg')

# Attach neighborhood attributes to every grid cell that intersects a neighborhood.
# `predicate=` replaces the `op=` keyword, which geopandas deprecated in 0.10 and
# dropped in later releases.
# NOTE: a grid cell that intersects several neighborhoods yields one row per match.
merged_data = gpd.sjoin(grid, neighborhoods, how='left', predicate='intersects')

# Save merged data to file
merged_data.to_file('merged_data.gpkg', driver='GPKG')


# Reload the joined grid/neighborhood table from the GeoPackage
merged_data = gpd.read_file("merged_data.gpkg")

# Sentinel raster: first band plus its georeferencing transform
with rasterio.open("Sentinel_Köln.tif") as src:
    sentinel_data, sentinel_transform = src.read(1), src.transform

# WorldCover raster: first band plus its georeferencing transform.
# `src` stays bound to this (closed) dataset after the block; later code reads it.
with rasterio.open("WorldCover_Köln.tif") as src:
    worldcover_data, worldcover_transform = src.read(1), src.transform

# Population divided by the area of each row's geometry
# (area is expressed in the units of the layer's CRS)
population_count = merged_data["Population"]
merged_data["population_density"] = population_count.div(merged_data.geometry.area)

# Compute amount of green space for each row of merged_data.
#
# The raster must be indexed by pixel row/column, not by map coordinates, and
# every row needs its own value. Each geometry's bounding box (in the raster
# CRS) is therefore converted to a pixel window and the green pixels inside
# that window are counted. The result is a pixel count per bounding box.
#
# NOTE(review): codes 10/20/30 are tree cover, shrubland and grassland in the
# ESA WorldCover legend; adjust if other classes should count as green space.
GREEN_WORLDCOVER_CLASSES = (10, 20, 30)
green_mask = np.isin(worldcover_data, GREEN_WORLDCOVER_CLASSES)
raster_rows, raster_cols = worldcover_data.shape
map_to_pixel = ~worldcover_transform  # inverse affine: (x, y) -> (col, row)

# `src` is the WorldCover dataset handle from the read above; only its CRS is used.
bounds_in_raster_crs = merged_data.geometry.to_crs(src.crs).bounds

green_pixel_counts = []
for minx, miny, maxx, maxy in bounds_in_raster_crs.itertuples(index=False, name=None):
    if np.isnan([minx, miny, maxx, maxy]).any():
        # Missing or empty geometry: no window can be derived.
        green_pixel_counts.append(0)
        continue
    col_a, row_a = map_to_pixel * (minx, maxy)
    col_b, row_b = map_to_pixel * (maxx, miny)
    # Clamp to the raster extent so geometries partly (or fully) outside the
    # raster produce a truncated (or empty) window instead of wrapping around.
    row_start, row_stop = np.clip(
        [np.floor(min(row_a, row_b)), np.ceil(max(row_a, row_b))], 0, raster_rows
    ).astype(int)
    col_start, col_stop = np.clip(
        [np.floor(min(col_a, col_b)), np.ceil(max(col_a, col_b))], 0, raster_cols
    ).astype(int)
    window = green_mask[row_start:row_stop, col_start:col_stop]
    green_pixel_counts.append(int(window.sum()))

merged_data["green_space"] = green_pixel_counts

# Compute number of buildings with special function in the neighborhood
special_functions = ["school", "hospital", "restaurant"]

# Join against a slim copy holding only the neighborhood id and the geometry:
# this avoids column-name clashes with the building layers and leaves out any
# stale "index_right" column from an earlier join, which makes sjoin raise.
neighborhood_cells = merged_data[["Neighborhood_FID", merged_data.geometry.name]]

for sf in special_functions:
    buildings = gpd.read_file(f"OSM_{sf}_Köln.gpkg")
    buildings_in_neighborhoods = gpd.sjoin(
        neighborhood_cells,
        buildings[[buildings.geometry.name]],
        predicate='intersects',  # `op=` is deprecated/removed in geopandas
    )
    # Count distinct buildings per neighborhood so that a building touching
    # several rows of the same neighborhood is only counted once.
    # ("index_right" is the column sjoin uses for the right frame's unnamed index.)
    counts = buildings_in_neighborhoods.groupby("Neighborhood_FID")["index_right"].nunique()
    # The counts are keyed by Neighborhood_FID, not by row label, so map them
    # onto the rows explicitly; neighborhoods without a match get 0.
    merged_data[f"{sf}_buildings"] = (
        merged_data["Neighborhood_FID"].map(counts).fillna(0).astype(int)
    )

# Compute, per neighborhood, the average over its residential buildings of the
# shortest distance to a building with a special function.
#
# The per-residential-building distances cannot be stored directly as a column
# of merged_data (different length, unrelated row order), so they are
# aggregated by neighborhood and mapped back via Neighborhood_FID.
# Distances are in the units of the layers' CRS.
# NOTE(review): assumes all layers share one projected CRS; confirm.
residential_buildings = gpd.read_file("OSM_residential_Köln.gpkg")
residential_geometry = residential_buildings[[residential_buildings.geometry.name]]

# Which neighborhood(s) each residential building belongs to, via the rows of
# merged_data it intersects. One (building, neighborhood) pair is kept per match.
neighborhood_cells = merged_data[["Neighborhood_FID", merged_data.geometry.name]]
building_membership = (
    gpd.sjoin(residential_geometry, neighborhood_cells, predicate="intersects")["Neighborhood_FID"]
    .rename_axis("building_id")
    .reset_index()
    .dropna()
    .drop_duplicates()
)

for sf in special_functions:
    buildings = gpd.read_file(f"OSM_{sf}_Köln.gpkg")
    # Spatial-index nearest-neighbor search instead of a Python loop over every
    # residential building.
    nearest = gpd.sjoin_nearest(
        residential_geometry,
        buildings[[buildings.geometry.name]],
        distance_col="nearest_distance",
    )
    # Ties return several rows per residential building; keep one distance each.
    shortest = nearest.groupby(level=0)["nearest_distance"].min()
    distance_per_building = building_membership["building_id"].map(shortest)
    mean_by_neighborhood = distance_per_building.groupby(
        building_membership["Neighborhood_FID"]
    ).mean()
    # Neighborhoods without any residential building stay NaN.
    merged_data[f"{sf}_distance"] = merged_data["Neighborhood_FID"].map(mean_by_neighborhood)

# Compute total length of walkable paths in the neighborhood.
#
# The paths are clipped to the rows of merged_data with an overlay so that the
# measured geometries are the path segments themselves (a spatial join would
# keep the polygon geometry instead), then the segment lengths are summed per
# neighborhood. Lengths are in the units of the layers' CRS.
# NOTE(review): assumes both layers share one projected CRS; confirm.
walkable_paths = gpd.read_file("OSM_walkable_paths_Köln.gpkg")
neighborhood_cells = merged_data[["Neighborhood_FID", merged_data.geometry.name]]
paths_in_neighborhoods = gpd.overlay(
    walkable_paths[[walkable_paths.geometry.name]],
    neighborhood_cells,
    how="intersection",
    keep_geom_type=True,  # keep line results only, drop point touches
)
paths_in_neighborhoods["segment_length"] = paths_in_neighborhoods.geometry.length
length_by_neighborhood = paths_in_neighborhoods.groupby("Neighborhood_FID")["segment_length"].sum()
# The sums are keyed by Neighborhood_FID, so map them onto the rows explicitly;
# neighborhoods without any path get 0.
merged_data["walkable_path_length"] = (
    merged_data["Neighborhood_FID"].map(length_by_neighborhood).fillna(0.0)
)

# Compute isolation of the senior population: the share of residents aged 65+
# in each row, expressed as the deviation from the mean share.
# Non-positive population values (zero, or negative placeholder codes) are
# masked to NaN first; otherwise the division yields inf or sign-flipped shares
# and a single inf would turn the mean, and with it every row, into inf/NaN.
valid_population = merged_data["Population"].where(merged_data["Population"] > 0)
senior_population = merged_data["Population_65+"] / valid_population
merged_data["isolation_seniors"] = senior_population - senior_population.mean()

# Prepare data for regression
feature_columns = [
    "population_density", "green_space", "school_buildings", "hospital_buildings",
    "restaurant_buildings", "school_distance", "hospital_distance", "restaurant_distance",
    "walkable_path_length", "isolation_seniors",
]
target_column = "Land_Value"

# LinearRegression rejects NaN/inf, so keep only rows where every feature and
# the target are finite.
model_frame = (
    merged_data[feature_columns + [target_column]]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
X = model_frame[feature_columns]
y = model_frame[target_column]

# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only (fitting on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale X data with statistics from the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later code that expects the full scaled feature matrix.
X_scaled = scaler.transform(X)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Log model and parameters with MLflow
with mlflow.start_run():
    mlflow.log_param("model_type", "linear_regression")
    mlflow.log
