In [1]:
# General
import geopandas as gpd
import pandas as pd
import numpy as np

In [2]:
change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5}

## Read csvs
print("Reading data...")
train_df = gpd.read_file('train.geojson')
test_df = gpd.read_file('test.geojson')
print("Data read successfully")

Reading data...
Data read successfully


In [3]:
# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
# FEATURE ENGINEERING
print("Feature Engineering...")

# 1. Geometry-based features
train_df = train_df[train_df.geometry.notna() & ~train_df.geometry.is_empty]
test_df = test_df[test_df.geometry.notna() & ~test_df.geometry.is_empty]

train_df["geometry"] = train_df["geometry"].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
test_df["geometry"] = test_df["geometry"].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)

train_df = train_df.to_crs(epsg=3857)
test_df = test_df.to_crs(epsg=3857)

train_df = train_df[train_df.geometry.notna()]
test_df = test_df[test_df.geometry.notna()]

train_df['area'] = train_df.geometry.area
train_df['perimeter'] = train_df.geometry.length
train_df['compactness'] = train_df['area'] / (train_df['perimeter']**2 + 1e-6)

test_df['area'] = test_df.geometry.area
test_df['perimeter'] = test_df.geometry.length
test_df['compactness'] = test_df['area'] / (test_df['perimeter']**2 + 1e-6)

train_df[['area', 'perimeter', 'compactness']] = train_df[['area', 'perimeter', 'compactness']].fillna(0)
test_df[['area', 'perimeter', 'compactness']] = test_df[['area', 'perimeter', 'compactness']].fillna(0)


Feature Engineering...


Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  train_df = train_df[train_df.geometry.notna() & ~train_df.geometry.is_empty]
Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  train_df = train_df[train_df.geometry.notna()]
Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  test_df = test_df[test_df.geometry.notna()]


In [6]:
# 2. Date-based features
date_cols = ['date0', 'date1', 'date2', 'date3', 'date4']
if all(col in train_df.columns for col in date_cols):
    for col in date_cols:
        train_df[col] = pd.to_datetime(train_df[col], format="%d-%m-%Y", errors='coerce')
        test_df[col] = pd.to_datetime(test_df[col], format="%d-%m-%Y", errors='coerce')

    for i in range(len(date_cols) - 1):
        diff_col = f'days_diff_{i}_{i+1}'
        train_df[diff_col] = (train_df[date_cols[i+1]] - train_df[date_cols[i]]).dt.days
        test_df[diff_col] = (test_df[date_cols[i+1]] - test_df[date_cols[i]]).dt.days

    print("Date-based features successfully processed.")
else:
    print("Some date columns are missing, skipping date-based features.")

Date-based features successfully processed.


In [7]:
# 3. Categorical features
categorical_features = []
if 'urban_type' in train_df.columns:
    categorical_features.append('urban_type')
if 'geography_type' in train_df.columns:
    categorical_features.append('geography_type')

if categorical_features:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    train_cat = encoder.fit_transform(train_df[categorical_features])
    test_cat = encoder.transform(test_df[categorical_features])
else:
    train_cat = np.empty((len(train_df), 0))
    test_cat = np.empty((len(test_df), 0))

In [8]:
# 4. Combine numerical features
num_features = ['area', 'perimeter', 'compactness']
date_diff_cols = [col for col in train_df.columns if col.startswith('days_diff_')]
num_features.extend(date_diff_cols)

imputer = SimpleImputer(strategy='median')
train_num = imputer.fit_transform(train_df[num_features])
test_num = imputer.transform(test_df[num_features])

train_x = np.hstack([train_num, train_cat])
test_x = np.hstack([test_num, test_cat])

train_y = train_df['change_type'].apply(lambda x: change_type_map[x]).values

print("Feature engineering completed. Feature shapes:")
print("Train features:", train_x.shape)
print("Test features:", test_x.shape)

Feature engineering completed. Feature shapes:
Train features: (296145, 204)
Test features: (120526, 204)


In [9]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# DIMENSIONALITY REDUCTION
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

pca = PCA(n_components=0.95)
train_x_pca = pca.fit_transform(train_x_scaled)
test_x_pca = pca.transform(test_x_scaled)

print(f"Reduction from {train_x_scaled.shape[1]} to {train_x_pca.shape[1]} dimensions due to PCA")

# Separate training data
X_train, X_val, y_train, y_val = train_test_split(train_x_pca, train_y, test_size=0.2, random_state=42, stratify=train_y)

Reduction from 204 to 188 dimensions due to PCA


In [10]:
# Importing models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Dictionary to store model results
model_results = {}
best_model = None
best_score = 0


In [11]:
# RANDOM FOREST
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring="f1_macro", n_jobs=-1)
rf_mean_score = np.mean(rf_scores)

print(f"RandomForest - Mean F1 Score: {rf_mean_score:.4f}")
model_results["RandomForest"] = rf_mean_score

if rf_mean_score > best_score:
    best_score = rf_mean_score
    best_model = rf_model

RandomForest - Mean F1 Score: 0.3742


In [12]:
# SVM
svm_model = SVC(kernel="rbf", probability=True)
svm_scores = cross_val_score(svm_model, X_train, y_train, cv=cv, scoring="f1_macro", n_jobs=-1)
svm_mean_score = np.mean(svm_scores)

print(f"SVM - Mean F1 Score: {svm_mean_score:.4f}")
model_results["SVM"] = svm_mean_score

if svm_mean_score > best_score:
    best_score = svm_mean_score
    best_model = svm_model

KeyboardInterrupt: 

In [None]:
# LOGISTIC REGRESSION
lr_model = LogisticRegression(max_iter=500, n_jobs=-1)
lr_scores = cross_val_score(lr_model, X_train, y_train, cv=cv, scoring="f1_macro", n_jobs=-1)
lr_mean_score = np.mean(lr_scores)

print(f"LogisticRegression - Mean F1 Score: {lr_mean_score:.4f}")
model_results["LogisticRegression"] = lr_mean_score

if lr_mean_score > best_score:
    best_score = lr_mean_score
    best_model = lr_model

LogisticRegression - Mean F1 Score: 0.5218


In [13]:
# XGBOOST
xgb_model = XGBClassifier(eval_metric="mlogloss", n_jobs=-1)
xgb_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring="f1_macro", n_jobs=-1)
xgb_mean_score = np.mean(xgb_scores)

print(f"XGBoost - Mean F1 Score: {xgb_mean_score:.4f}")
model_results["XGBoost"] = xgb_mean_score

if xgb_mean_score > best_score:
    best_score = xgb_mean_score
    best_model = xgb_model


XGBoost - Mean F1 Score: 0.3720


In [14]:
# Train best model
print(f"Best model: {best_model.__class__.__name__} with F1-score: {best_score:.4f}")
best_model.fit(X_train, y_train)
pred_y = best_model.predict(test_x_pca)

# Save predictions
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("submission.csv", index=True, index_label='Id')

print("Predictions saved successfully!")


Best model: RandomForestClassifier with F1-score: 0.3742
Predictions saved successfully!
