In [1]:
%pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Collecting tqdm (from opendatasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting kaggle (from opendatasets)
  Downloading kaggle-1.7.4.2-py3-none-any.whl.metadata (16 kB)
Collecting bleach (from kaggle->opendatasets)
  Using cached bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting protobuf (from kaggle->opendatasets)
  Using cached protobuf-6.30.2-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle->opendatasets)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting setuptools>=21.0.0 (from kaggle->opendatasets)
  Downloading setuptools-79.0.1-py3-none-any.whl.metadata (6.5 kB)
Collecting text-unidecode (from kaggle->opendatasets)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting webencodings (from kaggle->opendatasets)
  Using cached webencodings-0.5.1-py2.py3-non



In [4]:
%pip install --user kaggle



Note: you may need to restart the kernel to use updated packages.


In [3]:
import dask.dataframe as dd
import pandas as pd
# Step 1 - open the Europe-wide tower CSV lazily, reading just four columns;
#          the two text columns are read as categoricals.
ddf = dd.read_csv(
    "Europe towers.csv",
    usecols=["LON", "LAT", "Country", "Continent"],
    dtype={"Country": "category", "Continent": "category"},
)

# Step 2 - restrict to Slovenian towers (nothing is computed yet)
ddf_sl = ddf[ddf["Country"] == "Slovenia"]

# Step 3 - materialise the subset as snappy-compressed Parquet (index not
#          written) so later runs can reload it without touching the CSV
ddf_sl.to_parquet(
    "slovenia_towers.parquet",
    engine="pyarrow",
    compression="snappy",
    write_index=False,
)

# Step 4 - read the freshly written subset back as a plain pandas frame
df = pd.read_parquet("slovenia_towers.parquet")

In [4]:
# Cell 1: Imports & Load Data
# ---------------------------
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from sklearn.cluster import DBSCAN

# 1a. Ping data for one device is expected in `df_pings`, with columns
#     lat, lon and datetime (datetime64[ns]). One way to obtain it:
#         df_pings = pd.read_parquet("df_user_all.parquet")

# 1b. Slovenia cell-tower locations
#     (the Parquet file holds LON, LAT, Country, Continent)
towers_path = "slovenia_towers.parquet"
df_towers = pd.read_parquet(towers_path)

# Show the first five towers as a quick sanity check
print(df_towers.head(5))


         LON        LAT   Country Continent
0  15.650914  46.655765  Slovenia    Europe
1  15.724667  46.582526  Slovenia    Europe
2  15.652736  46.642071  Slovenia    Europe
3  15.650014  46.656214  Slovenia    Europe
4  15.660907  46.619164  Slovenia    Europe


In [6]:
# Cell A: Find the device with the most pings in a small head sample
from fastparquet import ParquetFile
import pandas as pd

pf      = ParquetFile("training_set/20230327.parquet")
# read only deviceid for the first 100k rows
df_head = pf.head(nrows=100_000, columns=["deviceid"])
# pick the most frequent device in that sample
top_dev = df_head["deviceid"].value_counts().idxmax()
print(f"Top device in sample: {top_dev}")

# Cell B: Load every ping for that top device from the full day (≈130 M rows)
import dask.dataframe as dd

# only load the columns we care about, and push the filter into the Parquet reader
ddf = dd.read_parquet(
    "training_set/20230327.parquet",
    columns=["deviceid","date","time","lat","lon"],
    filters=[("deviceid", "=", top_dev)]
)

# The `filters=` argument is guaranteed to prune row groups, but whether it
# also filters individual rows depends on the dask version / engine. Apply the
# predicate explicitly so df_pings can never contain another device's rows.
ddf = ddf[ddf["deviceid"] == top_dev]

# now compute into pandas — this will only pull in that one device’s rows.
# The per-partition indexes may repeat once concatenated, so replace them with
# a unique 0..n-1 index to keep later label-based operations unambiguous.
df_pings = ddf.compute().reset_index(drop=True)
print(f"Loaded {len(df_pings)} pings for device {top_dev}")

# Peek
df_pings.head()

Top device in sample: 136b30f450ef55cf22afcc5e6213c5f7f664766324c1057fa6edef424409b6a4
Loaded 3153 pings for device 136b30f450ef55cf22afcc5e6213c5f7f664766324c1057fa6edef424409b6a4


Unnamed: 0,deviceid,date,time,lat,lon
0,136b30f450ef55cf22afcc5e6213c5f7f664766324c105...,27.03.2023,20:00:04,46.05167,14.50667
1,136b30f450ef55cf22afcc5e6213c5f7f664766324c105...,27.03.2023,20:00:07,46.05167,14.50667
2,136b30f450ef55cf22afcc5e6213c5f7f664766324c105...,27.03.2023,20:00:19,46.05167,14.50667
3,136b30f450ef55cf22afcc5e6213c5f7f664766324c105...,27.03.2023,20:00:22,46.05167,14.50667
4,136b30f450ef55cf22afcc5e6213c5f7f664766324c105...,27.03.2023,20:00:29,46.05167,14.50667


In [8]:
# Cell 2: Build k-d Tree & Compute Nearest-Tower Distances
# --------------------------------------------------------
# A k-d tree measures straight-line (Euclidean) distance. Feeding it raw
# lat/lon would return distances in *degrees* (and distort east-west vs
# north-south), so the points are first mapped onto the unit sphere; the
# resulting chord lengths are then converted to great-circle metres.
EARTH_RADIUS_M = 6_371_000  # mean Earth radius in metres


def _latlon_to_unit_xyz(lat_deg, lon_deg):
    """Map latitude/longitude given in degrees to 3-D points on the unit sphere.

    Returns an (n, 3) float array of x, y, z coordinates.
    """
    lat = np.radians(np.asarray(lat_deg, dtype=float))
    lon = np.radians(np.asarray(lon_deg, dtype=float))
    cos_lat = np.cos(lat)
    return np.column_stack([cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat)])


# 2a. Build spatial index on towers
tower_coords = np.vstack([df_towers["LAT"], df_towers["LON"]]).T   # (lat, lon) in degrees
tower_xyz    = _latlon_to_unit_xyz(df_towers["LAT"], df_towers["LON"])
tower_tree   = cKDTree(tower_xyz)

# 2b. Query each ping’s 3 nearest towers (d1 ≤ d2 ≤ d3)
ping_coords = np.vstack([df_pings["lat"], df_pings["lon"]]).T      # (lat, lon) in degrees
ping_xyz    = _latlon_to_unit_xyz(df_pings["lat"], df_pings["lon"])
chords, idxs = tower_tree.query(ping_xyz, k=3)

# Chord length c on the unit sphere subtends the angle 2*arcsin(c/2); times the
# Earth radius gives metres. The conversion is monotonic, so neighbour order is
# unchanged. Missing neighbours (fewer than 3 towers) are reported by the tree
# as inf and are kept as inf.
dists = np.where(
    np.isinf(chords),
    np.inf,
    2 * EARTH_RADIUS_M * np.arcsin(np.clip(chords / 2, 0.0, 1.0)),
)

# 2c. Attach distances (metres) & nearest-tower index as features
df_pings["d1_m"]    = dists[:,0]
df_pings["d2_m"]    = dists[:,1]
df_pings["d3_m"]    = dists[:,2]
df_pings["zone_id"] = idxs[:,0]    # Voronoi cell = nearest tower


In [9]:
# Cell 3: Outlier Removal (d1 > 3 km)
# -----------------------------------
# Keep a ping only when its nearest-tower distance is at most 3 000;
# everything else is treated as an outlier and discarded.
mask_valid = df_pings["d1_m"].le(3000)
df_clean   = df_pings[mask_valid].copy()

n_dropped = len(df_pings) - len(df_clean)
n_kept    = len(df_clean)
print(f"Dropped {n_dropped} pings (>3km); {n_kept} remain")


Dropped 0 pings (>3km); 3153 remain


In [None]:
# Cell 4: Zone-Aware Clustering
# -----------------------------
# For each tower-zone, run DBSCAN with a zone-specific eps
# (you can pick eps per zone based on local density; here we use 100 m)
# 0. Make sure df_clean has a clean integer index, so that group indexes can
#    be used directly as positions into the label array below
df_clean = df_clean.reset_index(drop=True)

# 1. Prepare an output array of “unassigned” labels (-1 = noise / no stop)
stop_labels = np.full(len(df_clean), -1, dtype=int)

# 2. Zone-aware DBSCAN
# The haversine metric works on (lat, lon) in radians and returns an angle in
# radians on the unit sphere, so a radius in metres must be divided by the
# Earth radius: 100 m ≈ 1.57e-5 rad. (0.0009 would be ≈100 m in *degrees*, but
# as radians it is ≈5.7 km.)
EARTH_RADIUS_M = 6_371_000
eps_zone = 100 / EARTH_RADIUS_M

# DBSCAN numbers its clusters from 0 on every call; offset each zone's labels
# so that a stop_cluster id is unique across zones instead of colliding.
next_label = 0
for zone_id, grp in df_clean.groupby("zone_id"):
    # lat/lon in radians
    coords_rad = np.radians(grp[["lat", "lon"]].to_numpy())
    db = DBSCAN(eps=eps_zone, min_samples=5, metric="haversine")
    labels = db.fit_predict(coords_rad)

    # shift real clusters into a globally unique range; noise stays -1
    in_cluster = labels >= 0
    if in_cluster.any():
        labels = np.where(in_cluster, labels + next_label, -1)
        next_label = int(labels.max()) + 1

    # assign back by integer position (index was reset above)
    stop_labels[grp.index.to_numpy()] = labels

# 3. Add to DataFrame
df_clean["stop_cluster"] = stop_labels


ValueError: cannot reindex on an axis with duplicate labels

In [None]:
# Cell 5: Zone-Level Speed Binning
# -------------------------------
# The pings are loaded with separate `date` and `time` columns, so build the
# `datetime` column here when it is not already present.
# Format follows the values shown in the ping preview ("27.03.2023",
# "20:00:04"); parsing raises if the data deviates from it.
if "datetime" not in df_clean.columns:
    df_clean["datetime"] = pd.to_datetime(
        df_clean["date"].astype(str) + " " + df_clean["time"].astype(str),
        format="%d.%m.%Y %H:%M:%S",
    )

# Compute instantaneous speeds between consecutive pings (time order)
df_clean = df_clean.sort_values("datetime", kind="stable")
delta_t = df_clean["datetime"].diff().dt.total_seconds().fillna(0)
# Store the gap as a column: the per-zone aggregation below sums it, and
# groupby().agg() can only reference columns of the frame.
df_clean["delta_t"] = delta_t

# haversine distance (metres) from the previous ping
lat_r = np.radians(df_clean["lat"]); lon_r = np.radians(df_clean["lon"])
dlat  = lat_r.diff().fillna(0); dlon = lon_r.diff().fillna(0)
a     = np.sin(dlat/2)**2 + np.cos(lat_r.shift())*np.cos(lat_r)*np.sin(dlon/2)**2
a     = a.clip(lower=0, upper=1)   # guard sqrt(1-a) against floating-point overshoot
dist  = 2*6371000*np.arctan2(np.sqrt(a), np.sqrt(1-a))

# First ping (no predecessor) and zero-second gaps yield NaN/inf; report 0 m/s
speeds = (dist / delta_t).replace([np.inf, -np.inf], np.nan).fillna(0)
df_clean["speed_m_s"] = speeds

# Aggregate per zone: min/max/avg speed and dwell time
# (dwell_time = summed seconds-since-previous-ping of the pings in the zone)
zone_stats = (
    df_clean
      .groupby("zone_id")
      .agg(
        min_speed=("speed_m_s","min"),
        max_speed=("speed_m_s","max"),
        avg_speed=("speed_m_s","mean"),
        dwell_time=("delta_t","sum")
      )
)
zone_stats.head()
