
# Part B — Menifot Clean Tracks (Final Submission, Updated)

**Author:** Ashim Sharma  
**Goal:**  
Create a robust, label-free GPS spoofing (menifot) cleaning algorithm that:
- Works on any dataset with time, lat, lon columns.  
- Removes unrealistic motion points (spoofed).  
- Optionally evaluates accuracy if ground-truth labels exist (even as YES/NO).


## 1. Imports & Helper Functions

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import radians, sin, cos, asin, sqrt

# Lift pandas' column limit so previews show every column instead of eliding some with "...".
pd.set_option("display.max_columns", None)

def haversine_km(lat1, lon1, lat2, lon2):
    """Compute great-circle distance between two points in kilometers.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point in decimal degrees.
        lon1: Longitude of the first point in decimal degrees.
        lat2: Latitude of the second point in decimal degrees.
        lon2: Longitude of the second point in decimal degrees.

    Returns:
        The distance in kilometers as a float. NaN inputs yield NaN.

    Raises:
        ValueError, TypeError: If an argument cannot be converted by ``float()``.
    """
    lat1, lon1, lat2, lon2 = (
        radians(float(v)) for v in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a domain
    # error. Plain comparisons are used (not min/max) so NaN stays NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    return 6371 * 2 * asin(sqrt(a))


## 2. Load Datasets

In [None]:

def load_menifot(path):
    """Load a track CSV into a DataFrame and print a short preview.

    Prints the shape, the column names and the first three rows.

    Args:
        path: File path or buffer, passed straight to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, unmodified.
    """
    df = pd.read_csv(path)
    print(f"\nLoaded {path} with shape {df.shape}")
    print("Columns:", list(df.columns))

    # `display` only exists inside IPython/Jupyter; fall back to `print` so
    # the loader also works when this code is run as a plain script.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print
    show(df.head(3))
    return df

# Load the two input tracks; the relative paths are resolved against the
# current working directory.
menifot1 = load_menifot("menifot1.csv")
menifot2 = load_menifot("menifot2.csv")


## 3. Detect Timestamp Column

In [None]:

def detect_ts_col(df):
    """Return the name of the column that holds the timestamps.

    Resolution order:
      1. Exact match against a list of well-known column names.
      2. Case-insensitive match against the same names.
      3. The first column that already has a datetime dtype, or the first
         non-numeric column that parses as datetimes, with more than 80%
         non-null values.

    Args:
        df: DataFrame to inspect.

    Returns:
        The label of the detected timestamp column.

    Raises:
        KeyError: If no column qualifies.
    """
    candidates = ["timestamp", "time", "Time", "datetime", "Datetime", "DateTime", "date_time"]
    for c in candidates:
        if c in df.columns:
            return c

    # Same names, ignoring case and surrounding whitespace (e.g. "TIMESTAMP").
    known = {c.lower() for c in candidates}
    for c in df.columns:
        if isinstance(c, str) and c.strip().lower() in known:
            return c

    for c in df.columns:
        col = df[c]
        if pd.api.types.is_datetime64_any_dtype(col):
            parsed = col
        elif pd.api.types.is_numeric_dtype(col):
            # pd.to_datetime accepts plain numbers (as nanoseconds since the
            # epoch), so an id or latitude column would otherwise be mistaken
            # for the timestamp column.
            continue
        else:
            try:
                parsed = pd.to_datetime(col, errors="raise")
            except (ValueError, TypeError, OverflowError):
                continue
        if parsed.notna().mean() > 0.8:
            return c

    raise KeyError(f"No suitable timestamp column found in {list(df.columns)}")

# Sanity check: show which column was picked as the timestamp in each dataset.
print("menifot1 timestamp column:", detect_ts_col(menifot1))
print("menifot2 timestamp column:", detect_ts_col(menifot2))


## 4. Detect Latitude/Longitude Columns

In [None]:

def detect_lat_lon_cols(df):
    """Find the latitude and longitude column names of ``df``.

    Each coordinate is looked up by exact name in a fixed, ordered list of
    spellings; the first spelling present in ``df.columns`` wins. The
    detected names are also printed.

    Args:
        df: DataFrame to inspect.

    Returns:
        A ``(lat_col, lon_col)`` tuple of column names.

    Raises:
        KeyError: If either coordinate column cannot be found.
    """
    spellings = (
        ("lat", "Lat", "LAT", "latitude", "Latitude"),
        ("lon", "Lon", "LON", "long", "lng", "longitude", "Longitude"),
    )

    detected = []
    for names in spellings:
        hit = None
        for name in names:
            if name in df.columns:
                hit = name
                break
        detected.append(hit)
    lat_col, lon_col = detected

    if any(hit is None for hit in detected):
        raise KeyError(f"Latitude/Longitude columns not found in {list(df.columns)}")

    print(f"Detected lat column: {lat_col}, lon column: {lon_col}")
    return lat_col, lon_col

# Sanity check: the function prints the detected column names itself, so the
# return values are not stored here.
detect_lat_lon_cols(menifot1)
detect_lat_lon_cols(menifot2)


## 5. Clean Track Function (Final Robust Version)

In [None]:

def _haversine_km_vec(lat1, lon1, lat2, lon2):
    """Element-wise great-circle distance in km for array-likes given in degrees.

    NaN in any input yields NaN in the matching output position.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(v, dtype=float)) for v in (lat1, lon1, lat2, lon2)
    )
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Clamp rounding overshoot so sqrt/arcsin never see a value outside [0, 1].
    return 6371 * 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


def clean_track(
    df,
    lat_col=None,
    lon_col=None,
    ts_col=None,
    min_dt_seconds=10,
    min_jump_km=5.0,
    speed_cap_knots_low=18,
    speed_cap_knots_high=45,
    iqr_factor=1.5,
):
    """Flag and remove points that require an implausible jump in AND out.

    The track is sorted by time. For every point the distance and speed of
    the leg from the previous point and of the leg to the next point are
    computed. A point is predicted to be an outlier only when BOTH legs are
    faster than an adaptive speed threshold and longer than ``min_jump_km``.
    The first and last point have only one leg and are therefore never
    flagged.

    Args:
        df: Input track; it is copied, not modified.
        lat_col: Latitude column name; auto-detected when this or
            ``lon_col`` is None.
        lon_col: Longitude column name; auto-detected when this or
            ``lat_col`` is None.
        ts_col: Timestamp column name; auto-detected when None.
        min_dt_seconds: Legs whose duration is not strictly greater than
            this (or not positive) get a NaN speed.
        min_jump_km: Minimum leg length for a leg to count as a jump.
        speed_cap_knots_low: Lower bound of the speed threshold, in knots.
        speed_cap_knots_high: Upper bound of the speed threshold, in knots.
        iqr_factor: Multiplier of the IQR in the ``Q3 + factor * IQR`` fence.

    Returns:
        ``(clean_df, work)``: ``work`` is the time-sorted copy of ``df`` with
        the helper columns (``*_prev``, ``*_next``, ``dt_*_h``, ``dist_*_km``,
        ``speed_*_knots``) and ``is_outlier_pred`` (0/1); ``clean_df`` is a
        copy of the rows of ``work`` with ``is_outlier_pred == 0``.
    """
    if ts_col is None:
        ts_col = detect_ts_col(df)
    if lat_col is None or lon_col is None:
        lat_col, lon_col = detect_lat_lon_cols(df)

    work = df.copy()
    work[ts_col] = pd.to_datetime(work[ts_col])
    # Stable sort: rows sharing a timestamp keep their input order, so the
    # prev/next neighbours are deterministic.
    work = work.sort_values(ts_col, kind="mergesort").reset_index(drop=True)

    work["lat_prev"] = work[lat_col].shift(1)
    work["lon_prev"] = work[lon_col].shift(1)
    work["t_prev"] = work[ts_col].shift(1)
    work["lat_next"] = work[lat_col].shift(-1)
    work["lon_next"] = work[lon_col].shift(-1)
    work["t_next"] = work[ts_col].shift(-1)

    work["dt_prev_h"] = (work[ts_col] - work["t_prev"]).dt.total_seconds() / 3600.0
    work["dt_next_h"] = (work["t_next"] - work[ts_col]).dt.total_seconds() / 3600.0

    # Vectorized instead of a row-wise DataFrame.apply: apply(axis=1) returns
    # an empty DataFrame (not a Series) for an empty track, which breaks the
    # column assignment. The missing neighbour at either end of the track
    # shows up as NaN and propagates to a NaN distance.
    work["dist_prev_km"] = _haversine_km_vec(
        work["lat_prev"], work["lon_prev"], work[lat_col], work[lon_col]
    )
    work["dist_next_km"] = _haversine_km_vec(
        work[lat_col], work[lon_col], work["lat_next"], work["lon_next"]
    )

    min_dt_h = min_dt_seconds / 3600.0

    def safe_speed(dist_km, dt_h):
        # km/h -> knots via 1.852 km/h per knot; too-short or non-positive
        # intervals give NaN instead of an exploding speed.
        return np.where(
            (dt_h > min_dt_h) & (dt_h > 0),
            dist_km / dt_h / 1.852,
            np.nan,
        )

    work["speed_prev_knots"] = safe_speed(work["dist_prev_km"], work["dt_prev_h"])
    work["speed_next_knots"] = safe_speed(work["dist_next_km"], work["dt_next_h"])

    all_speeds = (
        pd.concat([work["speed_prev_knots"], work["speed_next_knots"]])
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    if all_speeds.empty:
        # Nothing to base a threshold on (empty track, single point, or only
        # too-short intervals): keep every point.
        work["is_outlier_pred"] = 0
        return work.copy(), work

    # Adaptive threshold: the largest of the 90th percentile, the upper IQR
    # fence and the low cap, then clamped into [low cap, high cap].
    q1, q3, p90 = np.percentile(all_speeds, [25, 75, 90])
    iqr_upper = q3 + iqr_factor * (q3 - q1)
    raw_thr = max(p90, iqr_upper, speed_cap_knots_low)
    speed_thr = float(np.clip(raw_thr, speed_cap_knots_low, speed_cap_knots_high))

    cond_high_prev = (work["speed_prev_knots"] > speed_thr) & (work["dist_prev_km"] > min_jump_km)
    cond_high_next = (work["speed_next_knots"] > speed_thr) & (work["dist_next_km"] > min_jump_km)
    # Requiring both legs keeps the genuine neighbours of a spoofed point,
    # which each have only one fast leg.
    work["is_outlier_pred"] = (cond_high_prev & cond_high_next).astype(int)

    clean_df = work[work["is_outlier_pred"] == 0].copy()
    return clean_df, work


## 6. Normalize YES/NO Labels for Evaluation

In [None]:

def normalize_is_menifot(series):
    """Normalize a ground-truth label column to 1 (spoofed) / 0 (genuine).

    Values are compared as upper-cased, stripped strings, so ``"yes"``,
    ``" Y "``, ``True`` and ``1`` all map to 1, and their negative
    counterparts to 0. Numeric spellings equal to 0 or 1 (e.g. ``1.0``,
    which is what an integer column turns into once it contains a missing
    value) are accepted as well.

    Args:
        series: Label column with arbitrary dtype.

    Returns:
        A Series aligned with ``series`` holding 1, 0, or NaN for values
        that could not be interpreted.
    """
    mapping = {"1": 1, "YES": 1, "Y": 1, "TRUE": 1, "T": 1, "0": 0, "NO": 0, "N": 0, "FALSE": 0, "F": 0}
    labels = series.astype(str).str.strip().str.upper()
    mapped = labels.map(mapping)

    # Float-encoded labels stringify to "1.0"/"0.0" and miss the table above;
    # recover them numerically, accepting only exact 0 and 1.
    numeric = pd.to_numeric(labels, errors="coerce")
    numeric = numeric.where(numeric.isin([0, 1]))
    return mapped.fillna(numeric)


## 7. Apply & Report

In [None]:

def apply_and_report(df, name, label_col="is_menifot"):
    """Clean one track, print a summary and, if labels exist, an evaluation.

    Args:
        df: Raw track passed to ``clean_track`` with its default settings.
        name: Dataset name used in the printed header.
        label_col: Name of the optional ground-truth column. When present,
            its values are normalized with ``normalize_is_menifot`` and the
            prediction is scored on the rows whose label is interpretable.

    Returns:
        ``(clean_df, full_df)`` exactly as returned by ``clean_track``.
    """
    print(f"\n===== {name} =====")
    clean_df, full_df = clean_track(df)

    total = len(full_df)
    removed = int(full_df["is_outlier_pred"].sum())
    kept = len(clean_df)
    # An empty track has no meaningful percentage; report 0 instead of
    # dividing by zero.
    removed_pct = removed / total * 100 if total else 0.0

    print(f"Total points: {total}")
    print(f"Predicted spoofed removed: {removed} ({removed_pct:.2f}%)")
    print(f"Clean track points kept: {kept}")

    if label_col not in full_df.columns:
        print(f"No '{label_col}' column; skipped evaluation.")
        return clean_df, full_df

    gt_norm = normalize_is_menifot(full_df[label_col])
    pred = full_df["is_outlier_pred"].astype(int)
    # Score only the rows whose label could be mapped to 0/1.
    mask = gt_norm.isin([0, 1])
    if mask.sum() == 0:
        print("\nWarning: Label column not interpretable, skipping eval.")
        return clean_df, full_df

    gt = gt_norm[mask]
    pred_eval = pred[mask]

    tp = int(((pred_eval == 1) & (gt == 1)).sum())
    fp = int(((pred_eval == 1) & (gt == 0)).sum())
    fn = int(((pred_eval == 0) & (gt == 1)).sum())
    tn = int(((pred_eval == 0) & (gt == 0)).sum())
    acc = (tp + tn) / len(gt)
    print("\nEvaluation vs label:")
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}, Accuracy={acc:.3f}")

    return clean_df, full_df

# Clean both tracks and print their reports; keep the cleaned frame and the
# fully annotated frame of each dataset.
clean_meni1, meni1_full = apply_and_report(menifot1, "menifot1")
clean_meni2, meni2_full = apply_and_report(menifot2, "menifot2")


## 8. Optional Visualization

In [None]:

def plot_tracks(full_df, clean_df, lat_col=None, lon_col=None, title="Track Cleaning"):
    """Scatter-plot the original track with the cleaned track drawn on top.

    Args:
        full_df: Track holding every point (drawn faint).
        clean_df: Track holding the kept points (drawn as "x" markers).
        lat_col: Latitude column; auto-detected from ``full_df`` when this or
            ``lon_col`` is None.
        lon_col: Longitude column; auto-detected from ``full_df`` when this
            or ``lat_col`` is None.
        title: Figure title.
    """
    if lat_col is None or lon_col is None:
        lat_col, lon_col = detect_lat_lon_cols(full_df)

    _, ax = plt.subplots(figsize=(6, 6))

    # Drawn in this order so the cleaned points sit above the originals.
    layers = (
        (full_df, {"s": 6, "alpha": 0.25, "label": "Original"}),
        (clean_df, {"s": 10, "alpha": 0.9, "marker": "x", "label": "Cleaned"}),
    )
    for frame, style in layers:
        ax.scatter(frame[lon_col], frame[lat_col], **style)

    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    plt.show()
