In [1]:
import json
from shapely.geometry import Point, Polygon

def clean_vgg_from_duplicates(vgg_polygons_path = "/home/adelb/Downloads/test_color (6).json", out_vgg_path = "/home/adelb/Downloads/test_color_clean.json"):

    with open(vgg_polygons_path, "r") as f:
        vgg_polygons = json.load(f)
    # -------------------------
    # HELPER: POLYGON IoU
    # -------------------------
    def poly_in_poly(poly1, poly2):
        if not poly1.is_valid:
            poly1 = poly1.buffer(0)
        if not poly2.is_valid:
            poly2 = poly2.buffer(0)

        inter = poly1.intersection(poly2).area
        p1a = poly1.area 
        p2a = poly2.area 
        
        p1_in_p2 = inter/p1a if p1a > 0 else 0
        p2_in_p1 = inter/p2a if p2a > 0 else 0
        
        
        return max(p1_in_p2, p2_in_p1)
        
    pip_thresh = 0.6

    def polygon_iou(poly1, poly2):
        # fix invalids
        if not poly1.is_valid:
            poly1 = poly1.buffer(0)
        if not poly2.is_valid:
            poly2 = poly2.buffer(0)

        inter = poly1.intersection(poly2).area
        union = poly1.union(poly2).area
        return inter / union if union > 0 else 0
    iou_thresh = 0.6
    # -------------------------
    # STEP 1 — DEDUPLICATE POLYGONS
    # -------------------------
    for img_key, img_data in vgg_polygons.items():
        regions = img_data.get("regions", [])

        polys = []
        for idx, region in enumerate(regions.values()):
            shape = region["shape_attributes"]
            if shape["name"] != "polygon":
                continue
            xs = shape["all_points_x"]
            ys = shape["all_points_y"]
            
            coords = list(zip(xs, ys))
            if len(coords) >= 3:  # need at least 3 distinct points
                poly = Polygon(coords)
            else:
                continue
            conf = float(region["region_attributes"].get("confidence", 0))
            label = region["region_attributes"].get("label", '')

            # Store separate dict, no shapely object in final structure
            polys.append({"poly": poly, "conf": conf, "label": label, "region": dict(region)})

        keep = {}
        used = set()
        k = 0
        for i in range(len(polys)):
            if i in used:
                continue
            best = polys[i]
            for j in range(i + 1, len(polys)):
                if j in used:
                    continue
                iou = polygon_iou(polys[i]["poly"], polys[j]["poly"])
                pip = poly_in_poly(polys[i]["poly"], polys[j]["poly"])
                if iou > iou_thresh or pip > pip_thresh:
                    if polys[j]["conf"] > best['conf']:
                        best = polys[j]
                    used.add(j)
            used.add(i)
            keep[k] = best['region']
            k += 1

        vgg_polygons[img_key]["regions"] = keep
        
        
    with open(out_vgg_path, 'w') as f:
        json.dump(vgg_polygons, f)

In [1]:
import ijson
import json
from shapely.geometry import Polygon

def clean_vgg_from_duplicates_stream(
    vgg_polygons_path="/home/adelb/Downloads/test_color (6).json", 
    out_vgg_path="/home/adelb/Downloads/test_color_clean.json"
):
    # -------------------------
    # HELPER: POLYGON IoU
    # -------------------------
    def poly_in_poly(poly1, poly2):
        if not poly1.is_valid:
            poly1 = poly1.buffer(0)
        if not poly2.is_valid:
            poly2 = poly2.buffer(0)

        inter = poly1.intersection(poly2).area
        p1a = poly1.area 
        p2a = poly2.area 
        
        p1_in_p2 = inter / p1a if p1a > 0 else 0
        p2_in_p1 = inter / p2a if p2a > 0 else 0
        
        return max(p1_in_p2, p2_in_p1)

    pip_thresh = 0.6

    def polygon_iou(poly1, poly2):
        if not poly1.is_valid:
            poly1 = poly1.buffer(0)
        if not poly2.is_valid:
            poly2 = poly2.buffer(0)

        inter = poly1.intersection(poly2).area
        union = poly1.union(poly2).area
        return inter / union if union > 0 else 0

    iou_thresh = 0.6

    # -------------------------
    # STREAM PARSE & PROCESS
    # -------------------------
    with open(vgg_polygons_path, "rb") as f_in, open(out_vgg_path, "w") as f_out:
        f_out.write("{")  # start JSON

        parser = ijson.kvitems(f_in, "")  # stream top-level dict (img_key → img_data)
        first = True
        for img_key, img_data in parser:
            regions = img_data.get("regions", {})

            polys = []
            for idx, region in regions.items():
                shape = region["shape_attributes"]
                if shape["name"] != "polygon":
                    continue
                xs = shape["all_points_x"]
                ys = shape["all_points_y"]
                coords = list(zip(xs, ys))
                if len(coords) >= 3:
                    poly = Polygon(coords)
                else:
                    continue

                conf = float(region["region_attributes"].get("confidence", 0))
                label = region["region_attributes"].get("label", '')

                polys.append({
                    "poly": poly,
                    "conf": conf,
                    "label": label,
                    "region": dict(region)
                })

            keep = {}
            used = set()
            k = 0
            for i in range(len(polys)):
                if i in used:
                    continue
                best = polys[i]
                for j in range(i + 1, len(polys)):
                    if j in used:
                        continue
                    iou = polygon_iou(polys[i]["poly"], polys[j]["poly"])
                    pip = poly_in_poly(polys[i]["poly"], polys[j]["poly"])
                    if iou > iou_thresh or pip > pip_thresh:
                        if polys[j]["conf"] > best["conf"]:
                            best = polys[j]
                        used.add(j)
                used.add(i)
                keep[k] = best["region"]
                k += 1

            # overwrite with cleaned regions
            img_data["regions"] = keep

            # stream write JSON (avoid full dict in memory)
            if not first:
                f_out.write(",")
            first = False
            json.dump(img_key, f_out)
            f_out.write(":")
            json.dump(img_data, f_out)

        f_out.write("}")  # close JSON


In [4]:
import json
import os

# Input / output paths
def poly_to_point(input_json , output_json):

    # Load VGG JSON
    with open(input_json, "r") as f:
        data = json.load(f)

    # Iterate through all images
    for image_key, image_data in data.items():
        if "regions" not in image_data:
            continue
        
        new_regions = {}
        for reg_id, region in image_data["regions"].items():
            shape = region["shape_attributes"]
            
            # Only process polygons
            if shape["name"] == "polygon":
                xs = shape["all_points_x"]
                ys = shape["all_points_y"]
                if len(xs) == 0:
                    print(shape)
                    continue
                # Compute centroid
                cx = int(sum(xs) / len(xs))
                cy = int(sum(ys) / len(ys))
                
                # Replace polygon with a point
                new_shape = {
                    "name": "point",
                    "cx": cx,
                    "cy": cy
                }
                
                # Preserve region attributes (labels)
                new_regions[reg_id] = {
                    "shape_attributes": new_shape,
                    "region_attributes": region.get("region_attributes", {})
                }
            else:
                # Keep other shapes unchanged (optional)
                new_regions[reg_id] = region
        
        # Update regions
        data[image_key]["regions"] = new_regions

    # Save updated JSON
    with open(output_json, "w") as f:
        json.dump(data, f, indent=2)

    print(f"✅ Converted {input_json} → {output_json}")


In [5]:
import json
from shapely.geometry import Point, Polygon

# Input files
def merge_polys_points_labels(vgg_polygons_path, vgg_points_2_path, vgg_points_3_path, output_path, order = 'CTS'):

    # Load JSON files
    with open(vgg_polygons_path, "r") as f:
        vgg_polygons = json.load(f)

    with open(vgg_points_2_path, "r") as f:
        vgg_points_2 = json.load(f)

    with open(vgg_points_3_path, "r") as f:
        vgg_points_3 = json.load(f)

    # Helper: extract points from VGG
    def extract_points(vgg_data):
        points = {}
        for img_key, img_data in vgg_data.items():
            pts = []
            for region in img_data['regions'].values():
                shape = region["shape_attributes"]
                if shape["name"] == "point":
                    cx, cy = shape["cx"], shape["cy"]
                    label = region["region_attributes"].get("label", "")
                    conf = float(region["region_attributes"].get("confidence", 1.0))
                    pts.append({"point": Point(cx, cy), "label": label, "conf": conf})
            points[img_key] = pts
        return points

    # Extract all points from files 2 and 3
    points_2 = extract_points(vgg_points_2)
    points_3 = extract_points(vgg_points_3)

    # Process polygons and update labels
    for img_key, img_data in vgg_polygons.items():
        regions = img_data.get("regions", [])
        img_points_2 = points_2.get(img_key, [])
        img_points_3 = points_3.get(img_key, [])

        for region in regions.values():
            shape = region["shape_attributes"]
            if shape["name"] != "polygon":
                continue

            # Original label
            label1 = region["region_attributes"].get("label", "")
            xs = shape["all_points_x"]
            ys = shape["all_points_y"]
            polygon = Polygon(zip(xs, ys))

            labels_to_concat = [label1]

            # ---- VGG2 ----
            pts_inside_2 = [p for p in img_points_2 if polygon.contains(p["point"]) and p["label"]]
            if pts_inside_2:
                best_pt_2 = max(pts_inside_2, key=lambda x: x["conf"])
                if order == "SCT" or order == "TCS":
                    labels_to_concat.insert(0, best_pt_2["label"])
                elif order == 'CTS':
                    labels_to_concat.append(best_pt_2["label"])
                    
            # ---- VGG3 ----
            pts_inside_3 = [p for p in img_points_3 if polygon.contains(p["point"]) and p["label"]]
            if pts_inside_3:
                best_pt_3 = max(pts_inside_3, key=lambda x: x["conf"])
                if order == 'CTS' or order == 'TCS':
                    labels_to_concat.append(best_pt_3["label"])
                elif order == "SCT":
                    labels_to_concat.insert(1, best_pt_3["label"])

            # Final label = unique + sorted
            final_label = "_".join(labels_to_concat)
            region["region_attributes"]["label"] = final_label

    # Save updated VGG JSON
    with open(output_path, "w") as f:
        json.dump(vgg_polygons, f, indent=2)

    print(f"✅ Updated VGG JSON saved to: {output_path}")


In [6]:
from copy import deepcopy


def add_polys_diff_label(cts_path, tcts_path, scts_path, out_path): 
    def get_labels(regions):
        return set(reg['region_attributes']['label'] for reg in regions)

    with open(tcts_path) as f:
        TCTS = json.load(f)
        
    with open(cts_path) as f:
        CTS = json.load(f)
        
    with open(scts_path) as f:
        SCTS = json.load(f)

    NCTS = deepcopy(CTS)
    
    for fn, data in NCTS.items():
        labels = get_labels(data['regions'].values())
        if fn in TCTS.keys():
            lbls = set()
            for reg in TCTS[fn]['regions'].values():
                lbl = reg['region_attributes']['label']
                if lbl not in labels:
                    data['regions'][str(len(data['regions']))] = reg
                    lbls.add(lbl)
            labels.update(lbls)
        if fn in SCTS.keys():
            for reg in SCTS[fn]['regions'].values():
                lbl = reg['region_attributes']['label']
                if lbl not in labels:
                    data['regions'][str(len(data['regions']))] = reg
                    
    with open(out_path, 'w') as f:
        json.dump(NCTS, f)
    
    

In [None]:

raw_path = f'/home/adelb/Downloads/test_color (6).json'
clean_path = f'/home/adelb/Downloads/test_color_clean.json'

clean_vgg_from_duplicates(raw_path, clean_path)

In [5]:
exp = "typo"
raw_path = f'big_VGGs/big_test_{exp}.json'
clean_path = f'big_VGGs/big_test_{exp}_clean.json'

clean_vgg_from_duplicates(raw_path, clean_path)

print(f"cleaning {exp}: done")

exp = "state"
raw_path = f'big_VGGs/big_test_{exp}.json'
clean_path = f'big_VGGs/big_test_{exp}_clean.json'

clean_vgg_from_duplicates(raw_path, clean_path)

print(f"cleaning {exp}: done")

cleaning typo: done
cleaning state: done


In [7]:
exp = "color"

clean_path = f'/home/adelb/Downloads/test_sahi_{exp}_clean.json'
point_path = f'big_VGGs/big_test_{exp}_point.json'

poly_to_point(clean_path, point_path)

print(f"pointing {exp}: done")

exp = "typo"

clean_path = f'/home/adelb/Downloads/test_sahi_{exp}_clean.json'
point_path = f'big_VGGs/big_test_{exp}_point.json'

poly_to_point(clean_path, point_path)

print(f"pointing {exp}: done")

exp = "state"

clean_path = f'/home/adelb/Downloads/test_sahi_{exp}_clean.json'
point_path = f'big_VGGs/big_test_{exp}_point.json'

poly_to_point(clean_path, point_path)

print(f"pointing {exp}: done")

✅ Converted /home/adelb/Downloads/test_sahi_color_clean.json → big_VGGs/big_test_color_point.json
pointing color: done
✅ Converted /home/adelb/Downloads/test_sahi_typo_clean.json → big_VGGs/big_test_typo_point.json
pointing typo: done
✅ Converted /home/adelb/Downloads/test_sahi_state_clean.json → big_VGGs/big_test_state_point.json
pointing state: done


In [8]:
order = "CTS"

poly_path = f'/home/adelb/Downloads/test_sahi_color_clean.json'
points_path_1 = 'big_VGGs/big_test_typo_point.json'
points_path_2 = 'big_VGGs/big_test_state_point.json'

out_path = f'big_VGGs/big_test_{order}_full.json'

merge_polys_points_labels(poly_path, points_path_1, points_path_2, out_path, order)

print(f"Merging {order}: Done")

order = "TCS"

poly_path = '/home/adelb/Downloads/test_sahi_typo_clean.json'
points_path_1 = 'big_VGGs/big_test_color_point.json'
points_path_2 = 'big_VGGs/big_test_state_point.json'

out_path = f'big_VGGs/big_test_{order}_full.json'

merge_polys_points_labels(poly_path, points_path_1, points_path_2, out_path, order)

print(f"Merging {order}: Done")

order = "SCT"

poly_path = '/home/adelb/Downloads/test_sahi_state_clean.json'
points_path_1 = 'big_VGGs/big_test_color_point.json'
points_path_2 = 'big_VGGs/big_test_typo_point.json'

out_path = f'big_VGGs/big_test_{order}_full.json'

merge_polys_points_labels(poly_path, points_path_1, points_path_2, out_path, order)

print(f"Merging {order}: Done")

✅ Updated VGG JSON saved to: big_VGGs/big_test_CTS_full.json
Merging CTS: Done
✅ Updated VGG JSON saved to: big_VGGs/big_test_TCS_full.json
Merging TCS: Done
✅ Updated VGG JSON saved to: big_VGGs/big_test_SCT_full.json
Merging SCT: Done


In [9]:
cts_path = "big_VGGs/big_test_CTS_full.json"
tcts_path = "big_VGGs/big_test_TCS_full.json"
scts_path = "big_VGGs/big_test_SCT_full.json"
ncts_path = "big_VGGs/big_test_NCTS_full.json"

add_polys_diff_label(cts_path, tcts_path, scts_path, ncts_path)
print('Done')

Done


In [1]:
import json
import ijson
from shapely.geometry import Polygon
from shapely.strtree import STRtree

def clean_vgg_from_duplicates_stream(
    vgg_polygons_path="res_VGGs/TEST_New_CTS.json",
    out_vgg_path="res_VGGs/TEST_New_CTS_clean.json",
    iou_thresh=0.6,
    pip_thresh=0.6,
):
    # -------------------------
    # HELPERS
    # -------------------------
    def fix_poly(poly):
        if not poly.is_valid:
            poly = poly.buffer(0)
        return poly

    def polygon_iou(poly1, poly2):
        poly1, poly2 = fix_poly(poly1), fix_poly(poly2)
        inter = poly1.intersection(poly2).area
        union = poly1.union(poly2).area
        return inter / union if union > 0 else 0

    def poly_in_poly(poly1, poly2):
        poly1, poly2 = fix_poly(poly1), fix_poly(poly2)
        inter = poly1.intersection(poly2).area
        p1a, p2a = poly1.area, poly2.area
        p1_in_p2 = inter / p1a if p1a > 0 else 0
        p2_in_p1 = inter / p2a if p2a > 0 else 0
        return max(p1_in_p2, p2_in_p1)

    # -------------------------
    # STREAM INPUT & WRITE OUTPUT
    # -------------------------
    with open(vgg_polygons_path, "r") as f_in, open(out_vgg_path, "w") as f_out:
        f_out.write("{\n")
        first = True

        for img_key, img_data in ijson.kvitems(f_in, "", use_float=True):
            regions = img_data.get("regions", {})

            polys = []
            for region in regions.values():
                shape = region["shape_attributes"]
                if shape["name"] != "polygon":
                    continue

                xs, ys = shape["all_points_x"], shape["all_points_y"]
                coords = list(zip(xs, ys))
                if len(coords) < 3:
                    continue

                poly = Polygon(coords)
                conf = float(region["region_attributes"].get("confidence", 0))
                label = region["region_attributes"].get("label", "")
                polys.append(
                    {"poly": poly, "conf": conf, "label": label, "region": dict(region)}
                )

            # Deduplication using STRtree
            keep = {}
            if polys:
                tree = STRtree([p["poly"] for p in polys])
                used = set()
                k = 0

                for i, p in enumerate(polys):
                    if i in used:
                        continue
                    best = p
                    # query candidates in same spatial vicinity
                    candidates = tree.query(p["poly"])
                    for cand_poly in candidates:
                        j = next(
                            (idx for idx, pp in enumerate(polys) if pp["poly"] == cand_poly),
                            None,
                        )
                        if j is None or j == i or j in used:
                            continue
                        iou = polygon_iou(p["poly"], polys[j]["poly"])
                        pip = poly_in_poly(p["poly"], polys[j]["poly"])
                        if iou > iou_thresh or pip > pip_thresh:
                            if polys[j]["conf"] > best["conf"]:
                                best = polys[j]
                            used.add(j)
                    used.add(i)
                    keep[k] = best["region"]
                    k += 1

            # Write cleaned image entry
            cleaned_entry = {img_key: {"regions": keep}}
            if not first:
                f_out.write(",\n")
            else:
                first = False
            cleaned_entry_dumped = json.dumps(cleaned_entry)
            f_out.write(cleaned_entry_dumped[1:-1])

        f_out.write("\n}")
    print(f"✅ Cleaned VGG saved to {out_vgg_path}")


In [10]:


raw_path = "big_VGGs/big_test_NCTS_full.json"
clean_path = "big_VGGs/big_test_NCTS_full_clean.json"
clean_vgg_from_duplicates_stream(raw_path, clean_path)

✅ Cleaned VGG saved to big_VGGs/big_test_NCTS_full_clean.json


In [12]:
import json, ast

cts_path = "big_VGGs/big_test_CTS_full.json"
tcts_path = "big_VGGs/big_test_TCS_full.json"
scts_path = "big_VGGs/big_test_SCT_full.json"
nctsc_path = "big_VGGs/big_test_NCTS_full_clean.json"

# with open(cts_path) as f:
#     cts = json.load(f)
    
# with open(tcts_path) as f:
#     tcts = json.load(f)
    
# with open(scts_path) as f:
#     scts = json.load(f)
    
with open(nctsc_path) as f:
    content = f.read().replace("'", '"')
    nctsc = json.loads(content)

In [14]:
import os
os.path.abspath('big_VGGs/big_test_NCTS_full_clean.json')

'/home/adelb/Documents/Bpartners/Stanislas/big_VGGs/big_test_NCTS_full_clean.json'

In [15]:
# cts_labels = set(reg['region_attributes']['label'] for file in cts.values() for reg in file['regions'].values())
# tcts_labels = set(reg['region_attributes']['label'] for file in tcts.values() for reg in file['regions'].values())
# scts_labels = set(reg['region_attributes']['label'] for file in scts.values() for reg in file['regions'].values())
ncts_labels = set(reg['region_attributes']['label'] for file in nctsc.values() for reg in file['regions'].values())


In [16]:
{i: lbl for i, lbl in enumerate(sorted(ncts_labels))}

{0: 'beige',
 1: 'beige_boucharde',
 2: 'beige_boucharde_degrade',
 3: 'beige_boucharde_satisfaisant',
 4: 'beige_degrade',
 5: 'beige_flamme',
 6: 'beige_flamme_degrade',
 7: 'beige_flamme_satisfaisant',
 8: 'beige_satisfaisant',
 9: 'beige_spuntato',
 10: 'beige_spuntato_degrade',
 11: 'beige_spuntato_satisfaisant',
 12: 'bleu',
 13: 'bleu_boucharde',
 14: 'bleu_boucharde_degrade',
 15: 'bleu_boucharde_satisfaisant',
 16: 'bleu_degrade',
 17: 'bleu_flamme',
 18: 'bleu_flamme_degrade',
 19: 'bleu_flamme_satisfaisant',
 20: 'bleu_satisfaisant',
 21: 'bleu_spuntato',
 22: 'bleu_spuntato_degrade',
 23: 'bleu_spuntato_satisfaisant',
 24: 'boucharde',
 25: 'boucharde_degrade',
 26: 'boucharde_satisfaisant',
 27: 'degrade',
 28: 'degrade_boucharde',
 29: 'degrade_flamme',
 30: 'degrade_spuntato',
 31: 'flamme',
 32: 'flamme_degrade',
 33: 'flamme_satisfaisant',
 34: 'satisfaisant',
 35: 'satisfaisant_boucharde',
 36: 'satisfaisant_flamme',
 37: 'satisfaisant_spuntato',
 38: 'spuntato',
 39:

In [20]:
len(nctsc)

38008