In [4]:
# stratified_reservoir_sample.py
import csv, random, sys
from pathlib import Path

# ------------------ CONFIG ------------------
# Source CSV to sample from and destination CSV for the sampled subset.
infile = "/Users/manmeetkaur/Downloads/msr_data_cleaned.csv"
outfile = "/Users/manmeetkaur/Downloads/msrstratified_sample_5000.csv"

# FORCE exact class distribution: rows to emit per value of the label column.
TARGET_CLASS_COUNTS = {
    "0": 2600,   # non-vulnerable
    "1": 2400,   # vulnerable
}

# Total sample size is derived from the per-class targets so the two numbers
# cannot drift apart when the targets are edited.
sample_size = sum(TARGET_CLASS_COUNTS.values())

# Name of the header column that holds the class label.
class_col_name = "vul"
# --------------------------------------------

# Raise the csv module's per-field size limit as far as it will go.
# Start from sys.maxsize; whenever csv rejects the value with OverflowError,
# shrink it tenfold and retry. Give up only if the candidate reaches zero.
max_int = sys.maxsize
limit_applied = False
while not limit_applied:
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int //= 10
        if max_int <= 0:
            raise RuntimeError("Could not set csv.field_size_limit")
    else:
        limit_applied = True

print("=== Stratified Sampling with Forced Class Balance ===")

# ---------- 1) Count class distribution ----------
# First pass over the input: find the label column in the header and tally
# how many rows carry each label value. Every record is also checked for
# having the label column, so a malformed file fails here with a clear
# message instead of a bare IndexError/StopIteration.
counts = {}
total_rows = 0

with open(infile, "r", encoding="latin1", errors="replace", newline='') as fin:
    reader = csv.reader(fin)

    # An empty file has no header row at all.
    header = next(reader, None)
    if header is None:
        raise RuntimeError(f"Input CSV '{infile}' is empty (no header row)")

    try:
        class_idx = header.index(class_col_name)
    except ValueError:
        raise RuntimeError(f"Column '{class_col_name}' not found in CSV") from None

    for row in reader:
        # csv.reader yields [] for blank lines and short lists for truncated
        # records; neither can be indexed at class_idx.
        if len(row) <= class_idx:
            raise RuntimeError(
                f"Record ending at line {reader.line_num} has {len(row)} field(s); "
                f"need at least {class_idx + 1} to read column '{class_col_name}'"
            )
        total_rows += 1
        lab = row[class_idx].strip()
        counts[lab] = counts.get(lab, 0) + 1

print("Original dataset size:", total_rows)
print("Original class distribution:", counts)

# Force labels
# Label values are fixed here rather than inferred from the observed counts.
majority_label, minority_label = "0", "1"

# Per-class row quotas, looked up from the configured target distribution.
target_minority = TARGET_CLASS_COUNTS[minority_label]
target_majority = TARGET_CLASS_COUNTS[majority_label]

print("\nTARGET DISTRIBUTION:")
print(f"Class 0 -> {target_majority}")
print(f"Class 1 -> {target_minority}")

# ---------- 2) Reservoir sampling ----------
# Second pass over the input. Per-class state is kept in dicts keyed by the
# label value so both classes go through a single code path:
#   - until a class's bucket holds `quota` rows, every row is appended;
#   - after that, the n-th seen row of the class draws a slot in [0, n) and
#     overwrites the bucket entry only when the slot falls below the quota.
quotas = {minority_label: target_minority, majority_label: target_majority}
buckets = {minority_label: [], majority_label: []}
seen = {minority_label: 0, majority_label: 0}

with open(infile, "r", encoding="latin1", errors="replace", newline='') as fin:
    reader = csv.reader(fin)
    hdr = next(reader)  # header row; reused later when writing the output

    for row in reader:
        lab = row[class_idx].strip()
        if lab not in quotas:
            continue  # rows carrying any other label value are ignored

        seen[lab] += 1
        bucket = buckets[lab]
        quota = quotas[lab]

        if len(bucket) < quota:
            bucket.append(row)
            continue

        slot = random.randrange(seen[lab])
        if slot < quota:
            bucket[slot] = row

# Expose the per-class results under the names the rest of the script uses.
reservoir_min = buckets[minority_label]
reservoir_maj = buckets[majority_label]
seen_min = seen[minority_label]
seen_maj = seen[majority_label]

print("\nSeen class 1 rows:", seen_min)
print("Seen class 0 rows:", seen_maj)

# ---------- 3) Oversample if needed ----------
def oversample_to_target(pool_rows, target_n):
    """Pad ``pool_rows`` with random repeats until it has ``target_n`` rows.

    Args:
        pool_rows: Sequence of rows to start from and to draw repeats from.
        target_n: Desired length of the returned list.

    Returns:
        A new list: the original rows in order, followed by rows picked from
        ``pool_rows`` with replacement, cut to at most ``target_n`` entries.
        An empty pool yields an empty list because there is nothing to draw.
    """
    grown = list(pool_rows)
    if not grown:
        return grown

    # One random pick per missing row; a non-positive shortfall draws nothing.
    shortfall = target_n - len(grown)
    grown.extend(random.choice(pool_rows) for _ in range(shortfall))
    return grown[:target_n]

# Top up whichever class came out of the sampling pass short of its quota.
minority_shortfall = target_minority - len(reservoir_min)
if minority_shortfall > 0:
    print("⚠ Oversampling class 1 (minority) with replacement")
    reservoir_min = oversample_to_target(reservoir_min, target_minority)

majority_shortfall = target_majority - len(reservoir_maj)
if majority_shortfall > 0:
    print("⚠ Oversampling class 0 (majority) with replacement")
    reservoir_maj = oversample_to_target(reservoir_maj, target_majority)

# ---------- 4) Combine & shuffle ----------
# Majority rows first, then minority rows, then mix them in place.
final_rows = [*reservoir_maj, *reservoir_min]
random.shuffle(final_rows)

# ---------- 5) Write output ----------
# Create the destination folder if needed, then emit header + sampled rows.
out_path = Path(outfile)
out_path.parent.mkdir(parents=True, exist_ok=True)

with out_path.open("w", encoding="utf-8", newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(hdr)
    writer.writerows(final_rows)

# ---------- 6) Final verification ----------
# Re-tally the label column of the rows that were written and report it.
final_labels = [sampled[class_idx].strip() for sampled in final_rows]
final_counts = {"0": 0, "1": 0}
for label in final_labels:
    final_counts[label] += 1

print("\n✅ FINAL CLASS DISTRIBUTION:")
print("Class 0:", final_counts["0"])
print("Class 1:", final_counts["1"])
print("Saved file →", outfile)


=== Stratified Sampling with Forced Class Balance ===
Original dataset size: 188636
Original class distribution: {'0': 177736, '1': 10900}

TARGET DISTRIBUTION:
Class 0 -> 2600
Class 1 -> 2400

Seen class 1 rows: 10900
Seen class 0 rows: 177736

✅ FINAL CLASS DISTRIBUTION:
Class 0: 2600
Class 1: 2400
Saved file → /Users/manmeetkaur/Downloads/msrstratified_sample_5000.csv
