In [26]:
import pickle
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder
import scipy.sparse as sp


In [27]:
# CELL A1 — inspect only the CSV header (nrows=0), so no data rows are loaded.

# Relative path, matching the INPUT / INPUT_CLEANED constants used by the later
# cells; the previous absolute D:\ path only resolved on a single machine.
fn = "cleaned_data.csv"
hdr = pd.read_csv(fn, nrows=0)
print("Columns count:", len(hdr.columns))
print("First 50 columns:", hdr.columns[:50].tolist())
# The later cells add 'orig_index' themselves; this shows whether it is already stored.
print("'orig_index' in header?", "orig_index" in hdr.columns)


Columns count: 11
First 50 columns: ['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine', 'lic_no', 'link', 'address', 'menu']
'orig_index' in header? False


In [28]:
# ================================================================
#   FULL SPARSE ENCODING PIPELINE — NO PARQUET, NO EXTRA INSTALLS
# ================================================================

import pandas as pd
import pickle
import scipy.sparse as sp
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# ---- configuration: the input file and every artifact this cell writes ----
INPUT = "cleaned_data.csv"

OUT_SPARSE = "encoded_features.npz"            # one-hot matrix (scipy sparse)
OUT_FEATNAMES = "encoded_feature_names.pkl"    # names of the one-hot columns
OUT_NONENC = "encoded_other_columns.csv"       # pass-through columns (CSV, not parquet)
ENCODER_PATH = "encoder.pkl"                   # fitted OneHotEncoder
OUT_BUNDLE = "encoded_bundle.npz"              # optional all-in-one archive

cat_cols = ["city", "cuisine"]

# ---- load the cleaned data and record each row's position ----
df = pd.read_csv(INPUT)
df["orig_index"] = df.index

# Stop early when a column that should be encoded is absent.
missing = [name for name in cat_cols if name not in df.columns]
if missing:
    raise KeyError(f"Missing categorical columns: {', '.join(missing)}")

# Everything that is not one-hot encoded (this includes the new orig_index).
other_cols = [name for name in df.columns if name not in cat_cols]
other_frame = df[other_cols]

# ---- fit the encoder and build the sparse one-hot matrix ----
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
encoded_sparse = encoder.fit_transform(df[cat_cols])
feat_names = encoder.get_feature_names_out(cat_cols)

print("Sparse shape:", encoded_sparse.shape, "nnz:", encoded_sparse.nnz)

# ---- persist the artifacts one at a time, reporting each as it is written ----
sp.save_npz(OUT_SPARSE, encoded_sparse, compressed=True)
print("Saved:", OUT_SPARSE)

with open(OUT_FEATNAMES, "wb") as names_file:
    pickle.dump(list(feat_names), names_file)
print("Saved:", OUT_FEATNAMES)

other_frame.to_csv(OUT_NONENC, index=False)
print("Saved:", OUT_NONENC)

with open(ENCODER_PATH, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)
print("Saved:", ENCODER_PATH)

# ---- optional: the matrix's data/indices/indptr arrays plus metadata in one archive ----
bundle_arrays = {
    "data": encoded_sparse.data,
    "indices": encoded_sparse.indices,
    "indptr": encoded_sparse.indptr,
    "shape": np.array(encoded_sparse.shape),
    "other_cols": other_frame.to_numpy(),
    "feat_names": np.array(feat_names, dtype=object),
}
np.savez_compressed(OUT_BUNDLE, **bundle_arrays)

print("Saved bundle:", OUT_BUNDLE)
print("\nDONE.")


Sparse shape: (108369, 894) nnz: 216738
Saved: encoded_features.npz
Saved: encoded_feature_names.pkl
Saved: encoded_other_columns.csv
Saved: encoder.pkl
Saved bundle: encoded_bundle.npz

DONE.


In [29]:
import numpy as np
import pandas as pd
import pickle
import scipy.sparse as sp

def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One-hot matrix stored in scipy's .npz format
sparse = sp.load_npz("encoded_features.npz")

# Names of the one-hot columns
feat_names = _load_pickle("encoded_feature_names.pkl")

# Non-encoded columns (CSV version)
other_df = pd.read_csv("encoded_other_columns.csv")

# Encoder object
encoder = _load_pickle("encoder.pkl")

print("Reload complete.")


Reload complete.


In [30]:
from pathlib import Path

# Artifact label -> expected location, relative to the working directory.
files = {
    label: Path(filename)
    for label, filename in [
        ("sparse_npz", "encoded_features.npz"),
        ("feat_names", "encoded_feature_names.pkl"),
        ("other_csv", "encoded_other_columns.csv"),
        ("cleaned", "cleaned_data.csv"),
        ("big_encoded", "encoded_data.csv"),
    ]
}

# One line per artifact: label, whether the file exists, and its path.
for k, p in files.items():
    print(f"{k} -> {p.exists()} {p}")


sparse_npz -> True encoded_features.npz
feat_names -> True encoded_feature_names.pkl
other_csv -> True encoded_other_columns.csv
cleaned -> True cleaned_data.csv
big_encoded -> False encoded_data.csv


In [31]:
# ================================================================
#   FINAL SOLUTION — SINGLE NPZ BUNDLE (NO CSV, NO PARQUET)
# ================================================================

import pandas as pd
import pickle
import scipy.sparse as sp
import numpy as np
from sklearn.preprocessing import OneHotEncoder

INPUT = "cleaned_data.csv"
OUT_BUNDLE = "encoded_all_bundle.npz"   # single archive holding matrix + metadata
ENCODER_PKL = "encoder.pkl"             # fitted OneHotEncoder

cat_cols = ["city", "cuisine"]

# -------------------------------
# LOAD CLEANED DATA
# -------------------------------

df = pd.read_csv(INPUT)
# Keep each row's original position so it can be recovered from the bundle.
df["orig_index"] = df.index

# Fail with a clear message if a column to encode is absent; without this the
# df[cat_cols] lookup below raises a less specific KeyError.
missing = [c for c in cat_cols if c not in df.columns]
if missing:
    raise KeyError("Missing categorical columns: " + ", ".join(missing))

# Everything that is not one-hot encoded (includes orig_index).
other_cols = [c for c in df.columns if c not in cat_cols]

# -------------------------------
# ONE-HOT ENCODING (sparse)
# -------------------------------

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
encoded_sparse = encoder.fit_transform(df[cat_cols])
feat_names = encoder.get_feature_names_out(cat_cols)

# -------------------------------
# SAVE ENCODER
# -------------------------------

with open(ENCODER_PKL, "wb") as f:
    pickle.dump(encoder, f)
print("Saved encoder.pkl")

# -------------------------------
# SAVE EVERYTHING IN ONE NPZ
# -------------------------------

# data / indices / indptr / shape are the pieces needed to rebuild the sparse
# matrix; the remaining entries carry the column names and pass-through values.
# orig_index is stored both inside other_cols and as its own array.
np.savez_compressed(
    OUT_BUNDLE,
    data=encoded_sparse.data,
    indices=encoded_sparse.indices,
    indptr=encoded_sparse.indptr,
    shape=np.array(encoded_sparse.shape),
    feat_names=np.array(feat_names, dtype=object),
    other_cols=df[other_cols].to_numpy(),
    other_col_names=np.array(other_cols, dtype=object),
    orig_index=df["orig_index"].to_numpy()
)

print("\nSAVED ALL →", OUT_BUNDLE)
print("This replaces all CSV/parquet files.")


Saved encoder.pkl

SAVED ALL → encoded_all_bundle.npz
This replaces all CSV/parquet files.


In [32]:
# ================================================================
#   RELOAD EVERYTHING FROM SINGLE BUNDLE
# ================================================================

import numpy as np
import scipy.sparse as sp

# Read every array up front and close the archive immediately. np.load returns
# a lazily-read NpzFile that keeps the file open until closed, which on Windows
# can block the save cell from overwriting the bundle while this kernel lives.
# NOTE: allow_pickle=True is required for the object-dtype arrays but executes
# pickled code on load — only open bundles produced by this notebook.
with np.load("encoded_all_bundle.npz", allow_pickle=True) as archive:
    bundle = {key: archive[key] for key in archive.files}

# reconstruct sparse matrix from its stored data / indices / indptr arrays
csr = sp.csr_matrix(
    (bundle["data"], bundle["indices"], bundle["indptr"]),
    shape=tuple(bundle["shape"])
)

# restore metadata
feat_names = bundle["feat_names"]
other_columns = bundle["other_cols"]
other_col_names = bundle["other_col_names"]
orig_index = bundle["orig_index"]

print("Sparse reconstructed:", csr.shape)
print("Other columns:", other_columns.shape)
print("First feature name:", feat_names[0])


Sparse reconstructed: (108369, 894)
Other columns: (108369, 10)
First feature name: city_Abids & Koti,Hyderabad


In [37]:
# NOTE(review): X_other_sparse and X_ohe are not assigned by any cell above this
# one in the visible notebook; the next cell (In [38]) assigns both and then runs
# this same hstack. On a fresh kernel this line would presumably raise NameError —
# confirm, and consider deleting this cell.
X_final = sp.hstack([X_other_sparse, X_ohe], format="csr")


In [38]:
# ================================================================
#   FIXED ML DATASET CONSTRUCTION (NO OBJECT DTYPE ERRORS)
# ================================================================

import numpy as np
import scipy.sparse as sp
import pandas as pd
import pickle

# Load bundle: read every array up front and close the archive immediately
# (an open NpzFile keeps the file handle alive, which can block re-saving the
# bundle on Windows).
# NOTE: allow_pickle=True is required for the object-dtype arrays but executes
# pickled code on load — only open bundles produced by this notebook.
with np.load("encoded_all_bundle.npz", allow_pickle=True) as archive:
    bundle = {key: archive[key] for key in archive.files}

# sparse OHE matrix, rebuilt from its stored data / indices / indptr arrays
X_ohe = sp.csr_matrix(
    (bundle["data"], bundle["indices"], bundle["indptr"]),
    shape=tuple(bundle["shape"])
)

feat_names = list(bundle["feat_names"])
other_cols = bundle["other_cols"]
other_col_names = list(bundle["other_col_names"])
orig_index = bundle["orig_index"]

# --------------------------------------
# FIX NON-ENCODED COLUMNS (object dtype)
# --------------------------------------

other_df = pd.DataFrame(other_cols, columns=other_col_names)

# convert object columns → numeric where every value parses, otherwise → integer
# category codes (pandas assigns code -1 to missing values)
for col in other_df.columns:
    if other_df[col].dtype == "object":
        try:
            other_df[col] = pd.to_numeric(other_df[col])
        except (ValueError, TypeError):
            # Only parse failures fall back to codes; a bare except here would
            # also have swallowed KeyboardInterrupt and genuine bugs.
            other_df[col] = other_df[col].astype("category").cat.codes

# now guaranteed numeric
# NOTE(review): float32 represents integers exactly only up to 2**24, so large
# id-like values would be rounded here — confirm the value ranges.
X_other_dense = other_df.to_numpy().astype(np.float32)
X_other_sparse = sp.csr_matrix(X_other_dense)

# --------------------------------------
# combine other features + OHE
# --------------------------------------

X_final = sp.hstack([X_other_sparse, X_ohe], format="csr")

print("Final matrix:", X_final.shape)
print("Total features:", X_final.shape[1])

# save final
sp.save_npz("X_final_sparse.npz", X_final, compressed=True)
print("Saved X_final_sparse.npz")


Final matrix: (108369, 904)
Total features: 904
Saved X_final_sparse.npz


In [39]:
# SINGLE CELL: auto-branch -> (A) combine existing sparse artifacts OR (B) rebuild then combine
import pandas as pd
import pickle
from pathlib import Path
import scipy.sparse as sp
from sklearn.preprocessing import OneHotEncoder

# ================= CONFIG =================
# Source data
INPUT_CLEANED = Path("cleaned_data.csv")
cat_cols = ["city", "cuisine"]   # columns that get one-hot encoded

# Intermediate sparse artifacts
SPARSE_NPZ = Path("encoded_features.npz")
FEAT_NAMES_PKL = Path("encoded_feature_names.pkl")
OTHER_CSV = Path("encoded_other_columns.csv")
ENCODER_PATH = Path("encoder.pkl")

# Final wide CSV — dense, so watch disk space and memory
OUT_DENSE = Path("encoded_data_dense.csv")

# Rows densified per batch; lower this if you hit MemoryError
batch_size = 2000
# ==========================================

def stream_combine(sparse_path, featnames_path, other_csv, out_dense, batch_size=2000):
    """Write a dense CSV joining the pass-through columns with the one-hot block.

    Rows of ``other_csv`` are read ``batch_size`` at a time; the matching row
    slice of the sparse matrix is densified and written next to them, so only
    one batch of the wide table is held in memory at once.

    Parameters
    ----------
    sparse_path : str or Path
        ``.npz`` file readable by ``scipy.sparse.load_npz``.
    featnames_path : str or Path
        Pickle file holding the one-hot column names (any sequence).
    other_csv : str or Path
        CSV with the non-encoded columns; must contain an ``orig_index`` column.
    out_dense : str or Path
        Destination CSV. Overwritten if it already exists.
    batch_size : int, default 2000
        Number of rows per chunk.

    Raises
    ------
    MemoryError
        If ``other_csv`` needs reordering and cannot be loaded in memory.
    ValueError
        If ``other_csv`` and the sparse matrix do not have the same row count.
    """
    other_csv = Path(other_csv)  # .with_name() below needs a Path, not a str

    # Load metadata.
    # NOTE: pickle executes code on load — only pass files this notebook wrote.
    with open(featnames_path, "rb") as f:
        feat_names = list(pickle.load(f))

    # .tocsr() is a no-op for CSR input and makes row slicing safe otherwise.
    sparse_mtx = sp.load_npz(sparse_path).tocsr()
    n_rows = sparse_mtx.shape[0]
    print(f"[combine] sparse shape: {sparse_mtx.shape}, nnz: {sparse_mtx.nnz}")

    # header: other CSV columns + ohe names
    other_hdr = pd.read_csv(other_csv, nrows=0).columns.tolist()
    all_columns = other_hdr + feat_names

    # Write the header through pandas so names containing commas or quotes
    # (e.g. "city_Abids & Koti,Hyderabad") are quoted. A plain ",".join() split
    # such names into extra fields and misaligned the header with the data rows.
    pd.DataFrame(columns=all_columns).to_csv(out_dense, index=False, encoding="utf-8")

    # Quick ordering check: the sparse rows are positional, so other_csv must be
    # in orig_index order. Only the first 10 rows are inspected.
    other_df_head = pd.read_csv(other_csv, usecols=["orig_index"], nrows=10)
    head_is_seq = (other_df_head["orig_index"].astype(int).tolist() == list(range(len(other_df_head))))
    if head_is_seq:
        other_csv_used = other_csv
    else:
        print("[combine] Note: other_csv orig_index does not look like sequential reading index. We'll reorder by orig_index before combining.")
        temp_reordered = other_csv.with_name(other_csv.stem + "_reordered.csv")
        try:
            odf = pd.read_csv(other_csv)
            odf = odf.sort_values("orig_index").reset_index(drop=True)
            odf.to_csv(temp_reordered, index=False)
        except MemoryError:
            # No chunked/external sort is implemented, so report that directly.
            raise MemoryError("Chunked external reordering not implemented in this environment. Reduce batch_size or rebuild sparse and ensure other_csv order matches sparse rows.")
        other_csv_used = temp_reordered
        print("[combine] Reordered other_csv in-memory (wrote tmp file):", temp_reordered)

    # Now stream in chunks and combine
    row_start = 0
    total_written = 0
    reader = pd.read_csv(other_csv_used, chunksize=batch_size)
    try:
        for chunk in reader:
            chunk_len = len(chunk)
            row_end = row_start + chunk_len
            if row_end > n_rows:
                # Without this check pd.concat would pad the one-hot block with NaN.
                raise ValueError(
                    f"{other_csv_used} has more rows than the sparse matrix ({n_rows}); "
                    "the artifacts are out of sync."
                )
            # densify only this batch
            dense_slice = sparse_mtx[row_start:row_end].toarray()
            ohe_df = pd.DataFrame(dense_slice, columns=feat_names)
            # Both frames get a fresh 0..k-1 index so concat pairs rows by position.
            out_df = pd.concat([chunk.reset_index(drop=True), ohe_df], axis=1)
            out_df.to_csv(out_dense, mode="a", header=False, index=False)
            total_written += chunk_len
            print(f"[combine] Wrote rows {row_start}..{row_end-1}")
            row_start = row_end
    finally:
        # Release the CSV handle even if a batch fails part-way.
        reader.close()

    if total_written != n_rows:
        raise ValueError(
            f"Wrote {total_written} rows but the sparse matrix has {n_rows}; "
            "the artifacts are out of sync."
        )

    print(f"[combine] Done. Wrote {total_written} rows to {out_dense}")

# --------- MAIN logic: branch ----------
# Path A: the three sparse artifacts already exist -> combine them directly.
# Path B: rebuild them from cleaned_data.csv first, then combine.
artifacts_present = SPARSE_NPZ.exists() and FEAT_NAMES_PKL.exists() and OTHER_CSV.exists()

if artifacts_present:
    print("Found existing sparse artifacts. Proceeding to stream-combine (Path A).")
else:
    # Need to rebuild sparse artifacts from cleaned_data.csv
    if not INPUT_CLEANED.exists():
        raise FileNotFoundError("No sparse artifacts found and cleaned_data.csv not present. Cannot proceed.")
    print("Sparse artifacts not found — rebuilding from cleaned_data.csv (Path B). This will create encoded_features.npz, encoded_feature_names.pkl, encoded_other_columns.csv, encoder.pkl")
    df = pd.read_csv(INPUT_CLEANED)
    # Record row positions; stream_combine checks this column for ordering.
    df["orig_index"] = df.index
    missing = [c for c in cat_cols if c not in df.columns]
    if missing:
        raise KeyError("Missing categorical columns in cleaned_data.csv: " + ", ".join(missing))

    # Fit OneHotEncoder (sparse)
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    encoded_sparse = encoder.fit_transform(df[cat_cols])

    # Save artifacts
    sp.save_npz(SPARSE_NPZ, encoded_sparse, compressed=True)
    with open(FEAT_NAMES_PKL, "wb") as f:
        # Stored as a plain list, matching what the sparse-pipeline cell writes
        # to this same file, so the pickle has one type however it was produced.
        pickle.dump(list(encoder.get_feature_names_out(cat_cols)), f)
    with open(ENCODER_PATH, "wb") as f:
        pickle.dump(encoder, f)
    other_cols = [c for c in df.columns if c not in cat_cols]
    df[other_cols].to_csv(OTHER_CSV, index=False)
    print("Rebuilt artifacts and saved. Now combining into dense CSV.")

# Both paths end with the same streaming combine step.
stream_combine(SPARSE_NPZ, FEAT_NAMES_PKL, OTHER_CSV, OUT_DENSE, batch_size=batch_size)

print("All done. If you produced encoded_data_dense.csv, check file size and disk space.")


Found existing sparse artifacts. Proceeding to stream-combine (Path A).
[combine] sparse shape: (108369, 894), nnz: 216738
[combine] Wrote rows 0..1999
[combine] Wrote rows 2000..3999
[combine] Wrote rows 4000..5999
[combine] Wrote rows 6000..7999
[combine] Wrote rows 8000..9999
[combine] Wrote rows 10000..11999
[combine] Wrote rows 12000..13999
[combine] Wrote rows 14000..15999
[combine] Wrote rows 16000..17999
[combine] Wrote rows 18000..19999
[combine] Wrote rows 20000..21999
[combine] Wrote rows 22000..23999
[combine] Wrote rows 24000..25999
[combine] Wrote rows 26000..27999
[combine] Wrote rows 28000..29999
[combine] Wrote rows 30000..31999
[combine] Wrote rows 32000..33999
[combine] Wrote rows 34000..35999
[combine] Wrote rows 36000..37999
[combine] Wrote rows 38000..39999
[combine] Wrote rows 40000..41999
[combine] Wrote rows 42000..43999
[combine] Wrote rows 44000..45999
[combine] Wrote rows 46000..47999
[combine] Wrote rows 48000..49999
[combine] Wrote rows 50000..51999
[combi