1. Data Understanding and Cleaning


In [40]:
import re
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer
)
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity

import joblib


In [41]:
df = pd.read_csv("/content/swiggy.csv", on_bad_lines='skip')
print(df.head())

       id               name    city rating     rating_count   cost  \
0  567335     AB FOODS POINT  Abohar     --  Too Few Ratings  ₹ 200   
1  531342  Janta Sweet House  Abohar    4.4      50+ ratings  ₹ 200   
2  158203  theka coffee desi  Abohar    3.8     100+ ratings  ₹ 100   
3  187912          Singh Hut  Abohar    3.7      20+ ratings  ₹ 250   
4  543530      GRILL MASTERS  Abohar     --  Too Few Ratings  ₹ 250   

                      cuisine          lic_no  \
0            Beverages,Pizzas  22122652000138   
1               Sweets,Bakery  12117201000112   
2                   Beverages  22121652000190   
3            Fast Food,Indian  22119652000167   
4  Italian-American,Fast Food  12122201000053   

                                                link  \
0  https://www.swiggy.com/restaurants/ab-foods-po...   
1  https://www.swiggy.com/restaurants/janta-sweet...   
2  https://www.swiggy.com/restaurants/theka-coffe...   
3  https://www.swiggy.com/restaurants/singh-hut-n...  

In [42]:
# Clean cost column by removing currency symbols and non-numeric characters
df["cost"] = df["cost"].astype(str)
df["cost"] = df["cost"].apply(lambda x: re.sub(r"[^\d.]", "", x))

print(df["cost"].head())


0    200
1    200
2    100
3    250
4    250
Name: cost, dtype: object


In [43]:
df["rating_count"] = df["rating_count"].str.extract(r"(\d+)").astype(float)

In [44]:
# Convert rating to numeric first (in case it contains strings)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# Fill NaN rating using mean rating for each restaurant name
df["rating"] = df.groupby("name")["rating"].transform(
    lambda x: x.fillna(x.mean())
)
print(df["rating"].head())

0    NaN
1    4.4
2    3.8
3    3.7
4    4.0
Name: rating, dtype: float64


In [45]:
df["cuisine"] = df["cuisine"].str.split(",")
df = df.explode("cuisine")


In [46]:
df.isnull().sum()

Unnamed: 0,0
id,0
name,86
city,0
rating,130387
rating_count,148102
cost,0
cuisine,99
lic_no,316
link,0
address,86


In [47]:
# BEFORE
print("=== BEFORE ===")
print("Duplicate rows:", df.duplicated().sum())
print("Total rows:", len(df))

# REMOVE DUPLICATES
df = df.drop_duplicates()

# AFTER
print("\n=== AFTER ===")
print("Duplicate rows:", df.duplicated().sum())
print("Total rows:", len(df))

=== BEFORE ===
Duplicate rows: 0
Total rows: 256611

=== AFTER ===
Duplicate rows: 0
Total rows: 256611


In [48]:
# Convert rating to numeric (important)
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# -------------------------------------------
# 1. Rating null BEFORE
# -------------------------------------------
print("Rating null BEFORE:", df["rating"].isnull().sum())

# -------------------------------------------
# 2. Fill rating NaN with mean rating by NAME
# -------------------------------------------
df["rating"] = df.groupby("name")["rating"].transform(
    lambda x: x.fillna(x.mean())
)

# Fallback: if all rating values for a name were NaN
df["rating"] = df["rating"].fillna(df["rating"].mean())

# -------------------------------------------
# 3. Rating null AFTER
# -------------------------------------------
print("Rating null AFTER:", df["rating"].isnull().sum())

# -------------------------------------------
# 4. Drop null values in ALL OTHER columns
# (Rating will not be dropped because it no longer has NaN)
# -------------------------------------------
df = df.dropna()

# -------------------------------------------
# 5. Final missing value summary
# -------------------------------------------
print("\nNull values column-wise AFTER cleaning:")
print(df.isnull().sum())


Rating null BEFORE: 130387
Rating null AFTER: 0

Null values column-wise AFTER cleaning:
id              0
name            0
city            0
rating          0
rating_count    0
cost            0
cuisine         0
lic_no          0
link            0
address         0
menu            0
dtype: int64


In [49]:
df.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
1,531342,Janta Sweet House,Abohar,4.4,50.0,200,Sweets,12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
1,531342,Janta Sweet House,Abohar,4.4,50.0,200,Bakery,12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100.0,100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20.0,250,Fast Food,22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
3,187912,Singh Hut,Abohar,3.7,20.0,250,Indian,22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json


In [50]:
df.dtypes

Unnamed: 0,0
id,int64
name,object
city,object
rating,float64
rating_count,float64
cost,object
cuisine,object
lic_no,object
link,object
address,object


In [51]:
# Columns
categorical_cols = ["name", "city", "cuisine"]
numerical_cols = ["rating", "rating_count", "cost"]

# -----------------------------
# BEFORE DATA TYPES
# -----------------------------
print("=== BEFORE DATA TYPES ===")
print(df.dtypes)

# -----------------------------
# TYPE CONVERSION
# -----------------------------
# Convert categorical
df[categorical_cols] = df[categorical_cols].astype("category")

# Convert numerical
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors="coerce")

# -----------------------------
# AFTER DATA TYPES
# -----------------------------
print("\n=== AFTER DATA TYPES ===")
print(df.dtypes)


=== BEFORE DATA TYPES ===
id                int64
name             object
city             object
rating          float64
rating_count    float64
cost             object
cuisine          object
lic_no           object
link             object
address          object
menu             object
dtype: object

=== AFTER DATA TYPES ===
id                 int64
name            category
city            category
rating           float64
rating_count     float64
cost             float64
cuisine         category
lic_no            object
link              object
address           object
menu              object
dtype: object


In [52]:
# SAVE CLEANED CSV TO LOCATION
# -----------------------------
save_path = Path("content/cleaned_data.csv")
# Create the directory if it doesn't exist
save_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, index=False)

print(f"\nCleaned CSV saved at: {save_path}")


Cleaned CSV saved at: content/cleaned_data.csv


2-Data_Preprocessing

In [55]:
# CELL A1 — show header and number of columns (no heavy memory)

fn = ("content/cleaned_data.csv")
hdr = pd.read_csv(fn, nrows=0)
print("Columns count:", len(hdr.columns))
print("First 50 columns:", hdr.columns[:50].tolist())
print("'orig_index' in header?", "orig_index" in hdr.columns)


Columns count: 11
First 50 columns: ['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine', 'lic_no', 'link', 'address', 'menu']
'orig_index' in header? False


In [72]:
# ================================================================
#   FULL SPARSE ENCODING PIPELINE — NO PARQUET, NO EXTRA INSTALLS
# ================================================================


INPUT = "content/cleaned_data.csv"

OUT_SPARSE = "encoded_features.npz"
OUT_FEATNAMES = "encoded_feature_names.pkl"
OUT_NONENC = "encoded_other_columns.csv"       # CSV instead of parquet
ENCODER_PATH = "encoder.pkl"
OUT_BUNDLE = "encoded_bundle.npz"

cat_cols = ["city", "cuisine"]

# -------------------------------
# LOAD CLEANED DATA
# -------------------------------

df = pd.read_csv(INPUT)
df["orig_index"] = df.index

# check cat columns
missing = [c for c in cat_cols if c not in df.columns]
if missing:
    raise KeyError("Missing categorical columns: " + ", ".join(missing))

other_cols = [c for c in df.columns if c not in cat_cols]

# -------------------------------
# ONE-HOT ENCODING (sparse)
# -------------------------------

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
encoded_sparse = encoder.fit_transform(df[cat_cols])

feat_names = encoder.get_feature_names_out(cat_cols)

print("Sparse shape:", encoded_sparse.shape, "nnz:", encoded_sparse.nnz)

# -------------------------------
# SAVE SPARSE MATRIX
# -------------------------------

sp.save_npz(OUT_SPARSE, encoded_sparse, compressed=True)
print("Saved:", OUT_SPARSE)

# -------------------------------
# SAVE FEATURE NAMES
# -------------------------------

with open(OUT_FEATNAMES, "wb") as f:
    pickle.dump(list(feat_names), f)
print("Saved:", OUT_FEATNAMES)

# -------------------------------
# SAVE OTHER (non-encoded) COLUMNS
# -------------------------------

df[other_cols].to_csv(OUT_NONENC, index=False)
print("Saved:", OUT_NONENC)

# -------------------------------
# SAVE ENCODER
# -------------------------------

with open(ENCODER_PATH, "wb") as f:
    pickle.dump(encoder, f)
print("Saved:", ENCODER_PATH)

# -------------------------------
# OPTIONAL: SINGLE BUNDLE NPZ
# -------------------------------

np.savez_compressed(
    OUT_BUNDLE,
    data=encoded_sparse.data,
    indices=encoded_sparse.indices,
    indptr=encoded_sparse.indptr,
    shape=np.array(encoded_sparse.shape),
    other_cols=df[other_cols].to_numpy(),
    feat_names=np.array(feat_names, dtype=object)
)

print("Saved bundle:", OUT_BUNDLE)
print("\nDONE.")

Sparse shape: (108369, 894) nnz: 216738
Saved: encoded_features.npz
Saved: encoded_feature_names.pkl
Saved: encoded_other_columns.csv
Saved: encoder.pkl
Saved bundle: encoded_bundle.npz

DONE.


In [57]:


# ----- load sparse -----
sparse = sp.load_npz("encoded_features.npz")

# ----- load feature names -----
with open("encoded_feature_names.pkl", "rb") as f:
    feat_names = pickle.load(f)

# ----- load other columns (CSV version) -----
other_df = pd.read_csv("encoded_other_columns.csv")

# ----- load encoder -----
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)

print("Reload complete.")


Reload complete.


In [58]:

files = {
    "sparse_npz": Path("encoded_features.npz"),
    "feat_names": Path("encoded_feature_names.pkl"),
    "other_csv": Path("encoded_other_columns.csv"),
    "cleaned": Path("cleaned_data.csv"),
    "big_encoded": Path("encoded_data.csv"),
}

for k, p in files.items():
    print(k, "->", p.exists(), str(p))


sparse_npz -> True encoded_features.npz
feat_names -> True encoded_feature_names.pkl
other_csv -> True encoded_other_columns.csv
cleaned -> False cleaned_data.csv
big_encoded -> False encoded_data.csv


In [59]:
# ================================================================
#   FINAL SOLUTION — SINGLE NPZ BUNDLE (NO CSV, NO PARQUET)
# ================================================================


INPUT = "content/cleaned_data.csv"
OUT_BUNDLE = "encoded_all_bundle.npz"
ENCODER_PKL = "encoder.pkl"

cat_cols = ["city", "cuisine"]

# -------------------------------
# LOAD CLEANED DATA
# -------------------------------

df = pd.read_csv(INPUT)
df["orig_index"] = df.index

other_cols = [c for c in df.columns if c not in cat_cols]

# -------------------------------
# ONE-HOT ENCODING (sparse)
# -------------------------------

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
encoded_sparse = encoder.fit_transform(df[cat_cols])
feat_names = encoder.get_feature_names_out(cat_cols)

# -------------------------------
# SAVE ENCODER
# -------------------------------

with open(ENCODER_PKL, "wb") as f:
    pickle.dump(encoder, f)
print("Saved encoder.pkl")

# -------------------------------
# SAVE EVERYTHING IN ONE NPZ
# -------------------------------

np.savez_compressed(
    OUT_BUNDLE,
    data=encoded_sparse.data,
    indices=encoded_sparse.indices,
    indptr=encoded_sparse.indptr,
    shape=np.array(encoded_sparse.shape),
    feat_names=np.array(feat_names, dtype=object),
    other_cols=df[other_cols].to_numpy(),
    other_col_names=np.array(other_cols, dtype=object),
    orig_index=df["orig_index"].to_numpy()
)

print("\nSAVED ALL →", OUT_BUNDLE)
print("This replaces all CSV/parquet files.")

Saved encoder.pkl

SAVED ALL → encoded_all_bundle.npz
This replaces all CSV/parquet files.


In [60]:
# ================================================================
#   RELOAD EVERYTHING FROM SINGLE BUNDLE
# ================================================================



bundle = np.load("encoded_all_bundle.npz", allow_pickle=True)

# reconstruct sparse matrix
csr = sp.csr_matrix(
    (bundle["data"], bundle["indices"], bundle["indptr"]),
    shape=tuple(bundle["shape"])
)

# restore metadata
feat_names = bundle["feat_names"]
other_columns = bundle["other_cols"]
other_col_names = bundle["other_col_names"]
orig_index = bundle["orig_index"]

print("Sparse reconstructed:", csr.shape)
print("Other columns:", other_columns.shape)
print("First feature name:", feat_names[0])


Sparse reconstructed: (108369, 894)
Other columns: (108369, 10)
First feature name: city_Abids & Koti,Hyderabad


In [61]:
# Assign the one-hot encoded sparse matrix to X_ohe
X_ohe = csr

# Create a DataFrame from 'other_columns' to easily select numeric types
df_other = pd.DataFrame(other_columns, columns=other_col_names)

# Identify numerical columns (id, rating, rating_count, cost, orig_index are numerical)
numeric_other_cols = df_other.select_dtypes(include=np.number).columns

# Convert only the numerical part of other_columns to a sparse matrix
X_other_sparse = sp.csr_matrix(df_other[numeric_other_cols].values)

X_final = sp.hstack([X_other_sparse, X_ohe], format="csr")

In [62]:
# ================================================================
#   FIXED ML DATASET CONSTRUCTION (NO OBJECT DTYPE ERRORS)
# ================================================================



# Load bundle
bundle = np.load("encoded_all_bundle.npz", allow_pickle=True)

# sparse OHE matrix
X_ohe = sp.csr_matrix(
    (bundle["data"], bundle["indices"], bundle["indptr"]),
    shape=tuple(bundle["shape"])
)

feat_names = list(bundle["feat_names"])
other_cols = bundle["other_cols"]
other_col_names = list(bundle["other_col_names"])
orig_index = bundle["orig_index"]

# --------------------------------------
# FIX NON-ENCODED COLUMNS (object dtype)
# --------------------------------------

other_df = pd.DataFrame(other_cols, columns=other_col_names)

# convert object columns (strings) → numeric codes
for col in other_df.columns:
    if other_df[col].dtype == "object":
        try:
            other_df[col] = pd.to_numeric(other_df[col])
        except:
            other_df[col] = other_df[col].astype("category").cat.codes

# now guaranteed numeric
X_other_dense = other_df.to_numpy().astype(np.float32)
X_other_sparse = sp.csr_matrix(X_other_dense)

# --------------------------------------
# combine other features + OHE
# --------------------------------------

X_final = sp.hstack([X_other_sparse, X_ohe], format="csr")

print("Final matrix:", X_final.shape)
print("Total features:", X_final.shape[1])

# save final
sp.save_npz("X_final_sparse.npz", X_final, compressed=True)
print("Saved X_final_sparse.npz")


Final matrix: (108369, 904)
Total features: 904
Saved X_final_sparse.npz


In [63]:
# SINGLE CELL: auto-branch -> (A) combine existing sparse artifacts OR (B) rebuild then combine
# ----------------- CONFIG -----------------
SPARSE_NPZ = Path("encoded_features.npz")
FEAT_NAMES_PKL = Path("encoded_feature_names.pkl")
OTHER_CSV = Path("encoded_other_columns.csv")
INPUT_CLEANED = Path("cleaned_data.csv")
OUT_DENSE = Path("encoded_data_dense.csv")   # final wide CSV (be careful with disk+memory)
ENCODER_PATH = Path("encoder.pkl")
cat_cols = ["city", "cuisine"]
batch_size = 2000   # reduce if you get MemoryError
# ------------------------------------------

def stream_combine(sparse_path, featnames_path, other_csv, out_dense, batch_size=2000):
    # Load metadata
    with open(featnames_path, "rb") as f:
        feat_names = pickle.load(f)

    sparse_mtx = sp.load_npz(sparse_path)  # csr_matrix
    n_rows, n_feats = sparse_mtx.shape
    print(f"[combine] sparse shape: {sparse_mtx.shape}, nnz: {sparse_mtx.nnz}")

    # header: other CSV columns + ohe names
    other_hdr = pd.read_csv(other_csv, nrows=0).columns.tolist()
    all_columns = other_hdr + list(feat_names)

    # Write header
    with open(out_dense, "w", encoding="utf-8", newline="") as f:
        f.write(",".join(map(str, all_columns)) + "\n")

    # Check ordering: ensure 'orig_index' exists and, if necessary, reorder other_csv to numeric increasing order
    # We will read other_csv in chunks, but to guarantee correct alignment we need to know if other_csv is in same order
    # Quick check: read first 10 orig_index and compare with 0..9
    other_df_head = pd.read_csv(other_csv, usecols=["orig_index"], nrows=10)
    head_is_seq = (other_df_head["orig_index"].astype(int).tolist() == list(range(len(other_df_head))))
    if not head_is_seq:
        print("[combine] Note: other_csv orig_index does not look like sequential reading index. We'll reorder by orig_index before combining.")
        # Reorder other_csv fully in a memory-safe way into a temp file using chunks (we'll create other_csv._reordered)
        temp_reordered = other_csv.with_name(other_csv.stem + "_reordered.csv")
        # Read entire other_csv into df may be OK; if it's too big we do chunked sort via disk — but here attempt full load first with try
        try:
            odf = pd.read_csv(other_csv)
            odf = odf.sort_values("orig_index").reset_index(drop=True)
            odf.to_csv(temp_reordered, index=False)
            other_csv_used = temp_reordered
            print("[combine] Reordered other_csv in-memory (wrote tmp file):", temp_reordered)
        except MemoryError:
            # Fallback: perform an external sort-like merge via pandas chunks (less efficient). We'll build index->row mapping in disk chunks.
            print("[combine] Reordering via chunked method due to MemoryError. This may take longer.")
            # Read orig_index only to determine order
            idx_series = pd.read_csv(other_csv, usecols=["orig_index"])["orig_index"].astype(int)
            # Create a DataFrame of positions to read in sorted order (this requires random-access read which CSV doesn't support)
            # So fallback to simpler approach: read whole file but with low memory; raise helpful error for now
            raise MemoryError("Chunked external reordering not implemented in this environment. Reduce batch_size or rebuild sparse and ensure other_csv order matches sparse rows.")
    else:
        other_csv_used = other_csv

    # Now stream in chunks and combine
    reader = pd.read_csv(other_csv_used, chunksize=batch_size)
    row_start = 0
    total_written = 0
    for chunk in reader:
        chunk_len = len(chunk)
        row_end = row_start + chunk_len
        # slice sparse
        sparse_slice = sparse_mtx[row_start:row_end]
        dense_slice = sparse_slice.toarray()  # convert only this batch
        ohe_df = pd.DataFrame(dense_slice, columns=feat_names, index=chunk.index)
        out_df = pd.concat([chunk.reset_index(drop=True), ohe_df.reset_index(drop=True)], axis=1)
        out_df.to_csv(out_dense, mode="a", header=False, index=False)
        total_written += chunk_len
        print(f"[combine] Wrote rows {row_start}..{row_end-1}")
        row_start = row_end

    print(f"[combine] Done. Wrote {total_written} rows to {out_dense}")

# --------- MAIN logic: branch ----------
if SPARSE_NPZ.exists() and FEAT_NAMES_PKL.exists() and OTHER_CSV.exists():
    print("Found existing sparse artifacts. Proceeding to stream-combine (Path A).")
    stream_combine(SPARSE_NPZ, FEAT_NAMES_PKL, OTHER_CSV, OUT_DENSE, batch_size=batch_size)
else:
    # Need to rebuild sparse artifacts from cleaned_data.csv
    if not INPUT_CLEANED.exists():
        raise FileNotFoundError("No sparse artifacts found and cleaned_data.csv not present. Cannot proceed.")
    print("Sparse artifacts not found — rebuilding from cleaned_data.csv (Path B). This will create encoded_features.npz, encoded_feature_names.pkl, encoded_other_columns.csv, encoder.pkl")
    df = pd.read_csv(INPUT_CLEANED)
    df["orig_index"] = df.index
    missing = [c for c in cat_cols if c not in df.columns]
    if missing:
        raise KeyError("Missing categorical columns in cleaned_data.csv: " + ", ".join(missing))

    # Fit OneHotEncoder (sparse)
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    encoded_sparse = encoder.fit_transform(df[cat_cols])

    # Save artifacts
    sp.save_npz(SPARSE_NPZ, encoded_sparse, compressed=True)
    with open(FEAT_NAMES_PKL, "wb") as f:
        pickle.dump(encoder.get_feature_names_out(cat_cols), f)
    with open(ENCODER_PATH, "wb") as f:
        pickle.dump(encoder, f)
    other_cols = [c for c in df.columns if c not in cat_cols]
    df[other_cols].to_csv(OTHER_CSV, index=False)
    print("Rebuilt artifacts and saved. Now combining into dense CSV.")
    # combine now
    stream_combine(SPARSE_NPZ, FEAT_NAMES_PKL, OTHER_CSV, OUT_DENSE, batch_size=batch_size)

print("All done. If you produced encoded_data_dense.csv, check file size and disk space.")


Found existing sparse artifacts. Proceeding to stream-combine (Path A).
[combine] sparse shape: (108369, 894), nnz: 216738
[combine] Wrote rows 0..1999
[combine] Wrote rows 2000..3999
[combine] Wrote rows 4000..5999
[combine] Wrote rows 6000..7999
[combine] Wrote rows 8000..9999
[combine] Wrote rows 10000..11999
[combine] Wrote rows 12000..13999
[combine] Wrote rows 14000..15999
[combine] Wrote rows 16000..17999
[combine] Wrote rows 18000..19999
[combine] Wrote rows 20000..21999
[combine] Wrote rows 22000..23999
[combine] Wrote rows 24000..25999
[combine] Wrote rows 26000..27999
[combine] Wrote rows 28000..29999
[combine] Wrote rows 30000..31999
[combine] Wrote rows 32000..33999
[combine] Wrote rows 34000..35999
[combine] Wrote rows 36000..37999
[combine] Wrote rows 38000..39999
[combine] Wrote rows 40000..41999
[combine] Wrote rows 42000..43999
[combine] Wrote rows 44000..45999
[combine] Wrote rows 46000..47999
[combine] Wrote rows 48000..49999
[combine] Wrote rows 50000..51999
[combi

3-Recommandation_Methodology

In [64]:
# Paths (adjust if necessary)
SPARSE_NPZ = Path("encoded_features.npz")
FEAT_NAMES_PKL = Path("encoded_feature_names.pkl")
OTHER_CSV = Path("encoded_other_columns.csv")
CLEANED_CSV = Path("content/cleaned_data.csv")

# sanity
for p in [SPARSE_NPZ, FEAT_NAMES_PKL, OTHER_CSV, CLEANED_CSV]:
    if not p.exists():
        raise FileNotFoundError(f"Missing required file: {p}")

# Load sparse OHE and other columns
ohe_sparse = sp.load_npz(SPARSE_NPZ)   # CSR matrix (n_rows x n_ohe_features)
with open(FEAT_NAMES_PKL, "rb") as f:
    ohe_feature_names = pickle.load(f)

other_df = pd.read_csv(OTHER_CSV)     # includes orig_index and numeric cols (+ target)
cleaned_df = pd.read_csv(CLEANED_CSV) # original non-encoded dataset

# Ensure orig_index exists in other_df; if not, create it as reading index
if "orig_index" not in other_df.columns:
    other_df["orig_index"] = other_df.index

# Build X: combine numeric columns from other_df (except orig_index & any non-feature text) with ohe_sparse
# We'll treat non-numeric text columns by converting to category codes if needed (safe fallback).
feature_cols = [c for c in other_df.columns if c not in ("orig_index", )]  # includes the target if present
# Option: remove target from features if present
TARGET = "rating" if "rating" in feature_cols else None
if TARGET:
    feature_cols.remove(TARGET)

# Prepare numeric block from other_df
X_other = other_df[feature_cols].copy()

# Convert non-numeric to category codes or numeric where possible
for col in X_other.columns:
    if not pd.api.types.is_numeric_dtype(X_other[col]):
        # try coercion
        coerced = pd.to_numeric(X_other[col], errors="coerce")
        if coerced.notna().sum() / len(coerced) > 0.5:
            X_other[col] = coerced.fillna(coerced.median())
        else:
            # convert to category codes (makes numeric)
            X_other[col] = X_other[col].astype("category").cat.codes
X_other = X_other.fillna(0)  # final fallback

# Convert to sparse (CSR) and horizontally stack with ohe_sparse
X_other_sparse = sp.csr_matrix(X_other.values) if X_other.shape[1] > 0 else None
if X_other_sparse is not None:
    X = sp.hstack([X_other_sparse, ohe_sparse], format="csr")
else:
    X = ohe_sparse.copy()

print("Built X (sparse) shape:", X.shape)
print("other_df shape:", other_df.shape)
print("cleaned_df shape:", cleaned_df.shape)


Built X (sparse) shape: (108369, 902)
other_df shape: (108369, 10)
cleaned_df shape: (108369, 11)


In [65]:
# Fit NearestNeighbors (cosine metric). Use algorithm='brute' which supports sparse + cosine.
# For faster approximate search on huge data, consider using annoy, faiss, or nmslib.
n_neighbors = 20   # max neighbors to precompute (query can request <= this)
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute", n_jobs=-1)
nn_model.fit(X)   # fits on sparse X

print("NearestNeighbors (cosine) fitted on X.")


NearestNeighbors (cosine) fitted on X.


In [66]:
# Helper: map orig_index -> row position in X / other_df
# other_df.orig_index should match sparse matrix row order (created that way earlier)
orig_to_pos = pd.Series(data=np.arange(len(other_df)), index=other_df["orig_index"].astype(int)).to_dict()

def recommend_by_index(orig_idx, k=10, include_scores=True):
    """
    Recommend top-k similar restaurants for the row with orig_index == orig_idx.
    Returns a DataFrame with columns: orig_index, pos (row in cleaned_df), score (cosine dist)
    """
    if int(orig_idx) not in orig_to_pos:
        raise KeyError(f"orig_index {orig_idx} not found in other_df")
    pos = orig_to_pos[int(orig_idx)]
    # kneighbors returns distances for cosine which are in [0,2] for cosine distance if metric='cosine'.
    distances, indices = nn_model.kneighbors(X[pos], n_neighbors=k+1)  # +1 since first is self
    distances = distances.flatten()
    indices = indices.flatten()
    # Remove self (pos) from results
    mask = indices != pos
    indices = indices[mask][:k]
    distances = distances[mask][:k]
    # Build result mapping back to orig_index and cleaned rows
    res = []
    for idx_pos, dist in zip(indices, distances):
        orig = int(other_df["orig_index"].iat[idx_pos])
        # similarity = 1 - dist if you want similarity instead of distance
        sim = 1 - dist
        row = cleaned_df.iloc[idx_pos] if idx_pos < len(cleaned_df) else None
        res.append({"orig_index": orig, "pos": int(idx_pos), "distance": float(dist), "similarity": float(sim)})
    df_res = pd.DataFrame(res)
    if include_scores:
        return df_res
    else:
        return df_res.drop(columns=["distance","similarity"])

def recommend_by_name(name, k=10, match_mode="exact"):
    """
    Find rows in cleaned_df with given name and return recommendations.
    match_mode: 'exact' or 'contains' (case-insensitive)
    """
    if match_mode == "exact":
        matches = cleaned_df[cleaned_df["name"] == name]
    else:
        matches = cleaned_df[cleaned_df["name"].str.contains(name, case=False, na=False)]
    if matches.empty:
        raise KeyError(f"No matches for name='{name}' (mode={match_mode})")
    # use first match by default (could loop over multiple)
    match_pos = matches.index[0]
    # match_pos should correspond to orig_index if cleaned_df index aligns; otherwise map orig_index value
    # If cleaned_df was read fresh, its reading index likely equals orig_index values; but we prefer to find orig_index column in other_df
    orig_idx = int(match_pos)
    # Use recommend_by_index
    return recommend_by_index(orig_idx, k=k)


In [75]:
from IPython.display import display

# Example by orig_index
example_idx = 123  # change to any valid orig_index present in other_df
try:
    recs = recommend_by_index(example_idx, k=10)
    print("Recommendations for orig_index", example_idx)
    display(recs)
    # show names from cleaned_df
    display(cleaned_df.loc[recs["pos"], ["name","city","cuisine"]].reset_index(drop=True))
except Exception as e:
    print("Error:", e)

# Example by name
try:
    name_query = "Pizza Hut"   # change to desired restaurant name (exact or set match_mode="contains")
    recs_by_name = recommend_by_name(name_query, k=10, match_mode="contains")
    print("Recommendations for name contains:", name_query)
    display(recs_by_name)
    display(cleaned_df.loc[recs_by_name["pos"], ["name","city","cuisine"]].reset_index(drop=True))
except Exception as e:
    print("Error:", e)


Recommendations for orig_index 123


Unnamed: 0,orig_index,pos,distance,similarity
0,108363,108363,0.0,1.0
1,108362,108362,0.0,1.0
2,108361,108361,0.0,1.0
3,108360,108360,0.0,1.0
4,108359,108359,0.0,1.0
5,108358,108358,0.0,1.0
6,11,11,0.0,1.0
7,10,10,0.0,1.0
8,9,9,0.0,1.0
9,8,8,0.0,1.0


Unnamed: 0,name,city,cuisine
0,Ranade Bandhu,Yavatmal,Fast Food
1,Ranade Bandhu,Yavatmal,Sweets
2,Jain Varities & Icecream Corner,Yavatmal,Fast Food
3,Jain Varities & Icecream Corner,Yavatmal,Snacks
4,Beyond Temtation,Yavatmal,Beverages
5,Beyond Temtation,Yavatmal,Fast Food
6,Bharawan Da Dhaba,Abohar,Indian
7,Sethi Milk Badam,Abohar,Desserts
8,Sethi Milk Badam,Abohar,Sweets
9,Hinglaj Kachori Bhandhar,Abohar,Chaat


Recommendations for name contains: Pizza Hut


Unnamed: 0,orig_index,pos,distance,similarity
0,108367,108367,0.0,1.0
1,108366,108366,0.0,1.0
2,108365,108365,0.0,1.0
3,108364,108364,0.0,1.0
4,108363,108363,0.0,1.0
5,108362,108362,0.0,1.0
6,108361,108361,0.0,1.0
7,108360,108360,0.0,1.0
8,108359,108359,0.0,1.0
9,108358,108358,0.0,1.0


Unnamed: 0,name,city,cuisine
0,Suraj Hotel,Yavatmal,Fast Food
1,Suraj Hotel,Yavatmal,North Indian
2,Satkar Dinning Hall,Yavatmal,North Indian
3,Satkar Dinning Hall,Yavatmal,Maharashtrian
4,Ranade Bandhu,Yavatmal,Fast Food
5,Ranade Bandhu,Yavatmal,Sweets
6,Jain Varities & Icecream Corner,Yavatmal,Fast Food
7,Jain Varities & Icecream Corner,Yavatmal,Snacks
8,Beyond Temtation,Yavatmal,Beverages
9,Beyond Temtation,Yavatmal,Fast Food


In [76]:
def assign_clusters(n_clusters=50, n_components=50, random_state=42):
    """
    Reduce dimensionality with TruncatedSVD to n_components, then cluster with MiniBatchKMeans.
    Returns cluster labels (len == n_rows) and stores reduced components & kmeans object.
    """
    print("Running TruncatedSVD (n_components=%d) on sparse X..." % n_components)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    X_reduced = svd.fit_transform(X)   # result is dense but n_rows x n_components (ok if n_components small)
    print("SVD done. Reduced shape:", X_reduced.shape)

    print("Fitting MiniBatchKMeans (n_clusters=%d)..." % n_clusters)
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state, batch_size=4096)
    labels = kmeans.fit_predict(X_reduced)
    print("KMeans done. Labels shape:", labels.shape)

    # Build DataFrame mapping orig_index -> label and some cluster stats
    mapping = pd.DataFrame({
        "orig_index": other_df["orig_index"].astype(int),
        "pos": np.arange(len(other_df)),
        "cluster": labels
    })
    # Add sample top members per cluster
    sample_per_cluster = mapping.groupby("cluster").head(5).reset_index(drop=True)

    return svd, kmeans, labels, mapping, sample_per_cluster

# Run clustering (tune n_clusters / n_components as needed)
svd, kmeans, labels, cluster_mapping, cluster_samples = assign_clusters(n_clusters=50, n_components=50)

# Save mapping and examples
cluster_mapping.to_csv("cluster_mapping.csv", index=False)
cluster_samples.to_csv("cluster_samples.csv", index=False)
print("Saved cluster_mapping.csv and cluster_samples.csv")


Running TruncatedSVD (n_components=50) on sparse X...
SVD done. Reduced shape: (108369, 50)
Fitting MiniBatchKMeans (n_clusters=50)...
KMeans done. Labels shape: (108369,)
Saved cluster_mapping.csv and cluster_samples.csv


In [77]:
cluster_id = 10  # example cluster id
members = cluster_mapping[cluster_mapping["cluster"] == cluster_id].sort_values("pos")
print("Cluster", cluster_id, "has", len(members), "members (showing first 20):")
display(members.head(20))

# Map to cleaned_df rows and show names/cities
display(cleaned_df.loc[members["pos"].iloc[:20], ["name", "city", "cuisine"]].reset_index(drop=True))


Cluster 10 has 3186 members (showing first 20):


Unnamed: 0,orig_index,pos,cluster
27,27,27,10
28,28,28,10
799,799,799,10
800,800,800,10
1877,1877,1877,10
1878,1878,1878,10
23566,23566,23566,10
43708,43708,43708,10
43709,43709,43709,10
48110,48110,48110,10


Unnamed: 0,name,city,cuisine
0,China Kitchen Fast Food,Adilabad,Chinese
1,China Kitchen Fast Food,Adilabad,Fast Food
2,Mehfil Biryani by Zakir Khan,Agra,Biryani
3,Mehfil Biryani by Zakir Khan,Agra,Mughlai
4,Food Nest,"Bopal,Ahmedabad",North Indian
5,Food Nest,"Bopal,Ahmedabad",South Indian
6,Arooma Curries and Biryani Point,Bodhan-rural,Biryani
7,Sandwedges,"Indirapuram,Delhi",Fast Food
8,Sandwedges,"Indirapuram,Delhi",Italian
9,Scoops,Gadwal,Beverages
