In [8]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import scipy.sparse as sp

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity  # optional


In [9]:
# Paths (adjust if necessary)
SPARSE_NPZ = Path("encoded_features.npz")
FEAT_NAMES_PKL = Path("encoded_feature_names.pkl")
OTHER_CSV = Path("encoded_other_columns.csv")
CLEANED_CSV = Path("cleaned_data.csv")

# sanity
for p in [SPARSE_NPZ, FEAT_NAMES_PKL, OTHER_CSV, CLEANED_CSV]:
    if not p.exists():
        raise FileNotFoundError(f"Missing required file: {p}")

# Load sparse OHE and other columns
ohe_sparse = sp.load_npz(SPARSE_NPZ)   # CSR matrix (n_rows x n_ohe_features)
with open(FEAT_NAMES_PKL, "rb") as f:
    ohe_feature_names = pickle.load(f)

other_df = pd.read_csv(OTHER_CSV)     # includes orig_index and numeric cols (+ target)
cleaned_df = pd.read_csv(CLEANED_CSV) # original non-encoded dataset

# Ensure orig_index exists in other_df; if not, create it as reading index
if "orig_index" not in other_df.columns:
    other_df["orig_index"] = other_df.index

# Build X: combine numeric columns from other_df (except orig_index & any non-feature text) with ohe_sparse
# We'll treat non-numeric text columns by converting to category codes if needed (safe fallback).
feature_cols = [c for c in other_df.columns if c not in ("orig_index", )]  # includes the target if present
# Option: remove target from features if present
TARGET = "rating" if "rating" in feature_cols else None
if TARGET:
    feature_cols.remove(TARGET)

# Prepare numeric block from other_df
X_other = other_df[feature_cols].copy()

# Convert non-numeric to category codes or numeric where possible
for col in X_other.columns:
    if not pd.api.types.is_numeric_dtype(X_other[col]):
        # try coercion
        coerced = pd.to_numeric(X_other[col], errors="coerce")
        if coerced.notna().sum() / len(coerced) > 0.5:
            X_other[col] = coerced.fillna(coerced.median())
        else:
            # convert to category codes (makes numeric)
            X_other[col] = X_other[col].astype("category").cat.codes
X_other = X_other.fillna(0)  # final fallback

# Convert to sparse (CSR) and horizontally stack with ohe_sparse
X_other_sparse = sp.csr_matrix(X_other.values) if X_other.shape[1] > 0 else None
if X_other_sparse is not None:
    X = sp.hstack([X_other_sparse, ohe_sparse], format="csr")
else:
    X = ohe_sparse.copy()

print("Built X (sparse) shape:", X.shape)
print("other_df shape:", other_df.shape)
print("cleaned_df shape:", cleaned_df.shape)


Built X (sparse) shape: (108369, 902)
other_df shape: (108369, 10)
cleaned_df shape: (108369, 11)


In [10]:
# Fit NearestNeighbors (cosine metric). Use algorithm='brute' which supports sparse + cosine.
# For faster approximate search on huge data, consider using annoy, faiss, or nmslib.
n_neighbors = 20   # max neighbors to precompute (query can request <= this)
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute", n_jobs=-1)
nn_model.fit(X)   # fits on sparse X

print("NearestNeighbors (cosine) fitted on X.")


NearestNeighbors (cosine) fitted on X.


In [11]:
# Helper: map orig_index -> row position in X / other_df
# other_df.orig_index should match sparse matrix row order (created that way earlier)
orig_to_pos = pd.Series(data=np.arange(len(other_df)), index=other_df["orig_index"].astype(int)).to_dict()

def recommend_by_index(orig_idx, k=10, include_scores=True):
    """
    Recommend top-k similar restaurants for the row with orig_index == orig_idx.
    Returns a DataFrame with columns: orig_index, pos (row in cleaned_df), score (cosine dist)
    """
    if int(orig_idx) not in orig_to_pos:
        raise KeyError(f"orig_index {orig_idx} not found in other_df")
    pos = orig_to_pos[int(orig_idx)]
    # kneighbors returns distances for cosine which are in [0,2] for cosine distance if metric='cosine'.
    distances, indices = nn_model.kneighbors(X[pos], n_neighbors=k+1)  # +1 since first is self
    distances = distances.flatten()
    indices = indices.flatten()
    # Remove self (pos) from results
    mask = indices != pos
    indices = indices[mask][:k]
    distances = distances[mask][:k]
    # Build result mapping back to orig_index and cleaned rows
    res = []
    for idx_pos, dist in zip(indices, distances):
        orig = int(other_df["orig_index"].iat[idx_pos])
        # similarity = 1 - dist if you want similarity instead of distance
        sim = 1 - dist
        row = cleaned_df.iloc[idx_pos] if idx_pos < len(cleaned_df) else None
        res.append({"orig_index": orig, "pos": int(idx_pos), "distance": float(dist), "similarity": float(sim)})
    df_res = pd.DataFrame(res)
    if include_scores:
        return df_res
    else:
        return df_res.drop(columns=["distance","similarity"])

def recommend_by_name(name, k=10, match_mode="exact"):
    """
    Find rows in cleaned_df with given name and return recommendations.
    match_mode: 'exact' or 'contains' (case-insensitive)
    """
    if match_mode == "exact":
        matches = cleaned_df[cleaned_df["name"] == name]
    else:
        matches = cleaned_df[cleaned_df["name"].str.contains(name, case=False, na=False)]
    if matches.empty:
        raise KeyError(f"No matches for name='{name}' (mode={match_mode})")
    # use first match by default (could loop over multiple)
    match_pos = matches.index[0]
    # match_pos should correspond to orig_index if cleaned_df index aligns; otherwise map orig_index value
    # If cleaned_df was read fresh, its reading index likely equals orig_index values; but we prefer to find orig_index column in other_df
    orig_idx = int(match_pos)
    # Use recommend_by_index
    return recommend_by_index(orig_idx, k=k)


In [12]:
# Example by orig_index
example_idx = 123  # change to any valid orig_index present in other_df
try:
    recs = recommend_by_index(example_idx, k=10)
    print("Recommendations for orig_index", example_idx)
    display(recs)
    # show names from cleaned_df
    display(cleaned_df.loc[recs["pos"], ["name","city","cuisine"]].reset_index(drop=True))
except Exception as e:
    print("Error:", e)

# Example by name
try:
    name_query = "Pizza Hut"   # change to desired restaurant name (exact or set match_mode="contains")
    recs_by_name = recommend_by_name(name_query, k=10, match_mode="contains")
    print("Recommendations for name contains:", name_query)
    display(recs_by_name)
    display(cleaned_df.loc[recs_by_name["pos"], ["name","city","cuisine"]].reset_index(drop=True))
except Exception as e:
    print("Error:", e)


Recommendations for orig_index 123


Unnamed: 0,orig_index,pos,distance,similarity
0,108363,108363,0.0,1.0
1,108362,108362,0.0,1.0
2,108361,108361,0.0,1.0
3,108360,108360,0.0,1.0
4,108359,108359,0.0,1.0
5,108358,108358,0.0,1.0
6,11,11,0.0,1.0
7,10,10,0.0,1.0
8,9,9,0.0,1.0
9,8,8,0.0,1.0


Unnamed: 0,name,city,cuisine
0,Ranade Bandhu,Yavatmal,Fast Food
1,Ranade Bandhu,Yavatmal,Sweets
2,Jain Varities & Icecream Corner,Yavatmal,Fast Food
3,Jain Varities & Icecream Corner,Yavatmal,Snacks
4,Beyond Temtation,Yavatmal,Beverages
5,Beyond Temtation,Yavatmal,Fast Food
6,Bharawan Da Dhaba,Abohar,Indian
7,Sethi Milk Badam,Abohar,Desserts
8,Sethi Milk Badam,Abohar,Sweets
9,Hinglaj Kachori Bhandhar,Abohar,Chaat


Recommendations for name contains: Pizza Hut


Unnamed: 0,orig_index,pos,distance,similarity
0,108367,108367,0.0,1.0
1,108366,108366,0.0,1.0
2,108365,108365,0.0,1.0
3,108364,108364,0.0,1.0
4,108363,108363,0.0,1.0
5,108362,108362,0.0,1.0
6,108361,108361,0.0,1.0
7,108360,108360,0.0,1.0
8,108359,108359,0.0,1.0
9,108358,108358,0.0,1.0


Unnamed: 0,name,city,cuisine
0,Suraj Hotel,Yavatmal,Fast Food
1,Suraj Hotel,Yavatmal,North Indian
2,Satkar Dinning Hall,Yavatmal,North Indian
3,Satkar Dinning Hall,Yavatmal,Maharashtrian
4,Ranade Bandhu,Yavatmal,Fast Food
5,Ranade Bandhu,Yavatmal,Sweets
6,Jain Varities & Icecream Corner,Yavatmal,Fast Food
7,Jain Varities & Icecream Corner,Yavatmal,Snacks
8,Beyond Temtation,Yavatmal,Beverages
9,Beyond Temtation,Yavatmal,Fast Food


In [13]:
def assign_clusters(n_clusters=50, n_components=50, random_state=42):
    """
    Reduce dimensionality with TruncatedSVD to n_components, then cluster with MiniBatchKMeans.
    Returns cluster labels (len == n_rows) and stores reduced components & kmeans object.
    """
    print("Running TruncatedSVD (n_components=%d) on sparse X..." % n_components)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    X_reduced = svd.fit_transform(X)   # result is dense but n_rows x n_components (ok if n_components small)
    print("SVD done. Reduced shape:", X_reduced.shape)

    print("Fitting MiniBatchKMeans (n_clusters=%d)..." % n_clusters)
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state, batch_size=4096)
    labels = kmeans.fit_predict(X_reduced)
    print("KMeans done. Labels shape:", labels.shape)

    # Build DataFrame mapping orig_index -> label and some cluster stats
    mapping = pd.DataFrame({
        "orig_index": other_df["orig_index"].astype(int),
        "pos": np.arange(len(other_df)),
        "cluster": labels
    })
    # Add sample top members per cluster
    sample_per_cluster = mapping.groupby("cluster").head(5).reset_index(drop=True)

    return svd, kmeans, labels, mapping, sample_per_cluster

# Run clustering (tune n_clusters / n_components as needed)
svd, kmeans, labels, cluster_mapping, cluster_samples = assign_clusters(n_clusters=50, n_components=50)

# Save mapping and examples
cluster_mapping.to_csv("cluster_mapping.csv", index=False)
cluster_samples.to_csv("cluster_samples.csv", index=False)
print("Saved cluster_mapping.csv and cluster_samples.csv")


Running TruncatedSVD (n_components=50) on sparse X...
SVD done. Reduced shape: (108369, 50)
Fitting MiniBatchKMeans (n_clusters=50)...
KMeans done. Labels shape: (108369,)
Saved cluster_mapping.csv and cluster_samples.csv


In [14]:
cluster_id = 10  # example cluster id
members = cluster_mapping[cluster_mapping["cluster"] == cluster_id].sort_values("pos")
print("Cluster", cluster_id, "has", len(members), "members (showing first 20):")
display(members.head(20))

# Map to cleaned_df rows and show names/cities
display(cleaned_df.loc[members["pos"].iloc[:20], ["name", "city", "cuisine"]].reset_index(drop=True))


Cluster 10 has 3186 members (showing first 20):


Unnamed: 0,orig_index,pos,cluster
27,27,27,10
28,28,28,10
799,799,799,10
800,800,800,10
1877,1877,1877,10
1878,1878,1878,10
23566,23566,23566,10
43708,43708,43708,10
43709,43709,43709,10
48110,48110,48110,10


Unnamed: 0,name,city,cuisine
0,China Kitchen Fast Food,Adilabad,Chinese
1,China Kitchen Fast Food,Adilabad,Fast Food
2,Mehfil Biryani by Zakir Khan,Agra,Biryani
3,Mehfil Biryani by Zakir Khan,Agra,Mughlai
4,Food Nest,"Bopal,Ahmedabad",North Indian
5,Food Nest,"Bopal,Ahmedabad",South Indian
6,Arooma Curries and Biryani Point,Bodhan-rural,Biryani
7,Sandwedges,"Indirapuram,Delhi",Fast Food
8,Sandwedges,"Indirapuram,Delhi",Italian
9,Scoops,Gadwal,Beverages
