# hgn_recommendation

## Heterogeneous Graph Construction

In [1]:
import pandas as pd
business_df = pd.read_parquet('../../Data/preprocessed_business.parquet', engine='pyarrow')
business_df.columns

Index(['business_id_index', 'business_id', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'category_list',
       'category_index', 'categories-cleaned', 'name-cleaned',
       'address-cleaned', 'category_embeddings', 'name_embeddings',
       'address_embeddings'],
      dtype='object')

In [2]:
import pandas as pd
import torch
import torch_geometric.transforms as T
import pickle
from torch_geometric.data import HeteroData
import numpy as np

review_df = pd.read_parquet('../../Data/preprocessed_review.parquet', engine='pyarrow')
user_df = pd.read_parquet('../../Data/preprocessed_user.parquet', engine='pyarrow')
business_df = pd.read_parquet('../../Data/preprocessed_business.parquet', engine='pyarrow')


In [3]:

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

category_embeddings=torch.from_numpy(np.stack(business_df['category_embeddings'].values)).float()

In [4]:
import pandas as pd
import torch
import torch_geometric.transforms as T
import pickle
from torch_geometric.data import HeteroData
version='alll'

review_df = pd.read_parquet('../../Data/preprocessed_review.parquet', engine='pyarrow')
user_df = pd.read_parquet('../../Data/preprocessed_user.parquet', engine='pyarrow')
business_df = pd.read_parquet('../../Data/preprocessed_business.parquet', engine='pyarrow')

# loading business_category_index
with open('../../Data/business_category_index.pickle', 'rb') as handle:
    business_category_index = pickle.load(handle)

### Nodes ###
# User features
user_features=torch.from_numpy(user_df[['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']].values).to(torch.float)  # [num_users, num_features_users]
print(user_features.shape)

# Business features
business_simple_features=torch.from_numpy(business_df[['stars', 'review_count', 'is_open', 'latitude', 'longitude']].values).to(torch.float)  # [num_businesses, num_features_businesses]
business_category_embeddings=torch.from_numpy(np.stack(business_df['category_embeddings'].values)).float()
business_name_embeddings=torch.from_numpy(np.stack(business_df['name_embeddings'].values)).float()
business_address_embeddings=torch.from_numpy(np.stack(business_df['address_embeddings'].values)).float()
business_features=torch.cat([business_simple_features,
                             business_category_embeddings,
                             business_name_embeddings,
                             business_address_embeddings,
                             ],
                             dim=1)

print(business_features.shape)

# Business Categories
# category_features = torch.eye(len(business_category_index))

### Edges ###
# Review edge index
review_edge_index=torch.stack([
    torch.tensor(review_df['user_id_index'].values),
    torch.tensor(review_df['business_id_index'].values)]
    , dim=0)
assert review_edge_index.shape == (2, len(review_df))
print('review_edge_index.shape:',review_edge_index.shape)

# Review edge label
rating=torch.from_numpy(review_df['stars'].values).to(torch.float)
print('rating.shape:',rating.shape)

# Friend edge index
user_friend_index = torch.tensor([
    (user, friend)                                      # 3) Tuple of users and friends
    for user, friend_list in user_df['friends'].items() # 1) Parent-loop, across each user in user_df
    for friend in friend_list                           # 2) Child-loop, across each friend of each user
], dtype=torch.long).t().contiguous()                   # 4) Convert to tensor and transpose to required shape
print('user_friend_index.shape:',user_friend_index.shape)

# Business category edge index
business_category_index = torch.tensor([
    (business, category)                                                 # 3) Tuple of businesses and their categories
    for business, category_list in business_df['category_index'].items() # 1) Parent-loop, across each business in business_df
    for category in category_list                                        # 2) Child-loop, across each category in category_list
], dtype=torch.long).t().contiguous()                                    # 4) Convert to tensor and transpose to required shape
print('business_category_index.shape:',business_category_index.shape)

data = HeteroData()
data['user'].x = user_features
data['business'].x = business_features
data['user','rates','business'].edge_index=review_edge_index
data['user','rates','business'].edge_label=rating
# data['user','friends','user'].edge_index=user_friend_index
# data['business','part_of','category'].edge_index=business_category_index

# Add the reverse edges in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

# With the above transformation we also got reversed labels for the edges.
# We are going to remove them:
del data['business', 'rev_rates', 'user'].edge_label

# Save the HeteroData object
torch.save(data, f'../../Data/hetero_data-{version}.pt')

print(data)

del review_df, user_df, business_df, user_features, business_features, review_edge_index, rating
del user_friend_index, business_category_index, data

torch.Size([227407, 6])
torch.Size([14576, 1157])
review_edge_index.shape: torch.Size([2, 686422])
rating.shape: torch.Size([686422])
user_friend_index.shape: torch.Size([2, 1345962])
business_category_index.shape: torch.Size([2, 63851])
HeteroData(
  user={ x=[227407, 6] },
  business={ x=[14576, 1157] },
  (user, rates, business)={
    edge_index=[2, 686422],
    edge_label=[686422],
  },
  (business, rev_rates, user)={ edge_index=[2, 686422] }
)


## Train-Test split

In [5]:
import torch
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData
import random

# Set seeds for reproducibility
torch.manual_seed(42)
random.seed(42)

data = torch.load('../../Data/hetero_data.pt', weights_only=False)
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'business')],
    rev_edge_types=[('business', 'rev_rates', 'user')],
)(data)
train_data, val_data
data

HeteroData(
  user={ x=[227407, 6] },
  business={ x=[14576, 5] },
  (user, rates, business)={
    edge_index=[2, 686422],
    edge_label=[686422],
  },
  (business, rev_rates, user)={ edge_index=[2, 686422] }
)

## Graph Neural Network

In [6]:
from torch_geometric.nn import SAGEConv, to_hetero

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['business'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model(hidden_channels=32).to(device)

print(model)

Model(
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__business): SAGEConv((-1, -1), 32, aggr=mean)
      (business__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__business): SAGEConv((-1, -1), 32, aggr=mean)
      (business__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(in_features=64, out_features=32, bias=True)
    (lin2): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [None]:
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
writer = SummaryWriter()

def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, 
                 train_data.edge_index_dict,
                 train_data['user', 'rates', 'business'].edge_label_index
                 )
    target = train_data['user', 'rates', 'business'].edge_label
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data):
    data = data.to(device)
    model.eval()
    pred = model(data.x_dict,
                 data.edge_index_dict,
                 data['user','rates', 'business'].edge_label_index
                 )
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'rates', 'business'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

best_val = float('inf')
for epoch in range(1, 3001):
    train_data = train_data.to(device)
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)

    writer.add_scalar("Loss", loss, epoch)
    writer.add_scalar("Train rmse", train_rmse, epoch)
    writer.add_scalar("Val rsme", val_rmse, epoch)

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}')

    if val_rmse < best_val:
        best_val = val_rmse
        best_state = model.state_dict()
        patience = 0
    else:
        patience += 1
        if patience >= 1000:
            print("Early stopping.")
            break

writer.close()
model.load_state_dict(best_state)
torch.save(model.state_dict(), f"../Saved_Models/hgn_model-{version}.pt")

Epoch: 001, Loss: 1.0996, Train: 0.7780, Val: 0.7771
Epoch: 002, Loss: 0.7348, Train: 0.6801, Val: 0.6974
Epoch: 003, Loss: 0.4626, Train: 0.5201, Val: 0.5591
Epoch: 004, Loss: 0.2705, Train: 0.4209, Val: 0.4601
Epoch: 005, Loss: 0.1772, Train: 0.3707, Val: 0.4023
Epoch: 006, Loss: 0.1376, Train: 0.3596, Val: 0.3805
Epoch: 007, Loss: 0.1296, Train: 0.3790, Val: 0.3873
Epoch: 008, Loss: 0.1439, Train: 0.4013, Val: 0.4009
Epoch: 009, Loss: 0.1612, Train: 0.4087, Val: 0.4057
Epoch: 010, Loss: 0.1672, Train: 0.3995, Val: 0.3992
Epoch: 011, Loss: 0.1597, Train: 0.3811, Val: 0.3872
Epoch: 012, Loss: 0.1452, Train: 0.3627, Val: 0.3774
Epoch: 013, Loss: 0.1316, Train: 0.3520, Val: 0.3761
Epoch: 014, Loss: 0.1241, Train: 0.3556, Val: 0.3870
Epoch: 015, Loss: 0.1269, Train: 0.3677, Val: 0.4023
Epoch: 016, Loss: 0.1360, Train: 0.3688, Val: 0.4059
Epoch: 017, Loss: 0.1369, Train: 0.3628, Val: 0.4012
Epoch: 018, Loss: 0.1325, Train: 0.3566, Val: 0.3943
Epoch: 019, Loss: 0.1280, Train: 0.3511, Val: 

In [85]:
@torch.no_grad()
def predict(data, edge_label_index=None):
    """
    Generate predicted ratings for the given user→business edges.

    Args:
        data: HeteroData (must have x_dict & edge_index_dict).
        edge_label_index: Tensor[2, num_edges] of (user,business) pairs.
                          If None, uses data['user','rates','business'].edge_label_index.

    Returns:
        Tensor of shape [num_edges] with ratings in [0,5], on CPU.
    """
    model.eval()
    data = data.to(device)
    if edge_label_index is None:
        edge_label_index = data['user','rates','business'].edge_label_index
    pred = model(data.x_dict, data.edge_index_dict, edge_label_index)
    pred = pred.clamp(min=0, max=5)
    return pred.cpu()

In [None]:
import numpy as np
import pandas as pd

# Haversine distance (returns km)
def haversine(lon1, lat1, lon2, lat2):
    R = 6371.0  # Earth radius in km
    φ1 = np.radians(lat1)
    φ2 = np.radians(lat2)
    Δφ = np.radians(lat2 - lat1)
    Δλ = np.radians(lon2 - lon1)
    a = np.sin(Δφ / 2.0)**2 + np.cos(φ1) * np.cos(φ2) * np.sin(Δλ / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def recommend(lat, lng, radius, hour=None, top_k=20):
    """
    Return the top_k businesses within `radius` km of (lat, lng), 
    sorted by average stars descending.

    Args:
        lat (float): latitude of the query point
        lng (float): longitude of the query point
        radius (float): radius in kilometers
        hour (int, optional): hour of day (unused in this simple version)
        top_k (int): how many results to return
    Returns:
        pandas.DataFrame of top_k businesses with distance column
    """
    # Load preprocessed businesses (if not already loaded)
    try:
        df = business_df.copy()
    except NameError:
        df = pd.read_parquet('../../Data/preprocessed_business.parquet', engine='pyarrow')
    
    # Compute distance
    df['distance_km'] = haversine(
        lng, lat,
        df['longitude'].values,
        df['latitude'].values
    )
    # Filter and sort
    df = df[df['distance_km'] <= radius]
    df = df.sort_values(['stars', 'distance_km'], ascending=[False, True])
    
    return df.head(top_k)

# Example usage:
top20 = recommend(37.77, -122.42, radius=5)
print(top20[['name','stars','review_count','distance_km']])


Empty DataFrame
Columns: [name, stars, review_count, distance_km]
Index: []


TESTING:

 - root MSE
 - k order ranking 
    - List of edges in test dataset
    - Compare to see if the order is correct:
        - how many are in the top 5
        - how many are in the correct order

- Screen shots
    - GUI
    - Results
    - 

- Looking at different models
    - optiums
    - looking at 
    
