In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import pickle
from itertools import combinations

# Load the merged dataset
data = pd.read_csv('Merged_Data_Part0.csv')  # Replace with your actual merged CSV file name

# Create a Bipartite Graph
def create_bipartite_graph(data):
    """
    Create a bipartite graph where nodes are users and businesses, and edges are reviews.
    Edges are weighted by review scores, with timestamps as edge attributes.
    """
    B = nx.Graph()

    # Add user and business nodes
    B.add_nodes_from(data['user_id'], bipartite=0)  # Layer 0: Users
    B.add_nodes_from(data['business_id'], bipartite=1)  # Layer 1: Businesses

    # Add edges (user-business interactions) with weights and attributes
    for _, row in data.iterrows():
        B.add_edge(
            row['user_id'], 
            row['business_id'], 
            weight=row['stars_x'],  # Weight by review stars
            date=row['date']       # Add review date as an attribute
        )
    
    return B

# Create bipartite graph
bipartite_graph = create_bipartite_graph(data)
print(f"Bipartite Graph: {bipartite_graph.number_of_nodes()} nodes, {bipartite_graph.number_of_edges()} edges")

# Project Graphs
def project_graphs(bipartite_graph, data):
    """
    Create user-user and business-business projected graphs based on shared interactions.
    Weights represent the strength of the connection (e.g., number of shared reviews).
    """
    # Create user-user projection
    user_nodes = {n for n, d in bipartite_graph.nodes(data=True) if d['bipartite'] == 0}
    user_graph = nx.Graph()
    
    for business in data['business_id'].unique():
        # Get users who reviewed the same business
        reviewers = data[data['business_id'] == business]['user_id'].unique()
        # Add edges between users who reviewed the same business
        for u1, u2 in combinations(reviewers, 2):
            if user_graph.has_edge(u1, u2):
                user_graph[u1][u2]['weight'] += 1
            else:
                user_graph.add_edge(u1, u2, weight=1)
    
    # Create business-business projection
    business_nodes = {n for n, d in bipartite_graph.nodes(data=True) if d['bipartite'] == 1}
    business_graph = nx.Graph()
    
    for user in data['user_id'].unique():
        # Get businesses reviewed by the same user
        businesses = data[data['user_id'] == user]['business_id'].unique()
        # Add edges between businesses reviewed by the same user
        for b1, b2 in combinations(businesses, 2):
            if business_graph.has_edge(b1, b2):
                business_graph[b1][b2]['weight'] += 1
            else:
                business_graph.add_edge(b1, b2, weight=1)
    
    return user_graph, business_graph

# Create projected graphs
user_graph, business_graph = project_graphs(bipartite_graph, data)

print(f"User-User Graph: {user_graph.number_of_nodes()} nodes, {user_graph.number_of_edges()} edges")
print(f"Business-Business Graph: {business_graph.number_of_nodes()} nodes, {business_graph.number_of_edges()} edges")

# Save graphs for future use
def save_graph_pickle(graph, filename):
    """Save a NetworkX graph to a file using pickle."""
    with open(filename, 'wb') as f:
        pickle.dump(graph, f)

save_graph_pickle(bipartite_graph, "bipartite_graph.pkl")
save_graph_pickle(user_graph, "user_graph.pkl")
save_graph_pickle(business_graph, "business_graph.pkl")

Bipartite Graph: 89302 nodes, 99197 edges
User-User Graph: 77604 nodes, 3437368 edges
Business-Business Graph: 6531 nodes, 42364 edges


In [11]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import LabelEncoder

# Add prefixes to IDs to ensure uniqueness
data['user_id_prefixed'] = 'u_' + data['user_id'].astype(str)
data['business_id_prefixed'] = 'b_' + data['business_id'].astype(str)

# Encode nodes as integers for PyTorch Geometric
user_nodes = data['user_id_prefixed'].unique()
business_nodes = data['business_id_prefixed'].unique()

node_encoder = LabelEncoder()
all_nodes = np.concatenate([user_nodes, business_nodes])
node_encoder.fit(all_nodes)

data['user_encoded'] = node_encoder.transform(data['user_id_prefixed'])
data['business_encoded'] = node_encoder.transform(data['business_id_prefixed'])

# Create PyTorch Geometric graph
edge_index = torch.tensor(data[['user_encoded', 'business_encoded']].values.T, dtype=torch.long)
edge_attr = torch.tensor(data['stars_x'].values, dtype=torch.float)

# Define PyTorch Geometric Data object
graph_data = Data(edge_index=edge_index)

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Set parameters
num_nodes = len(all_nodes)
embedding_dim = 64
x = torch.eye(num_nodes)  # Initialize node features as identity matrix

model = GraphSAGE(num_nodes, 128, embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train GraphSAGE
model.train()
for epoch in range(55):
    optimizer.zero_grad()
    embeddings = model(x, edge_index)
    loss = torch.mean(embeddings.norm(dim=1))  # Add a dummy loss (replace as needed)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.44274765253067017
Epoch 2, Loss: 0.2764238119125366
Epoch 3, Loss: 0.1794947236776352
Epoch 4, Loss: 0.14455430209636688
Epoch 5, Loss: 0.08563492447137833
Epoch 6, Loss: 0.1112779974937439
Epoch 7, Loss: 0.10334470868110657
Epoch 8, Loss: 0.07464113086462021
Epoch 9, Loss: 0.06265848875045776
Epoch 10, Loss: 0.04954126477241516
Epoch 11, Loss: 0.045813336968421936
Epoch 12, Loss: 0.056700557470321655
Epoch 13, Loss: 0.05220134183764458
Epoch 14, Loss: 0.03867248818278313
Epoch 15, Loss: 0.03051736205816269
Epoch 16, Loss: 0.023138388991355896
Epoch 17, Loss: 0.03132222220301628
Epoch 18, Loss: 0.031078534200787544
Epoch 19, Loss: 0.02639014832675457
Epoch 20, Loss: 0.021570831537246704
Epoch 21, Loss: 0.015716014429926872
Epoch 22, Loss: 0.017590228468179703
Epoch 23, Loss: 0.017857061699032784
Epoch 24, Loss: 0.017856402322649956
Epoch 25, Loss: 0.013402334414422512
Epoch 26, Loss: 0.010719970799982548
Epoch 27, Loss: 0.012814458459615707
Epoch 28, Loss: 0.0130859063

In [12]:
# Save embeddings
embeddings = embeddings.detach().numpy()
embedding_df = pd.DataFrame(embeddings, index=node_encoder.inverse_transform(range(num_nodes)))
embedding_df.to_csv("graphsage_embeddings.csv", index_label="node")
print("GraphSAGE embeddings saved to graphsage_embeddings.csv")


GraphSAGE embeddings saved to graphsage_embeddings.csv
