In [None]:
####### INSTALATION #######

!pip uninstall torch -y
!pip install torch==1.13.1
# !pip uninstall torch-scatter -y
# !pip uninstall torch-sparse -y
# !pip uninstall pyg-lib -y
# !pip uninstall git+https://github.com/pyg-team/pytorch_geometric.git -y
# !pip uninstall sentence_transformers -y

import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
!pip install pandas
!pip install matplotlib
!pip install pyarrow fastparquet
!pip install transformers
!pip install lightfm
!pip install memory-profiler
!pip install imbalanced-learn
!pip install xgboost
!pip install gensim nltk
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# !pip install sentence_transformers==0.1.0

In [1]:
###### IMPORT #######
import numpy as np
import time
import pandas as pd
import random
import copy
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from collections import defaultdict
import json
import re
import multiprocessing
import os
import pickle
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.data import Dataset

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel
from memory_profiler import profile
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch_geometric.data import download_url, extract_zip
from torch import Tensor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
########## SETUP ARGS ###########
possible_experiments = {
    0: 'full',
    1: 'effectiveness_emb_size',
    2: 'effectiveness_emb_epoch',
    3: 'ablation',
    4: 'TFIDF',
    5: 'SBERT',
    6: 'word2vec',
    7: 'scalability',
}
experiment = possible_experiments[5]

possible_modes = ['debug', 'experiment']
mode = possible_modes[0]

model_variants = ['random', 'ml', 'gnn']
len_interactions_to_consider = 50000
# model_variant_eval = model_variants[1]

dataset_mode = 'review'
embedding_size = 100



In [None]:
########### NEURAL CLASSIFIER ON SMALL LABELED DATASET #############
df = pd.read_csv('dataset/Labelled Yelp Dataset.csv')

df_label_1 = df[df['Label'] == 1]
df_label_minus_1 = df[df['Label'] == -1]
num_samples = min(len(df_label_1), len(df_label_minus_1))
df_label_1_sampled = df_label_1.sample(n=num_samples, random_state=42)
df_label_minus_1_sampled = df_label_minus_1.sample(n=num_samples, random_state=42)
balanced_df = pd.concat([df_label_1_sampled, df_label_minus_1_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df['Label'] = balanced_df['Label'].apply(lambda x: 0 if x == -1 else 1)

def prepare_dataset(df, tfidf_vectorizer):
    # 1. Label encoding for products
    product_encoder = LabelEncoder()
    df['encoded_product'] = product_encoder.fit_transform(df['Product_id'])

    review_embeddings = tfidf_vectorizer.fit_transform(df['Review']).toarray()

    # Convert to torch tensors and move to appropriate device
    product_ids = torch.tensor(df['encoded_product'].values, dtype=torch.long)
    review_texts = torch.tensor(review_embeddings, dtype=torch.float)
    labels = torch.tensor(df['Label'].values, dtype=torch.long)

    # Create a DataLoader for batching
    dataset = TensorDataset(product_ids, review_texts, labels)
    dataloader = DataLoader(dataset, batch_size=2048, shuffle=True)
    return dataloader, product_encoder, review_embeddings

temp_df = balanced_df[:-1]
tfidf_vectorizer = TfidfVectorizer(max_features=100)
df_train, df_test = train_test_split(temp_df, test_size=0.1, random_state=42)
train_dataloader, train_product_encoder, train_review_embeddings = prepare_dataset(df_train, tfidf_vectorizer)
test_dataloader, _, _ = prepare_dataset(df_test, tfidf_vectorizer)

class SimpleClassifier(nn.Module):
    def __init__(self, num_products, review_embedding_dim):
        super(SimpleClassifier, self).__init__()
        
        embedding_dim = 32  # arbitrary size, adjust as needed
        
        # User and Product Embeddings
        # self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.product_embedding = nn.Embedding(num_products, embedding_dim)
        
        # Linear layers
        self.fc1 = nn.Linear(embedding_dim + review_embedding_dim, 128)  # combining user, product and text embeddings
        self.fc2 = nn.Linear(128, 2)  # output 2 values for binary classification
        
    def forward(self, product_ids, review_embeddings):
        # user_embed = self.user_embedding(user_ids)
        product_embed = self.product_embedding(product_ids)
        
        # Concatenate the embeddings
        combined = torch.cat([product_embed, review_embeddings], dim=1)
        
        # Pass through the linear layers
        x = F.relu(self.fc1(combined))
        x = self.fc2(x)
        
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleClassifier(len(train_product_encoder.classes_), train_review_embeddings.shape[1])
model = model.to(device)  # Move the model to GPU if available
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# def compute_average_loss(model, train_dataloader, criterion):
#     total_loss = 0.0
#     total_samples = 0

#     with torch.no_grad():  # Ensure no gradients are computed
#         for batch_idx, (products, reviews, labels) in enumerate(train_dataloader):
#             outputs = model(products, reviews)
#             loss = criterion(outputs, labels)
#             total_loss += loss.item() * labels.size(0)
#             total_samples += labels.size(0)

#     return total_loss / total_samples

# # Compute and print the loss before training
# initial_loss = compute_average_loss(model, train_dataloader, criterion)
# print(f"Initial Loss (Untrained): {initial_loss:.4f}")

def calculate_class_accuracies(outputs, labels):
    _, predicted = torch.max(outputs, 1)
    correct_class_1 = ((predicted == 1) & (labels == 1)).sum().item()
    correct_class_fake = ((predicted == 0) & (labels == 0)).sum().item()
    total_class_1 = (labels == 1).sum().item()
    total_class_minus_1 = (labels == 0).sum().item()

    acc_class_1 = correct_class_1 / total_class_1  if total_class_1 != 0 else 0
    acc_class_fake = correct_class_fake / total_class_minus_1 if total_class_minus_1 != 0 else 0

    return acc_class_1, acc_class_fake

def calculate_accuracy(outputs, labels):
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == labels).sum().item()
    return correct / labels.size(0)

num_epochs = 5
for epoch in range(num_epochs):
    for batch_idx, (products, reviews, labels) in enumerate(train_dataloader):
        products, reviews, labels = products.to(device), reviews.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(products, reviews)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    total_accuracy = 0
    total_accuracy_class_1 = 0
    total_accuracy_class_fake = 0
    with torch.no_grad():
        for val_products, val_reviews, val_labels in test_dataloader:
            val_products, val_reviews, val_labels = val_products.to(device), val_reviews.to(device), val_labels.to(device)
            val_outputs = model(val_products, val_reviews)
            acc_class_1, acc_class_fake = calculate_class_accuracies(val_outputs, val_labels)
            total_accuracy_class_1 += acc_class_1
            total_accuracy_class_fake += acc_class_fake
            # total_accuracy += calculate_accuracy(val_outputs, val_labels)


    avg_accuracy_class_1 = total_accuracy_class_1 / len(test_dataloader)
    avg_accuracy_class_fake = total_accuracy_class_fake / len(test_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}, \
        Validation Accuracy for Class Not Fake: {avg_accuracy_class_1*100:.2f}%, \
        Validation Accuracy for Class Fake: {avg_accuracy_class_fake*100:.2f}%")

    # avg_accuracy = total_accuracy / len(test_dataloader)
    # print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}, Validation Accuracy: {avg_accuracy*100:.2f}%")





In [None]:
######### LABEL NEW DATASET WITH NEURAL CLASSIFIER #########
def predict_labels(df, model, product_encoder, tfidf_vectorizer):
    model.eval()

    df['encoded_business'] = product_encoder.transform(df['business_id'])
    review_embeddings = tfidf_vectorizer.transform(df['text']).toarray()

    product_ids = torch.tensor(df['encoded_business'].values, dtype=torch.long)
    review_texts = torch.tensor(review_embeddings, dtype=torch.float)

    dataset = TensorDataset(product_ids, review_texts)
    dataloader = DataLoader(dataset, batch_size=2048)

    predicted_labels = []

    with torch.no_grad():
        for products, reviews in dataloader:
            products, reviews = products.to(device), reviews.to(device)
            
            outputs = model(products, reviews)
            _, preds = torch.max(outputs, 1)
            predicted_labels.extend(preds.cpu().numpy())

    df['Predicted_Label'] = predicted_labels

    return df

tfidf_vectorizer = TfidfVectorizer(max_features=100)
unlabeled_df = pd.read_parquet('dataset/yelp_dataset_ver2_small.parquet')
def encode_business_id(unlabeled_df):
    encoder = LabelEncoder()
    unlabeled_df['business_id'] = encoder.fit_transform(unlabeled_df['business_id'])
    return unlabeled_df

unlabeled_df = encode_business_id(unlabeled_df)
updated_df = predict_labels(unlabeled_df, model, train_product_encoder, tfidf_vectorizer)


In [None]:
############# ARCHIVED CREATE DATASET FOR GNN CLASSIFIER #############
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data

class CustomDataset:
     
    def __len__(self):
        # Assuming one graph for the entire dataset. 
        # If you have multiple graphs, you should return the number of graphs here.
        return 1

    def __getitem__(self, idx):
        # For this example, I'm assuming you have one big graph for the entire dataset.
        # So, regardless of the index, we return the same graph.
        # This method needs to be adapted if you have multiple graphs.
        data = Data(
            x=self.x, 
            edge_index=torch.tensor(self.edge_index).t().contiguous(), 
            edge_attr=torch.stack(self.edge_attr),
            y=torch.tensor(self.y, dtype=torch.float32)  # Add the edge labels here
        )
        return data

    def __init__(self, df):
        self.df = df
        self.vectorizer = TfidfVectorizer(max_features=100) # Example to limit features to 100, adjust accordingly
        self.vectorizer.fit(df['text'])
        self.edge_index, self.edge_attr, self.y = self.dataframe_to_graph()
        self.x = self.encode_node_features()
        self.y = []

    def dataframe_to_graph(self):
        """
        Convert the DataFrame to a graph structure using user_id, friends, business_id, and stars.
        """
        edge_index = []
        edge_attr = []
        labels = []

        # Creating Review edges
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc='processing review edges'):
            user_id = row['user_id']
            business_id = row['business_id'] + max(self.df['user_id']) # Ensuring unique node IDs by offsetting business IDs
            edge_index.append((user_id, business_id))
            
            # Extract TFIDF embedding for the text review
            tfidf_vector = self.get_tfidf_embedding(row['text']) # A method to be implemented
            edge_attr.append(torch.tensor([row['stars']] + tfidf_vector))
            labels.append(row['label'])

        # Creating Friendship edges
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc='processing social edges'):
            user_id = row['user_id']
            for friend_id in row['friends']:
                edge_index.append((user_id, friend_id))
                edge_attr.append(torch.tensor([0] + [0]*len(tfidf_vector))) # No specific feature for friendship edge
                labels.append(1)

        # Convert edge_index to a torch tensor of shape [2, num_edges]
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        self.y = torch.tensor(labels, dtype=torch.float32)

        return edge_index, edge_attr

    def get_tfidf_embedding(self, text):
        """
        Convert a given text to its TFIDF embedding.
        """
        # Assuming a global vectorizer, or you can initialize and fit one here.
        vectorized = self.vectorizer.transform([text])
        return vectorized.toarray().flatten().tolist()

    def encode_node_features(self):
        """
        Since nodes don't have specific features other than their IDs,
        a simple one-hot encoding (or an embedding lookup) can be used.
        However, for simplicity, we will use node IDs as features here.
        """
        # num_nodes = max(self.df['user_id']) + max(self.df['business_id'])
        num_nodes = max(self.df['user_id']) + 1 + max(self.df['business_id'])
        x = torch.arange(num_nodes).float().unsqueeze(-1)
        return x
    
    def one_hot_encode_star(self, star, max_value=5):
        """
        One-hot encode the star rating.
        """
        one_hot = [0] * (max_value + 1) # +1 because rating starts from 0
        one_hot[star] = 1
        return one_hot

import torch.optim as optim
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)


In [None]:
############# ARCHIVED TRAIN GNN CLASSIFIER V1 #############
import torch.optim as optim 

# 2. Define the GNN Model (from the earlier provided code)
class GraphSAGEClassifier(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super(GraphSAGEClassifier, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# 3. Define loss and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGEClassifier(1, 128, 2).to(device)  # Assuming binary classification (labels 0 and 1)
optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = torch.nn.CrossEntropyLoss()

# 4. Train the model
def train(dataset, model):
    model.train()
    optimizer.zero_grad()
    out = model(dataset.x.to(device), dataset.edge_index.to(device))
    
    # Assuming that you've stored labels for the nodes involved in the reviews.
    mask = [i for i, row in dataset.df.iterrows()]  # Only consider nodes involved in reviews.
    loss = criterion(out[mask], torch.tensor(dataset.df['label'].values).to(device))
    loss.backward()
    optimizer.step()
    return loss.item()

# 5. Evaluate the model
def evaluate(dataset, model):
    model.eval()
    with torch.no_grad():
        out = model(dataset.x.to(device), dataset.edge_index.to(device))
        mask = [i for i, row in dataset.df.iterrows()]
        pred = out[mask].max(1)[1]
        correct = pred.eq(torch.tensor(dataset.df['label'].values).to(device)).sum().item()
        return correct / len(mask)

# Training loop
epochs = 20
best_acc = 0
best_loss = 10000
gnn_model = ''

for epoch in range(epochs):
    loss = train(train_dataset, model)
    train_acc = evaluate(train_dataset, model)
    val_acc = evaluate(val_dataset, model)
    if val_acc > best_acc: gnn_model = model
    if val_acc == best_acc:
        if loss < best_loss: gnn_model = model
    print(f"Epoch: {epoch+1}/{epochs}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")



In [None]:
############# ARCHIVED TRAIN GNN CLASSIFIER V2 #############
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import DataLoader

class EdgeClassifier(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, edge_feature_size):
        super(EdgeClassifier, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        
        # Classifier that operates on the concatenation of both node embeddings + edge features
        self.classifier = torch.nn.Linear(2 * hidden_channels + edge_feature_size, 1)

    def forward(self, x, edge_index, edge_attr):
        x1 = self.conv1(x, edge_index)
        x1 = F.relu(x1)
        
        x2 = self.conv2(x1, edge_index)
        x2 = F.relu(x2)

        # Concatenating the embeddings of source and target nodes along with edge attributes
        src, tgt = edge_index
        edge_embeddings = torch.cat([x2[src], x2[tgt], edge_attr], dim=1)
        return torch.sigmoid(self.classifier(edge_embeddings))

def train(model, loader, optimizer, criterion):
    model.train()
    
    total_loss = 0
    for data in loader:
        data.edge_index = data.edge_index.t().contiguous()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.edge_attr)
        loss = criterion(out.view(-1), data.y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Model, loss function, and optimizer
model = EdgeClassifier(in_channels=1, hidden_channels=64, edge_feature_size=len(train_dataset.edge_attr[0]))
criterion = torch.nn.BCELoss()  # Binary cross entropy for edge classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 50
for epoch in range(epochs):
    loss = train(model, loader, optimizer, criterion)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer(max_features=100)
train_tfidf = vectorizer.fit_transform(train_df['text']).toarray()

# Training the logistic regression model
lr = LogisticRegression()
lr.fit(train_tfidf, train_df['label'])

test_tfidf = vectorizer.transform(test_df['text']).toarray()
lr_preds = lr.predict(test_tfidf)

test_dataset = CustomDataset(test_df)
gnn_out = gnn_model(test_dataset.x.to(device), test_dataset.edge_index.to(device))
gnn_preds = gnn_out.max(1)[1].cpu().numpy()

lr_accuracy = accuracy_score(test_df['label'], lr_preds)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

gnn_accuracy = accuracy_score(test_df['label'], gnn_preds)
print(f"GNN Accuracy: {gnn_accuracy:.4f}")

In [5]:
######### START BELOW ###########

(49818, 768)

In [6]:
#### DATA LOADER ####
def data_loader(ratings_df):
    unique_user_id = ratings_df['userId'].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })

    unique_item_id = ratings_df['itemId'].unique()
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })

    ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                                left_on='userId', right_on='userId', how='left')
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = pd.merge(ratings_df['itemId'], unique_item_id,
                                left_on='itemId', right_on='itemId', how='left')
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    edge_index_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)

    return unique_user_id, unique_item_id, edge_index_user_to_item

def review_loader():
    df = pd.read_parquet('dataset/labeled_yelp_dataset_ver2_small.parquet')
    def encode_id(df):
        business_encoder = LabelEncoder()
        df['business_id'] = business_encoder.fit_transform(df['business_id'])
        user_encoder = LabelEncoder()
        mapping = dict(zip(df['user_id'] ,user_encoder.fit_transform(df['user_id'])))
        df['user_id'] = user_encoder.fit_transform(df['user_id'])
        def map_friends(friends_str, mapping_dict):
            return [mapping_dict[friend] for friend in friends_str.split(',') if friend in mapping_dict]

        df['friends'] = df['friends'].apply(map_friends, mapping_dict=mapping)
        return df

    df = encode_id(df)
    df = df[['user_id', 'friends', 'business_id', 'stars', 'text', 'label', 'name']]
    df = df.rename(columns={
        'user_id': 'user',
        'business_id': 'item',
        'stars': 'rating',
        'text': 'review_text',
        'name': 'business_name',
    })
    df = df.drop_duplicates(subset=['user', 'item'], keep='first')

    items_ratings_df = df[:len_interactions_to_consider] if mode == 'debug' else df #$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

    # ########## SPARCITY EXPERIMENT ###########
    # def calculate_sparcity_value(df):
    #     num_users = df['user'].nunique()
    #     num_items = df['item'].nunique()
    #     num_interactions = len(df)
    #     total_possible_interactions = num_users * num_items / 100
    #     sparsity = 1 - (num_interactions / total_possible_interactions)
    #     return sparsity
    # def filter_interactions(df, column, k):
    #     valid_entries = df[column].value_counts()
    #     valid_entries = valid_entries[valid_entries >= k]
    #     df = df[df[column].isin(valid_entries.index)]
    #     print(f'{column} sparcity value is:', calculate_sparcity_value(df))
    #     return df
    # if experiment == 'usparsity':
    #     u = 1
    #     items_ratings_df = filter_interactions(items_ratings_df, 'user', u)
    # elif experiment == 'isparsity':
    #     i = 20
    #     items_ratings_df = filter_interactions(items_ratings_df, 'item', i)

    items_df = {}
    items_df['business_name'] = items_ratings_df['business_name'].unique()
    items_df['itemId'], unique_names = pd.factorize(items_df['business_name'])
    # items_df['itemId'] = items_df['itemId'] + 1 #TODO test commenting this line didn't breal anything
    items_df = pd.DataFrame(items_df, columns=['itemId', 'business_name'])

    # We have edge feature
    def get_edge_feat_sbert(df):
        def preprocess_text(text):
            # TODO
            return text

        preprocessed_review_texts = [preprocess_text(review_text[:512]) for review_text in df['review_text']]
        tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
        device = torch.device("cpu") #"cuda" if torch.cuda.is_available() else "cpu") # NOT enough GPU memory
        model = model.to(device)
        inputs = tokenizer(preprocessed_review_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
        review_text_feat = embeddings
        # TODO: calculate the stars embedding, maybe concatenate to review_text_emb as final edge_feat
        edge_feat = None
        
        return edge_feat
    
    def get_edge_feat_tfidf(df):
        #TODO: maybe fit a TFIDF, return the top_keywords for each review_text, create the dummy vector of [len(reviews), unique_top_keywords]
        pass

    def get_edge_feat_word2vec(df):
        #TODO:
        sentences = df['review_text'].apply(lambda x: word_tokenize(x.lower()))

        model = Word2Vec(sentences=sentences, vector_size=embedding_size, window=5, min_count=1, workers=4)
        model.train(sentences, total_examples=len(sentences), epochs=10)

        # Average the word vectors for each sentence for our dataset
        def get_avg_vector(words, model, num_features):
            feature_vector = np.zeros((num_features,), dtype="float32")
            nwords = 0.
            vocabulary = set(model.wv.index_to_key)

            for word in words:
                if word in vocabulary: 
                    nwords = nwords + 1.
                    feature_vector = np.add(feature_vector, model.wv[word])
            
            if nwords:
                feature_vector = np.divide(feature_vector, nwords)
            return feature_vector

        edge_feat = np.array(sentences.apply(lambda x: get_avg_vector(x, model, embedding_size)).tolist())
        return edge_feat
    
    ########### SBERT EXPERIMENT ###########
    if experiment == 'TFIDF':
        #TODO
        edge_feat = None
        # edge_feat = get_edge_feat_tfidf(df)
        # edge_feat = np.load('tfidf_embeddings_full.npy')
        edge_feat = torch.from_numpy(edge_feat[:len(df)]).to(torch.float)
    elif experiment == 'word2vec':
        # edge_feat = get_edge_feat_word2vec(df)
        edge_feat = np.load('word2vec_embeddings_full.npy')
        edge_feat = torch.from_numpy(edge_feat[:len(df)]).to(torch.float)
    elif experiment == 'SBERT':
        # edge_feat = get_edge_feat_sbert(df)
        edge_feat = np.load('dataset/sbert_embeddings_full.npy') #np.load('sbert_embeddings_100k.npy')
        edge_feat = torch.from_numpy(edge_feat[:len(df)]).to(torch.float)
    print(edge_feat.shape)

    # print('edge feature tensor shape', edge_feat.shape)
    items_ratings_df = items_ratings_df.rename(columns={'user': 'userId', 'item': 'itemId'})
    unique_user_id, unique_item_id, edge_index_user_to_item = data_loader(items_ratings_df)
    print('number of unique users', len(unique_user_id))
    print('number of unique items', len(unique_item_id))
    return unique_user_id, unique_item_id, edge_index_user_to_item, items_df, edge_feat, items_ratings_df

loaders = {
    'review_loader': review_loader,
}
unique_user_id, unique_item_id, edge_index_user_to_item, item_df, edge_feat, review_ratings_df = loaders[f'{dataset_mode}_loader']()


torch.Size([49818, 768])
number of unique users 42936
number of unique items 8026


In [7]:
######### LINK CLASS PRED MODEL ##########
def train_test_generator(unique_user_id, unique_item_id, review_ratings_df, edge_feat, edge_index_user_to_item):  
    if edge_feat == None: edge_feat = torch.zeros((edge_index_user_to_item.shape[1], 768)) # 768 as the SBERT dim output
    user_features = torch.zeros((len(review_ratings_df['userId'].unique()), 100))
    item_features = torch.zeros((len(review_ratings_df['itemId'].unique()), 100))
    data = HeteroData()
    data["user"].x = user_features
    data["item"].x = item_features
    data["user", "review", "item"].edge_index = edge_index_user_to_item
    data["user", "review", "item"].edge_label = torch.tensor(review_ratings_df['label'], dtype=torch.float)
    data["user", "review", "item"].edge_attr = edge_feat
    data = T.ToUndirected()(data)
    del data['item', 'rev_review', 'user'].edge_label

    train_data, val_data, test_data = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        neg_sampling_ratio=0.0,
        edge_types=[('user', 'review', 'item')],
        rev_edge_types=[('item', 'rev_review', 'user')],
    )(data)

    # TODO: somehow for train_data, its edge_attr is splited, not sure how
    val_data_mask = torch.zeros(data["user", "review", "item"].edge_index.shape[1], dtype=torch.bool)
    test_data_mask = torch.zeros(data["user", "review", "item"].edge_index.shape[1], dtype=torch.bool)
    val_data_edges = set(tuple(edge) for edge in val_data["user", "review", "item"].edge_label_index.t().tolist())
    test_data_edges = set(tuple(edge) for edge in test_data["user", "review", "item"].edge_label_index.t().tolist())

    # Looping through all the edges
    for i, edge in tqdm(enumerate(data["user", "review", "item"].edge_index.t().tolist()), total=len(data["user", "review", "item"].edge_index.t().tolist()), desc='processing edge_attr splitting for train and test'):
        edge_tuple = tuple(edge)
        if edge_tuple in val_data_edges:
            val_data_mask[i] = True
        elif edge_tuple in test_data_edges:
            test_data_mask[i] = True

    # Assigning edge_attr based on the created masks
    val_data["user", "review", "item"].edge_attr = data["user", "review", "item"].edge_attr[val_data_mask]
    test_data["user", "review", "item"].edge_attr = data["user", "review", "item"].edge_attr[test_data_mask]

    return data, train_data, val_data, test_data

def GNN_recommender(data, train_data):
    print('2')

    class GNNEncoder(torch.nn.Module):
        def __init__(self, hidden_channels, out_channels):
            super().__init__()
            self.conv1 = SAGEConv((-1, -1), hidden_channels)
            self.conv2 = SAGEConv((-1, -1), out_channels)

        def forward(self, x, edge_index):
            x = self.conv1(x, edge_index).relu()
            x = self.conv2(x, edge_index)
            return x

    class EdgeDecoder(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.lin1 = torch.nn.Linear(2 * hidden_channels + train_data['user', 'review', 'item'].edge_attr.shape[1], hidden_channels)
            self.lin2 = torch.nn.Linear(hidden_channels, 1)

        def forward(self, z_dict, edge_label_index, edge_attr):
            row, col = edge_label_index
            z = torch.cat([z_dict['user'][row], z_dict['item'][col], edge_attr], dim=-1)

            z = self.lin1(z).relu()
            z = self.lin2(z)
            return z.view(-1)
        
    class Model(torch.nn.Module):
        def __init__(self, hidden_channels):
            super().__init__()
            self.encoder = GNNEncoder(hidden_channels, hidden_channels)
            self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
            self.decoder = EdgeDecoder(hidden_channels)

        def forward(self, x_dict, edge_index_dict, edge_attr, edge_label_index):
            z_dict = self.encoder(x_dict, edge_index_dict)
            return self.decoder(z_dict, edge_label_index, edge_attr)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Model(hidden_channels=32).to(device)
            
    # ########## TRAINING ##########
    import torch.nn.functional as F

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train():
        model.train()
        optimizer.zero_grad()
        pred = model(
            x_dict=train_data.x_dict,
            edge_index_dict=train_data.edge_index_dict,
            edge_attr=train_data['user', 'review', 'item'].edge_attr,  # Here's where you pass the edge attributes
            edge_label_index=train_data['user', 'item'].edge_label_index
        )
        target = train_data['user', 'item'].edge_label
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        return float(loss)

    @torch.no_grad()
    def test(data):
        data = data.to(device)
        model.eval()
        pred = model(data.x_dict, data.edge_index_dict,
                    data['user', 'review', 'item'].edge_attr,
                    data['user', 'item'].edge_label_index)
        pred = pred.clamp(min=0, max=1)
        target = data['user', 'item'].edge_label.float()
        rmse = F.mse_loss(pred, target).sqrt()
        return float(rmse)


    for epoch in range(1, 301):
        train_data = train_data.to(device)
        loss = train()
        train_rmse = test(train_data)
        val_rmse = test(val_data)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
            f'Val: {val_rmse:.4f}')

    return model

In [8]:
########## TRAIN TEST GENERAION ############

####### ITEM FEAT ABLATION EXPRIMENT ####### 
if experiment == 'ablation_edge_feat':
    edge_feat = torch.zeros_like(edge_feat)

# ####### SOCIAL EDGES ABLEATION EXPERIMENT #######
def add_social_edges(edge_index_user_to_item, unique_user_id, unique_item_id, items_ratings_df, item_feat):
    unique_item_id = unique_item_id.copy()
    # Define the filename where the data will be saved
    filename = 'dataset/saved_social_edges_100k.pkl'

    # Check if the file exists
    if os.path.exists(filename):
        # If it does, load the data and return it
        with open(filename, 'rb') as f:
            unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat = pickle.load(f)
        print('Data loaded from file')
    else:
        user_transactions_df = pd.read_parquet('dataset/user_transactions.parquet')
        contract_addresses = pd.read_parquet('dataset/contract_addresses.parquet')
        contract_set = set(contract_addresses['address'])

        # Shifting item_ids
        edge_index_user_to_item[1] = edge_index_user_to_item[1] + len(edge_index_user_to_item[0].unique())
        unique_item_id['mappedID'] = unique_item_id['mappedID'] + len(edge_index_user_to_item[0].unique())
        #Adding user_ids to item_ids since now users can be an item too #TODO if the GNN performance turned to be bad, just add 'to' user addresses to both item_feat and unique_item_ids
        unique_item_id_w_users = pd.concat([unique_user_id.rename(columns={'userId': 'entityId'}), unique_item_id.rename(columns={'itemId': 'entityId'})], axis=0)
        user_feat = torch.zeros((len(edge_index_user_to_item[0].unique()), item_feat.shape[1]))
        item_feat= torch.cat([user_feat, item_feat], dim=0) # Why don't we adding item_feat to user_feat?

        # unique_item_id_w_users['type'] = 'user' or 'item'

        users = items_ratings_df['userId'].unique()

        print('edge index shape before adding social edges:', edge_index_user_to_item.shape)
        count = 0
        #note there is a 200k constraint, delete it
        for i, interaction in tqdm(user_transactions_df.iterrows(), total=len(user_transactions_df)):
            if interaction['from'] not in contract_set and interaction['to'] not in contract_set and interaction['from'] in users and interaction['to'] in users:
                if interaction['from'] == interaction['to']: continue
                from_user_id = unique_item_id_w_users[unique_item_id_w_users['entityId'] == interaction['from']]['mappedID'].iloc[0]
                to_user_id = unique_item_id_w_users[unique_item_id_w_users['entityId'] == interaction['to']]['mappedID'].iloc[0]
                social_edge = torch.tensor([[from_user_id], 
                                            [to_user_id]], dtype=torch.int64)
                edge_index_user_to_item = torch.cat([edge_index_user_to_item, social_edge], dim=1)
                # count += 1
                # if count % 5 == 0: break
        print('edge index shape after adding social edges:', edge_index_user_to_item.shape)
        del user_transactions_df
        del contract_addresses
        del contract_set
        import gc
        gc.collect()

        #uncomment below
        with open(filename, 'wb') as f:
            pickle.dump((unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat), f)
        print('social edges saved to dataset/saved_social_edges_100k.pkl')
    
    return unique_user_id, unique_item_id_w_users, edge_index_user_to_item, item_feat

if experiment == 'add_social_edges':
    # uncomment below after debuging
    unique_user_id_w_social, unique_item_id_w_social, review_ratings_df_w_social, edge_index_user_to_item_w_social, edge_feat_w_social = add_social_edges(edge_index_user_to_item, unique_user_id, unique_item_id, review_ratings_df, edge_feat)
    data, train_data, val_data, test_data = train_test_generator(unique_user_id_w_social, unique_item_id_w_social, edge_feat_w_social, edge_index_user_to_item_w_social)
else:
    data, train_data, val_data, test_data = train_test_generator(unique_user_id, unique_item_id, review_ratings_df, edge_feat, edge_index_user_to_item)



processing edge_attr splitting for train and test: 100%|██████████| 49818/49818 [00:00<00:00, 319335.62it/s]


In [11]:
########## GNN TRAINING ############
#if model_mode == GNN run below
model_gnn = GNN_recommender(data, train_data)


2
Epoch: 001, Loss: 0.3167, Train: 0.7740, Val: 0.7723
Epoch: 002, Loss: 22.1876, Train: 0.7378, Val: 0.7493
Epoch: 003, Loss: 0.9276, Train: 0.6208, Val: 0.6225
Epoch: 004, Loss: 0.4333, Train: 0.5874, Val: 0.5922
Epoch: 005, Loss: 0.3590, Train: 0.5751, Val: 0.5812
Epoch: 006, Loss: 0.3335, Train: 0.5743, Val: 0.5785
Epoch: 007, Loss: 0.3305, Train: 0.5751, Val: 0.5781
Epoch: 008, Loss: 0.3310, Train: 0.5759, Val: 0.5783
Epoch: 009, Loss: 0.3318, Train: 0.5765, Val: 0.5785
Epoch: 010, Loss: 0.3324, Train: 0.5768, Val: 0.5787
Epoch: 011, Loss: 0.3327, Train: 0.5769, Val: 0.5788
Epoch: 012, Loss: 0.3329, Train: 0.5769, Val: 0.5787
Epoch: 013, Loss: 0.3328, Train: 0.5767, Val: 0.5785
Epoch: 014, Loss: 0.3326, Train: 0.5763, Val: 0.5781
Epoch: 015, Loss: 0.3322, Train: 0.5759, Val: 0.5777
Epoch: 016, Loss: 0.3316, Train: 0.5753, Val: 0.5771
Epoch: 017, Loss: 0.3310, Train: 0.5746, Val: 0.5764
Epoch: 018, Loss: 0.3302, Train: 0.5738, Val: 0.5756
Epoch: 019, Loss: 0.3293, Train: 0.5730, Va

In [12]:
######## GNN PRED FOR TEST_DATA ######### 
@torch.no_grad()
def pred_gnn_gen(device, model, data):
    data = data.to(device)
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                    data['user', 'review', 'item'].edge_attr,
                    data['user', 'item'].edge_label_index)
    # pred = pred.clamp(min=0, max=1)
    target = data['user', 'item'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return pred, target, float(rmse)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # To check the memory usage of GNN, put cpu to be comparable with MF models
pred_gnn, ground_truth_gnn, rmse_gnn = pred_gnn_gen(device, model_gnn, test_data)
pred_gnn = pred_gnn.cpu().numpy()

In [13]:
########### DATA PREPRATION FOR ML models #############
'''
For ML models, we need a df of train and test data, 
but from GNN train/test generation, we have a HeteroData
Here we turn a HeteroData to a DataFrame: will cover in this section
How to add review texts to each train and test?
'''

train_data_mask = torch.zeros(data["user", "review", "item"].edge_index.shape[1], dtype=torch.bool)
test_data_mask = torch.zeros(data["user", "review", "item"].edge_index.shape[1], dtype=torch.bool)
train_data_edges = set(tuple(edge) for edge in train_data["user", "review", "item"].edge_label_index.t().tolist())
test_data_edges = set(tuple(edge) for edge in test_data["user", "review", "item"].edge_label_index.t().tolist())

for i, edge in enumerate(data["user", "review", "item"].edge_index.t().tolist()):
    edge_tuple = tuple(edge)
    if edge_tuple in train_data_edges:
        train_data_mask[i] = True
    elif edge_tuple in test_data_edges:
        test_data_mask[i] = True

test_df_index = test_data['user', 'review', 'item'].edge_label_index.cpu().numpy()
test_df_label = test_data['user', 'review', 'item'].edge_label.cpu().numpy()

test_df_index = test_df_index.T 
test_df_ml = pd.DataFrame(test_df_index, columns=['user', 'item'])
test_df_ml['review_label'] = test_df_label
test_df_ml['review_text'] = review_ratings_df.loc[test_data_mask.numpy(), 'review_text'].values

train_df_index = train_data['user', 'review', 'item'].edge_label_index.cpu().numpy()
train_df_label = train_data['user', 'review', 'item'].edge_label.cpu().numpy()
train_df_index = train_df_index.T 
train_df_ml = pd.DataFrame(train_df_index, columns=['user', 'item'])
train_df_ml['review_label'] = train_df_label
train_df_ml['review_text'] = review_ratings_df.loc[train_data_mask.numpy(), 'review_text'].values

if experiment == 'word2vec':
    edge_feat_df = pd.DataFrame(edge_feat.cpu().numpy())
    test_df_ml['review_word2vec'] = edge_feat_df.loc[test_data_mask.numpy(), 'review_word2vec'].values
    train_df_ml['review_word2vec'] = edge_feat_df.loc[train_data_mask.numpy(), 'review_word2vec'].values

In [49]:
######### TRAIN & PRED ML MODELS #########
train_df_ml['review_text'] = train_df_ml['review_text'].apply(lambda x: x[:512])
test_df_ml['review_text'] = test_df_ml['review_text'].apply(lambda x: x[:512])

if experiment == 'TFIDF' or experiment == 'SBERT':
    vectorizer = TfidfVectorizer(max_features=embedding_size)
    X_train = vectorizer.fit_transform(train_df_ml['review_text'])
    X_test = vectorizer.transform(test_df_ml['review_text'])

if experiment == 'word2vec':
    #TODO
    pass

y_train = train_df_ml['review_label']
y_test = test_df_ml['review_label']

models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Trees": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(),
    # "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    # "Naive Bayes": MultinomialNB()
}

predictions = {}

for name, model in tqdm(models.items(), desc="Training models"):
    model.fit(X_train, y_train)
    predictions[f"pred_{name}"] = model.predict(X_test)

# TODO: add the GNN results to the predictions dict
predictions['pred_GNN'] = [1 if pred > 0.5 else 0 for pred in pred_gnn]

accuracies = {}
for name, pred in predictions.items():
    accuracies[name] = accuracy_score(y_test, pred)

print('combine accuracy', accuracies)   

accuracies_class_0 = {}

for name, pred in predictions.items():
    y_test_array = np.array(y_test)
    pred_array = np.array(pred)
    
    class_ = 1
    mask = y_test_array == class_
    accuracies_class_0[name] = accuracy_score(y_test_array[mask], pred_array[mask])

print(f'accuracy for class {class_}', accuracies_class_0)

Training models: 100%|██████████| 4/4 [21:43<00:00, 325.99s/it]

combine accuracy {'pred_Logistic Regression': 0.5908452118048585, 'pred_SVM': 0.590443686006826, 'pred_Decision Trees': 0.5312186307970287, 'pred_Random Forest': 0.5824131700461754, 'pred_GNN': 0.503111824934752}
accuracy for class 1 {'pred_Logistic Regression': 0.0, 'pred_SVM': 0.004416094210009814, 'pred_Decision Trees': 0.4210009813542689, 'pred_Random Forest': 0.047105004906771344, 'pred_GNN': 0.3621197252208047}





In [50]:
####### METRIC EVAL #######
eval_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score
}

results = {metric: {} for metric in eval_metrics}

for name, pred in predictions.items():
    for metric_name, metric_func in eval_metrics.items():
        results[metric_name][name] = metric_func(y_test, pred)

for metric_name, model_results in results.items():
    print(f"\n{metric_name}:")
    for model_name, value in model_results.items():
        print(f"{model_name}: {value:.4f}")


Accuracy:
pred_Logistic Regression: 0.5908
pred_SVM: 0.5904
pred_Decision Trees: 0.5312
pred_Random Forest: 0.5824
pred_GNN: 0.5031

Precision:
pred_Logistic Regression: 0.0000
pred_SVM: 0.4500
pred_Decision Trees: 0.4262
pred_Random Forest: 0.4103
pred_GNN: 0.3858

Recall:
pred_Logistic Regression: 0.0000
pred_SVM: 0.0044
pred_Decision Trees: 0.4210
pred_Random Forest: 0.0471
pred_GNN: 0.3621

F1 Score:
pred_Logistic Regression: 0.0000
pred_SVM: 0.0087
pred_Decision Trees: 0.4236
pred_Random Forest: 0.0845
pred_GNN: 0.3736


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
############# DIVERSITY EXPERIMENT ##############
