# install & import

In [1]:
!pip install torch==2.2.1

Collecting torch==2.2.1
  Downloading torch-2.2.1-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m870.3 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.1)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0

In [2]:
!pip install faiss-gpu
#!pip install pyg-lib -f https://data.pyg.org/whl/torch-2.2.1+cu121.html
!pip install torch-geometric

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [3]:
import torch
from transformers import BertTokenizer, BertModel
from torch_geometric.data import HeteroData


# graph creation

In [None]:

def create_hetero_data(user_file, job_file, app_file):
    """Build a heterogeneous user/job graph from three CSV files.

    Args:
        user_file: CSV path. Must contain a ``userID`` column (used as the
            index), a ``text_emb`` column (the text that is fed to BERT) and a
            ``topic`` column.
        job_file: CSV path. Same layout as ``user_file`` but indexed by
            ``jobID``.
        app_file: CSV path with ``userID`` and ``jobID`` columns, one row per
            application.

    Returns:
        A ``HeteroData`` object, after ``ToUndirected`` has been applied, with
        ``user`` and ``job`` node stores (``x``, ``index``, ``topic``,
        ``num_nodes``), ``('user', 'applies', 'job')`` edges labelled 1, and
        ``similar_U`` / ``similar_J`` edges linking nodes that share a topic.

    Notes:
        * IDs and topics are converted with ``dtype=torch.long``, so they are
          assumed to be integers -- TODO confirm against the CSV files.
        * Rows whose ID repeats an earlier row are dropped so that features,
          topics and the ID mapping all describe the same set of nodes.
        * Progress messages are printed while the graph is built.
    """
    # Imported locally: the notebook's import cell does not provide these.
    import pandas as pd
    from torch_geometric.transforms import ToUndirected

    # Load BERT once and share it between the user and the job encoder.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)

    def get_bert_embeddings(texts, batch_size=256):
        """Return one mean-pooled BERT vector per text as a CPU tensor."""
        if isinstance(texts, str):
            texts = [texts]  # Ensure texts is a list
        elif isinstance(texts, pd.Series):
            # The tokenizer expects plain strings; missing values become ''.
            texts = texts.fillna('').astype(str).tolist()
        else:
            texts = list(texts)
        if not texts:
            return torch.empty((0, model.config.hidden_size))

        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            encoding = tokenizer.batch_encode_plus(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors='pt',
                add_special_tokens=True
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
            word_embeddings = outputs.last_hidden_state
            # Average over real tokens only. A plain mean over dim=1 would
            # include padding positions, making a text's embedding depend on
            # the other texts in its batch.
            mask = attention_mask.unsqueeze(-1).to(word_embeddings.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            sentence_embedding = (word_embeddings * mask).sum(dim=1) / token_counts
            # Keep results on the CPU, like every other tensor in the graph.
            embeddings.append(sentence_embedding.cpu())
            print(i)
        return torch.cat(embeddings, dim=0)

    def load_node_csv(path, index_col, encoders=None, **kwargs):
        """Read a node CSV and return (frame, feature matrix or None, ID -> row position)."""
        print("load node")
        df = pd.read_csv(path, index_col=index_col, **kwargs)
        # One row per node: keep the first occurrence of every ID.
        df = df[~df.index.duplicated(keep='first')]
        mapping = {index: i for i, index in enumerate(df.index)}
        x = None
        if encoders:
            xs = [encoder(df[col]) for col, encoder in encoders.items()]
            x = torch.cat(xs, dim=-1)
        return df, x, mapping

    def topic_clique_edges(df):
        """Return a [2, E] edge_index joining all rows that share a topic, in both directions."""
        edges = []
        # ``indices`` holds row *positions*, which are exactly the node
        # indices because ``mapping`` enumerates the rows in order.
        for positions in df.groupby('topic').indices.values():
            positions = positions.tolist()
            for offset, node_i in enumerate(positions):
                for node_j in positions[offset + 1:]:
                    edges.append([node_i, node_j])
                    edges.append([node_j, node_i])
        # reshape keeps the [2, 0] shape when there are no edges at all.
        return torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

    print("start")

    user_df, user_x, user_mapping = load_node_csv(user_file, index_col='userID', encoders={'text_emb': get_bert_embeddings})
    print("user mapping done")
    job_df, job_x, job_mapping = load_node_csv(job_file, index_col='jobID', encoders={'text_emb': get_bert_embeddings})
    print("job mapping done")

    data = HeteroData()

    # Add user nodes
    data['user'].num_nodes = len(user_mapping)
    data['user'].x = user_x
    data['user'].index = torch.tensor(list(user_mapping.keys()), dtype=torch.long)

    # Add job nodes
    data['job'].num_nodes = len(job_mapping)
    data['job'].x = job_x
    data['job'].index = torch.tensor(list(job_mapping.keys()), dtype=torch.long)

    # Add topic information
    data['user'].topic = torch.tensor(user_df['topic'].to_numpy(), dtype=torch.long)
    data['job'].topic = torch.tensor(job_df['topic'].to_numpy(), dtype=torch.long)

    print("app start")
    apps = pd.read_csv(app_file)

    # Add edges between user and job nodes. IDs missing from the node files
    # map to NaN and cannot become edges, so those rows are skipped.
    src = apps['userID'].map(user_mapping)
    dst = apps['jobID'].map(job_mapping)
    known = src.notna() & dst.notna()
    if not known.all():
        print(f"skipping {int((~known).sum())} applications with unknown userID/jobID")
    edge_index_user_job = torch.stack([
        torch.tensor(src[known].astype('int64').to_numpy(), dtype=torch.long),
        torch.tensor(dst[known].astype('int64').to_numpy(), dtype=torch.long),
    ])
    data['user', 'applies', 'job'].edge_index = edge_index_user_job

    # Every listed application is a positive example, so the labels are all 1.
    data['user', 'applies', 'job'].edge_label = torch.ones(edge_index_user_job.size(1), dtype=torch.long)

    del apps

    # Add edges between similar users
    data['user', 'similar_U', 'user'].edge_index = topic_clique_edges(user_df)
    del user_df

    # Add edges between similar jobs
    data['job', 'similar_J', 'job'].edge_index = topic_clique_edges(job_df)
    del job_df

    # Ensure the graph is undirected
    data = ToUndirected()(data)

    return data


# incremental graph-update functions (test)

## embed

In [4]:
# Tokenizer/model pairs keyed by device string, so the weights are loaded once
# per session instead of on every call.
_BERT_CACHE = {}


def get_bert_embedding(text):
    """Embed one text with ``bert-base-uncased`` by mean-pooling its token vectors.

    Args:
        text: The text to embed.

    Returns:
        A CPU tensor holding the mean of the last hidden states over the token
        dimension, with the leading batch dimension removed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cache_key = str(device)
    if cache_key not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased').to(device)
        _BERT_CACHE[cache_key] = (tokenizer, model)
    tokenizer, model = _BERT_CACHE[cache_key]

    encoding = tokenizer.encode_plus(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state
        sentence_embedding = word_embeddings.mean(dim=1)

    # Drop only the batch dimension (size 1 for a single text).
    return sentence_embedding.squeeze(0).cpu()

## create empty graph

In [5]:
def create_empty_graph(embedding_dim=768):
    """Create a ``HeteroData`` graph with the full schema but no nodes or edges.

    Args:
        embedding_dim: Width of the node feature matrices. The default of 768
            matches the value that was previously hard-coded.

    Returns:
        A ``HeteroData`` with empty ``user`` and ``job`` node stores
        (``num_nodes``, ``x``, ``topic``, ``index``) and empty ``applies``,
        ``similar_U``, ``similar_J`` and ``rev_applies`` edge stores; the two
        application edge types also carry an empty ``edge_label``.
    """
    data = HeteroData()

    # Initialize user and job nodes
    for node_type in ('user', 'job'):
        data[node_type].num_nodes = 0
        data[node_type].x = torch.empty((0, embedding_dim))
        data[node_type].topic = torch.empty((0,), dtype=torch.long)
        data[node_type].index = torch.empty((0,), dtype=torch.long)

    # Initialize edges and edge labels. The edge types are created in this
    # order on purpose: it determines the order reported by data.metadata().
    edge_types = [
        (('user', 'applies', 'job'), True),
        (('user', 'similar_U', 'user'), False),
        (('job', 'similar_J', 'job'), False),
        (('job', 'rev_applies', 'user'), True),
    ]
    for edge_type, has_label in edge_types:
        data[edge_type].edge_index = torch.empty((2, 0), dtype=torch.long)
        if has_label:
            data[edge_type].edge_label = torch.empty((0,), dtype=torch.long)

    return data

## add

In [6]:
def add_node_user(data, userID, text, topic):
    """Append a user node and connect it to every existing user with the same topic.

    Args:
        data: Graph to modify in place; its ``user`` store must have
            ``num_nodes``, ``topic`` and ``index``.
        userID: External ID stored in ``data['user'].index``.
        text: Text embedded with ``get_bert_embedding`` to form the feature row.
        topic: Topic label; users sharing it are linked through ``similar_U``
            edges in both directions.

    Raises:
        ValueError: If ``userID`` is already present. Duplicate IDs would make
            later single-node lookups ambiguous.
    """
    users = data['user']
    if bool((users.index == userID).any()):
        raise ValueError(f"userID {userID} already exists in user nodes")

    # Embed before touching the graph so a failure here leaves it unchanged.
    new_embedding = get_bert_embedding(text).unsqueeze(0)

    new_index = users.num_nodes
    # Users with the same topic, looked up before the new node is appended so
    # the new node cannot be matched with itself.
    peers = (users.topic == topic).nonzero(as_tuple=True)[0]

    # Add user node
    existing_x = getattr(users, 'x', None)
    if existing_x is None:
        users.x = new_embedding
    else:
        users.x = torch.cat([existing_x, new_embedding], dim=0)
    users.topic = torch.cat([users.topic, torch.tensor([topic], dtype=torch.long)], dim=0)
    users.index = torch.cat([users.index, torch.tensor([userID], dtype=torch.long)], dim=0)
    users.num_nodes = new_index + 1

    # Create edges with similar users
    if peers.numel() > 0:
        new_col = torch.full_like(peers, new_index)
        # Columns come out interleaved: (new, p0), (p0, new), (new, p1), ...
        new_edges = torch.stack(
            [torch.stack([new_col, peers]), torch.stack([peers, new_col])], dim=2
        ).reshape(2, -1)
        similar = data['user', 'similar_U', 'user']
        existing_edges = getattr(similar, 'edge_index', None)
        if existing_edges is None:
            similar.edge_index = new_edges
        else:
            similar.edge_index = torch.cat([existing_edges, new_edges], dim=1)


In [7]:
def add_node_job(data, jobID, text, topic):
    """Append a job node and connect it to every existing job with the same topic.

    Args:
        data: Graph to modify in place; its ``job`` store must have
            ``num_nodes``, ``topic`` and ``index``.
        jobID: External ID stored in ``data['job'].index``.
        text: Text embedded with ``get_bert_embedding`` to form the feature row.
        topic: Topic label; jobs sharing it are linked through ``similar_J``
            edges in both directions.

    Raises:
        ValueError: If ``jobID`` is already present. Duplicate IDs would make
            later single-node lookups ambiguous.
    """
    jobs = data['job']
    if bool((jobs.index == jobID).any()):
        raise ValueError(f"jobID {jobID} already exists in job nodes")

    # Embed before touching the graph so a failure here leaves it unchanged.
    new_embedding = get_bert_embedding(text).unsqueeze(0)

    new_index = jobs.num_nodes
    # Jobs with the same topic, looked up before the new node is appended so
    # the new node cannot be matched with itself.
    peers = (jobs.topic == topic).nonzero(as_tuple=True)[0]

    # Add job node
    existing_x = getattr(jobs, 'x', None)
    if existing_x is None:
        jobs.x = new_embedding
    else:
        jobs.x = torch.cat([existing_x, new_embedding], dim=0)
    jobs.topic = torch.cat([jobs.topic, torch.tensor([topic], dtype=torch.long)], dim=0)
    jobs.index = torch.cat([jobs.index, torch.tensor([jobID], dtype=torch.long)], dim=0)
    jobs.num_nodes = new_index + 1

    # Create edges with similar jobs
    if peers.numel() > 0:
        new_col = torch.full_like(peers, new_index)
        # Columns come out interleaved: (new, p0), (p0, new), (new, p1), ...
        new_edges = torch.stack(
            [torch.stack([new_col, peers]), torch.stack([peers, new_col])], dim=2
        ).reshape(2, -1)
        similar = data['job', 'similar_J', 'job']
        existing_edges = getattr(similar, 'edge_index', None)
        if existing_edges is None:
            similar.edge_index = new_edges
        else:
            similar.edge_index = torch.cat([existing_edges, new_edges], dim=1)

## delete

In [8]:
def delete_node_user(data, userID):
    """Remove the user with external ID ``userID`` and every edge touching it.

    Does nothing when the ID is not present. Node indices above the removed
    position are shifted down by one in all three edge types that reference
    users (``applies``, ``rev_applies``, ``similar_U``).
    """
    matches = (data['user'].index == userID).nonzero(as_tuple=True)[0]
    if matches.numel() == 0:
        return
    pos = matches.item()  # position of the node inside the user store

    # Cut the node's row out of every per-node tensor.
    user_store = data['user']
    for attr in ('x', 'topic', 'index'):
        values = getattr(user_store, attr)
        setattr(user_store, attr, torch.cat([values[:pos], values[pos + 1:]], dim=0))
    user_store.num_nodes -= 1

    def prune(edge_type, user_rows, has_label):
        """Drop edges whose ``user_rows`` entry is ``pos``, then re-number those rows."""
        store = data[edge_type]
        keep = torch.ones(store.edge_index.size(1), dtype=torch.bool)
        for row in user_rows:
            keep[(store.edge_index[row] == pos).nonzero(as_tuple=True)[0]] = False
        store.edge_index = store.edge_index[:, keep]
        if has_label:
            store.edge_label = store.edge_label[keep]
        # Users stored after the removed one moved up by one position.
        for row in user_rows:
            store.edge_index[row] -= (store.edge_index[row] > pos).int()

    # Users are the source of 'applies', the target of 'rev_applies' and sit
    # on both sides of 'similar_U'.
    prune(('user', 'applies', 'job'), (0,), True)
    prune(('job', 'rev_applies', 'user'), (1,), True)
    prune(('user', 'similar_U', 'user'), (0, 1), False)


In [9]:
def delete_node_job(data, jobID):
    """Remove the job with external ID ``jobID`` and every edge touching it.

    Does nothing when the ID is not present. Node indices above the removed
    position are shifted down by one in all three edge types that reference
    jobs (``applies``, ``rev_applies``, ``similar_J``).
    """
    matches = (data['job'].index == jobID).nonzero(as_tuple=True)[0]
    if matches.numel() == 0:
        return
    pos = matches.item()  # position of the node inside the job store

    # Cut the node's row out of every per-node tensor.
    job_store = data['job']
    for attr in ('x', 'topic', 'index'):
        values = getattr(job_store, attr)
        setattr(job_store, attr, torch.cat([values[:pos], values[pos + 1:]], dim=0))
    job_store.num_nodes -= 1

    def prune(edge_type, job_rows, has_label):
        """Drop edges whose ``job_rows`` entry is ``pos``, then re-number those rows."""
        store = data[edge_type]
        keep = torch.ones(store.edge_index.size(1), dtype=torch.bool)
        for row in job_rows:
            keep[(store.edge_index[row] == pos).nonzero(as_tuple=True)[0]] = False
        store.edge_index = store.edge_index[:, keep]
        if has_label:
            store.edge_label = store.edge_label[keep]
        # Jobs stored after the removed one moved up by one position.
        for row in job_rows:
            store.edge_index[row] -= (store.edge_index[row] > pos).int()

    # Jobs are the target of 'applies', the source of 'rev_applies' and sit
    # on both sides of 'similar_J'.
    prune(('user', 'applies', 'job'), (1,), True)
    prune(('job', 'rev_applies', 'user'), (0,), True)
    prune(('job', 'similar_J', 'job'), (0, 1), False)


## modify

In [10]:
def modify_node_user(data, userID, new_text, new_topic):
    """Replace a user's text embedding and topic, then rebuild its ``similar_U`` edges.

    Args:
        data: Graph to modify in place.
        userID: External ID looked up in ``data['user'].index``. Nothing
            happens when it is not present.
        new_text: Text embedded with ``get_bert_embedding``.
        new_topic: New topic label; the user is re-linked (both directions) to
            every other user that has this topic.
    """
    users = data['user']
    matches = (users.index == userID).nonzero(as_tuple=True)[0]
    if matches.numel() == 0:
        return
    idx = matches.item()  # Convert tensor to a scalar index

    # Update user attributes
    new_embedding = get_bert_embedding(new_text)
    users.x[idx] = new_embedding
    users.topic[idx] = new_topic

    # Remove every old similarity edge that touches this user.
    similar = data['user', 'similar_U', 'user']
    touches_user = (similar.edge_index[0] == idx) | (similar.edge_index[1] == idx)
    edge_index = similar.edge_index[:, ~touches_user]

    # Create new edges based on the new topic, built on the same device as the
    # topic tensor so the concatenation below cannot mix devices.
    same_topic = (users.topic == new_topic).nonzero(as_tuple=True)[0]
    peers = same_topic[same_topic != idx]
    if peers.numel() > 0:
        self_col = torch.full_like(peers, idx)
        # Columns come out interleaved: (idx, p0), (p0, idx), (idx, p1), ...
        new_edges = torch.stack(
            [torch.stack([self_col, peers]), torch.stack([peers, self_col])], dim=2
        ).reshape(2, -1)
        edge_index = torch.cat([edge_index, new_edges], dim=1)

    similar.edge_index = edge_index


In [11]:
def modify_node_job(data, jobID, new_text, new_topic):
    """Replace a job's text embedding and topic, then rebuild its ``similar_J`` edges.

    Args:
        data: Graph to modify in place.
        jobID: External ID looked up in ``data['job'].index``. Nothing happens
            when it is not present.
        new_text: Text embedded with ``get_bert_embedding``.
        new_topic: New topic label; the job is re-linked (both directions) to
            every other job that has this topic.
    """
    jobs = data['job']
    matches = (jobs.index == jobID).nonzero(as_tuple=True)[0]
    if matches.numel() == 0:
        return
    idx = matches.item()  # Convert tensor to a scalar index

    # Update job attributes
    new_embedding = get_bert_embedding(new_text)
    jobs.x[idx] = new_embedding
    jobs.topic[idx] = new_topic

    # Remove every old similarity edge that touches this job.
    similar = data['job', 'similar_J', 'job']
    touches_job = (similar.edge_index[0] == idx) | (similar.edge_index[1] == idx)
    edge_index = similar.edge_index[:, ~touches_job]

    # Create new edges based on the new topic, built on the same device as the
    # topic tensor so the concatenation below cannot mix devices.
    same_topic = (jobs.topic == new_topic).nonzero(as_tuple=True)[0]
    peers = same_topic[same_topic != idx]
    if peers.numel() > 0:
        self_col = torch.full_like(peers, idx)
        # Columns come out interleaved: (idx, p0), (p0, idx), (idx, p1), ...
        new_edges = torch.stack(
            [torch.stack([self_col, peers]), torch.stack([peers, self_col])], dim=2
        ).reshape(2, -1)
        edge_index = torch.cat([edge_index, new_edges], dim=1)

    similar.edge_index = edge_index


## add application edge

In [12]:
def add_edge_app(data, userID, jobID):
    """Record that user ``userID`` applied to job ``jobID``.

    Adds a ``('user', 'applies', 'job')`` edge and its
    ``('job', 'rev_applies', 'user')`` counterpart, each with edge label 1.
    Nothing happens when either ID is unknown. An edge that is already stored
    is not added again, so calling this twice for the same pair is harmless.

    Args:
        data: Graph to modify in place.
        userID: External user ID, looked up in ``data['user'].index``.
        jobID: External job ID, looked up in ``data['job'].index``.
    """
    # Find the index positions of userID and jobID in their respective node tensors
    user_match = (data['user'].index == userID).nonzero(as_tuple=True)[0]
    job_match = (data['job'].index == jobID).nonzero(as_tuple=True)[0]
    if user_match.numel() == 0 or job_match.numel() == 0:
        return
    user_index = user_match.item()
    job_index = job_match.item()

    def append_once(edge_type, src, dst):
        """Append the edge (src, dst) with label 1 unless it is already stored."""
        store = data[edge_type]
        edge_index = store.edge_index
        already_present = ((edge_index[0] == src) & (edge_index[1] == dst)).any()
        if bool(already_present):
            return
        new_edge = torch.tensor([[src], [dst]], dtype=torch.long, device=edge_index.device)
        store.edge_index = torch.cat([edge_index, new_edge], dim=1)
        new_label = torch.tensor([1], dtype=torch.long, device=store.edge_label.device)
        store.edge_label = torch.cat([store.edge_label, new_label], dim=0)

    # Add edge between user and job, and the reverse edge between job and user
    append_once(('user', 'applies', 'job'), user_index, job_index)
    append_once(('job', 'rev_applies', 'user'), job_index, user_index)


## delete application edge

In [13]:
def delete_app_edge(data, userID, jobID):
    """Remove the application of user ``userID`` to job ``jobID``.

    Deletes every matching ``('user', 'applies', 'job')`` edge and every
    matching ``('job', 'rev_applies', 'user')`` edge together with their
    labels. Nothing happens when either ID is unknown or no such edge exists.
    If the same application was stored more than once, all copies are removed.

    Args:
        data: Graph to modify in place.
        userID: External user ID, looked up in ``data['user'].index``.
        jobID: External job ID, looked up in ``data['job'].index``.
    """
    # Find the index positions of userID and jobID in their respective node tensors
    user_match = (data['user'].index == userID).nonzero(as_tuple=True)[0]
    job_match = (data['job'].index == jobID).nonzero(as_tuple=True)[0]
    if user_match.numel() == 0 or job_match.numel() == 0:
        return
    user_index = user_match.item()
    job_index = job_match.item()

    def drop(edge_type, src, dst):
        """Remove all (src, dst) edges of ``edge_type`` and their labels."""
        store = data[edge_type]
        keep = ~((store.edge_index[0] == src) & (store.edge_index[1] == dst))
        if bool(keep.all()):
            return  # nothing to remove
        store.edge_index = store.edge_index[:, keep]
        store.edge_label = store.edge_label[keep]

    # Remove edge between user and job, and the reverse edge (rev_applies)
    drop(('user', 'applies', 'job'), user_index, job_index)
    drop(('job', 'rev_applies', 'user'), job_index, user_index)


## recommend

In [14]:
def recommend_top_k(user_id, data, model, k=10):
    """Return the IDs of the ``k`` best-scoring jobs the user has not applied to.

    Args:
        user_id: External user ID, looked up in ``data['user'].index``.
        data: Graph providing ``x_dict``, ``edge_index_dict``, the ``user`` and
            ``job`` node stores and the ``('user', 'applies', 'job')`` edges.
        model: Object with an ``encoder(x_dict, edge_index_dict)`` and a
            ``decoder(encoded, (user_indices, job_indices))`` that returns one
            score per pair.
        k: Maximum number of jobs to return. It is reduced to the number of
            candidate jobs; a non-positive value yields an empty list.

    Returns:
        A list of job IDs taken from ``data['job'].index``, ordered from the
        highest to the lowest score.

    Raises:
        ValueError: If ``user_id`` is not present in the graph.
    """
    # Find the index of the user in the 'user' node index tensor
    user_index = (data['user'].index == user_id).nonzero(as_tuple=True)[0]
    if user_index.numel() == 0:
        raise ValueError(f"userID {user_id} not found in user nodes")
    user_index = user_index.item()

    # Jobs the user already applied to are excluded from the candidates.
    applies = data['user', 'applies', 'job'].edge_index
    user_interacted_jobs = applies[1][applies[0] == user_index]
    all_job_indices = torch.arange(data['job'].num_nodes, device=applies.device)
    candidate_job_indices = all_job_indices[~torch.isin(all_job_indices, user_interacted_jobs)]

    # Adjust k if it exceeds the number of candidate jobs
    k = max(0, min(k, len(candidate_job_indices)))
    if k == 0:
        return []

    # Pair the user with every candidate job.
    user_index_tensor = torch.full_like(candidate_job_indices, user_index)

    # Pure inference: no autograd graph is needed for scoring.
    with torch.no_grad():
        encoded_data = model.encoder(data.x_dict, data.edge_index_dict)
        recommendation_scores = model.decoder(encoded_data, (user_index_tensor, candidate_job_indices))

    # Extract top-k job indices with highest recommendation scores
    _, top_k_indices = torch.topk(recommendation_scores, k, largest=True, sorted=True)
    top_k_job_indices = candidate_job_indices[top_k_indices]

    # Map job indices to their original IDs in data['job'].index
    return data['job'].index[top_k_job_indices].tolist()


# build, save and reload the empty graph

In [15]:
# Mount Google Drive at /content/drive; the graph and model files used below
# are read from and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
# Start from a graph that has the full node/edge schema but no entries, and
# print it to check the schema.
data=create_empty_graph()
print(data)

HeteroData(
  user={
    num_nodes=0,
    x=[0, 768],
    topic=[0],
    index=[0],
  },
  job={
    num_nodes=0,
    x=[0, 768],
    topic=[0],
    index=[0],
  },
  (user, applies, job)={
    edge_index=[2, 0],
    edge_label=[0],
  },
  (user, similar_U, user)={ edge_index=[2, 0] },
  (job, similar_J, job)={ edge_index=[2, 0] },
  (job, rev_applies, user)={
    edge_index=[2, 0],
    edge_label=[0],
  }
)


In [18]:
# Persist the graph object to Drive so later sessions can reload it.
torch.save(data, '/content/drive/MyDrive/data/graph/web_graph.pt')

In [20]:
# Reload the saved graph and list its node and edge types.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
data=torch.load('/content/drive/MyDrive/data/graph/web_graph.pt')
print(data.metadata())

(['user', 'job'], [('user', 'applies', 'job'), ('user', 'similar_U', 'user'), ('job', 'similar_J', 'job'), ('job', 'rev_applies', 'user')])


In [None]:
# Display the reloaded graph.
data

# test: load the trained model and recommend jobs

In [None]:
import torch
from torch_geometric.nn import SAGEConv, to_hetero

In [None]:
class GCN_2(torch.nn.Module):
    """Two stacked ``SAGEConv`` layers with a ReLU between them."""

    def __init__(self, input_dim, hidden_channels):
        """Create the layers: ``input_dim -> hidden_channels -> hidden_channels``."""
        super().__init__()
        self.conv1 = SAGEConv(input_dim, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Apply both convolutions over ``edge_index``; no activation after the second."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

class CosineSimilarityDecoder(torch.nn.Module):
    """Score (user, job) pairs by the cosine similarity of their embeddings."""

    def forward(self, x_dict, edge_label_index):
        """Return one similarity per pair.

        ``edge_label_index[0]`` selects rows of ``x_dict['user']`` and
        ``edge_label_index[1]`` selects rows of ``x_dict['job']``.
        """
        user_rows, job_rows = edge_label_index[0], edge_label_index[1]
        return torch.cosine_similarity(
            x_dict['user'][user_rows],
            x_dict['job'][job_rows],
            dim=1,
        )


class Model(torch.nn.Module):
    """Heterogeneous GNN encoder followed by a cosine-similarity decoder."""

    def __init__(self, input_dim, hidden_channels, metadata=None):
        """Build the encoder and decoder.

        Args:
            input_dim: Input feature width passed to ``GCN_2``.
            hidden_channels: Hidden/output width passed to ``GCN_2``.
            metadata: ``(node_types, edge_types)`` for ``to_hetero``. When
                omitted, the metadata of the notebook-global ``data`` graph is
                used, which is what this class always did before; pass it
                explicitly to avoid depending on that global.
        """
        super().__init__()
        if metadata is None:
            metadata = data.metadata()
        self.encoder = to_hetero(GCN_2(input_dim, hidden_channels), metadata, aggr='sum')
        self.decoder = CosineSimilarityDecoder()

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the pairs listed in ``edge_label_index``."""
        x_dict = self.encoder(x_dict, edge_index_dict)
        cosine_similarity = self.decoder(x_dict, edge_label_index)
        return cosine_similarity

In [None]:
# Load the trained model from Drive.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Presumably the file holds a whole `Model` instance, in which case the class
# definitions above must already be defined in this session -- TODO confirm.
model=torch.load('/content/drive/MyDrive/models/Best_model.pt')

In [None]:
# Smoke test: build a tiny graph with the incremental helpers and ask for
# recommendations for one user.
data = create_empty_graph()

# Three users and three jobs, each with its own topic (1, 2, 3), so no two
# users and no two jobs share a topic.
add_node_user(data, 0, "text", 1)
add_node_user(data, 1, "text", 2)
add_node_user(data, 2, "text", 3)
add_node_job(data, 0, "text", 1)
add_node_job(data, 1, "text", 2)
add_node_job(data, 2, "text", 3)
# Each user applies to the job with the same ID.
add_edge_app(data, 0, 0)
add_edge_app(data, 1, 1)
add_edge_app(data, 2, 2)

print(data)

# User 0 already applied to job 0, so recommend_top_k excludes that job from
# the candidates.
user_id = 0
k = 3
top_k_job_ids = recommend_top_k(user_id, data, model, k)

In [None]:
# Show the recommended job IDs returned by recommend_top_k.
print(top_k_job_ids)