In [3]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Input files: the edge list and the per-node feature table.
edges_path = 'cora_network.csv'
node_features_path = 'cora_nodes_with_feature.csv'

# Read both CSVs into DataFrames.
# NOTE(review): read_csv treats the first line of each file as a header row by
# default. The edge columns are renamed in a later cell, so if
# 'cora_network.csv' has no header line its first edge would be consumed as
# column names -- confirm against the file.
edges_df = pd.read_csv(filepath_or_buffer=edges_path)
node_features_df = pd.read_csv(filepath_or_buffer=node_features_path)

In [5]:
# Name the two edge columns. The guard makes this cell safe to re-run: once a
# later step has appended extra columns, assigning a two-element list to
# `.columns` would raise a length-mismatch error.
if list(edges_df.columns[:2]) != ['source', 'target']:
    edges_df.columns = ['source', 'target']

# Convert the 'features' column from the string representation of a list to an
# actual Python list. Values that are already sequences (i.e. this cell has
# been run before) are passed through untouched, because literal_eval only
# accepts strings and would raise on an already-parsed list.
node_features_df['features'] = node_features_df['features'].apply(
    lambda x: x if isinstance(x, (list, tuple)) else literal_eval(x)
)

# Dictionary mapping each nodeId to its feature vector for quick lookup
node_features_dict = dict(zip(node_features_df['nodeId'], node_features_df['features']))

def calculate_cosine_similarity(source, target, features_dict):
    """Return the cosine similarity between the feature vectors of two nodes.

    Args:
        source: Key of the first node in ``features_dict``.
        target: Key of the second node in ``features_dict``.
        features_dict: Mapping from node key to that node's feature vector.

    Returns:
        The single entry of the 1x1 matrix that ``cosine_similarity``
        produces for the two vectors.

    Raises:
        KeyError: If ``features_dict`` is a dict and ``source`` or ``target``
            is not one of its keys.
    """
    # cosine_similarity works on 2-D inputs, so each vector is reshaped into
    # a single-row matrix. Source is looked up before target.
    row_vectors = [
        np.array(features_dict[node_id]).reshape(1, -1)
        for node_id in (source, target)
    ]
    similarity_matrix = cosine_similarity(*row_vectors)
    return similarity_matrix[0][0]

# Cosine similarity for each edge, computed from its two endpoint columns.
# The ID columns are iterated directly rather than using a row-wise
# DataFrame.apply(axis=1): that avoids constructing a Series per edge, keeps
# the node IDs from being upcast to float when the frame also holds a float
# column, and still produces a float column when the edge list is empty.
edge_similarities = [
    calculate_cosine_similarity(source, target, node_features_dict)
    for source, target in zip(edges_df['source'], edges_df['target'])
]
# Assign positionally as a float array so no index alignment is involved.
edges_df['cosine_similarity'] = np.asarray(edge_similarities, dtype=float)

# Save the edges together with their similarity scores (no index column)
edges_df.to_csv('cora_edges_cosine.csv', index=False)
