<a href="https://colab.research.google.com/github/akshaygopan/BeComE-A-novel-framework-for-node-classification-in-social-graphs/blob/main/become_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Installs

In [None]:
%%capture
# Install third-party dependencies; %%capture hides the pip output.
# NOTE(review): none of the packages below is version-pinned, so a fresh
# environment may resolve to different releases on each run.
#!pip install node2vec
!pip install ampligraph
!pip install sentence-transformers
!pip install accelerate -U
!pip install datasets

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from huggingface_hub import notebook_login
from ampligraph.compat import ComplEx
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix

# Semantic Embeddings

In [None]:
def introduceLabelAwareness(text, label):
  """Append a label suffix to a text.

  Args:
      text: The text to extend (concatenated with ``+``).
      label: The label; converted with ``str()`` before being appended.

  Returns:
      ``text`` followed by ``'<SEP>The label is '`` and the label string.
  """
  label_suffix = '<SEP>The label is ' + str(label)
  return text + label_suffix

In [None]:
def getTextData(data, label_col):
  """Load the train and validation splits of a dataset as DataFrames.

  The 'text' column of the training frame is rewritten with
  introduceLabelAwareness(text, label); the validation frame is returned
  as loaded.

  Args:
      data: Dataset key, either 'cora' or 'citeseer'.
      label_col: Name of the column holding each row's label.

  Returns:
      (text_df_train, text_df_test): the label-augmented 'train' split and
      the untouched 'validation' split.
  """
  hub_names = {'cora': 'Akshayxx/CoraDatasetV6_Two', 'citeseer' :'Akshayxx/citeseerV1'}
  splits = load_dataset(hub_names[data])

  text_df_train = splits['train'].to_pandas()
  text_df_test = splits['validation'].to_pandas()

  def _with_label(row):
    # Only training rows get the label suffix.
    return introduceLabelAwareness(row['text'], row[label_col])

  text_df_train['text'] = text_df_train.apply(_with_label, axis = 1)

  return text_df_train, text_df_test

In [None]:
def getTextModel(model_name = 'bert-base-nli-mean-tokens'): #Akshayxx/bert-base-cased-finetuned-cora
    """Build a SentenceTransformer for the given model name.

    Args:
        model_name: Name passed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)

In [None]:
def getTextEmbeddings(text_df_train, text_df_test, text_col = 'text', model_name = 'bert-base-nli-mean-tokens'):
  """Encode the text column of the train and test frames with one model.

  Args:
      text_df_train: Training frame; ``text_df_train[text_col]`` is encoded.
      text_df_test: Test frame; ``text_df_test[text_col]`` is encoded.
      text_col: Name of the column to encode.
      model_name: Model name forwarded to getTextModel.

  Returns:
      (text_vecs_train, text_vecs_test): the results of ``model.encode`` on
      the train and test columns, in that order.
  """
  encoder = getTextModel(model_name)

  # Train is encoded before test, using the same model instance.
  encoded = [encoder.encode(frame[text_col]) for frame in (text_df_train, text_df_test)]

  return encoded[0], encoded[1]

# Structural Embeddings

In [None]:
def getCiteSeerEdges(file_path = '/content/CiteSeer_original.cites'):
  """Read a whitespace-separated edge list into a DataFrame.

  On each line the first token becomes ``source_id`` and the second token
  becomes ``dest_id``; any further tokens are ignored. Lines with fewer
  than two tokens (for example blank lines) are skipped rather than
  raising an IndexError.

  Args:
      file_path: Path of the edge-list file. Defaults to the location the
          notebook originally hard-coded.

  Returns:
      A DataFrame with string columns ``source_id`` and ``dest_id``, one
      row per usable line, in file order.
  """
  starts = []
  dests = []

  # Stream the file line by line instead of loading it all with readlines().
  with open(file_path, 'r') as file:
    for line in file:
      tokens = line.split()
      if len(tokens) < 2:
        # Blank or truncated line: there is no edge to record.
        continue
      starts.append(tokens[0])
      dests.append(tokens[1])

  return pd.DataFrame({
      'source_id': starts,
      'dest_id': dests
  })

In [None]:
def getCoraEdges(file_path = '/content/df_cln2.csv'):
  """Load the Cora edge table from a CSV file.

  Args:
      file_path: Path of the CSV file. Defaults to the location the
          notebook originally hard-coded, so existing no-argument calls
          behave as before.

  Returns:
      The DataFrame produced by ``pd.read_csv(file_path)``.
      NOTE(review): downstream code reads ``source_id`` and ``dest_id``
      columns from this frame; confirm the CSV provides them.
  """
  return pd.read_csv(file_path)

In [None]:
def getStructuralEmbeddings(text_df_train, text_df_test, edges, label_col):
  """Train a graph-embedding model on labelled edges and embed every node.

  Each kept edge becomes a triple ``[source, relation, dest]``. The relation
  is the label of the destination node when both endpoints appear in the
  train/test frames, and ``-1`` otherwise.

  The previous implementation set ``relation = -1`` inside a bare ``except``
  and then unconditionally overwrote it with ``dest_label``. A failed lookup
  therefore raised NameError on the first edge, or silently reused the label
  from an earlier edge. The lookup is now explicit.

  Args:
      text_df_train: Training frame with ``node_id`` and ``label_col`` columns.
      text_df_test: Test frame with ``node_id`` and ``label_col`` columns.
      edges: Frame with ``source_id`` and ``dest_id`` columns.
      label_col: Name of the label column in both frames.

  Returns:
      (node_embeddings_train, node_embeddings_test): the entity embeddings
      returned by the fitted model for the train and test ``node_id`` values.
  """
  df = pd.concat([text_df_train, text_df_test])
  present = df['node_id'].unique()

  # Keep edges that touch at least one known node.
  edges = edges[(edges['dest_id'].isin(present)) | (edges['source_id'].isin(present))]

  # node_id -> label. drop_duplicates keeps the first row per node, matching
  # the old `.iloc[0]` lookup, but is built once instead of once per edge.
  # NOTE(review): df includes the test frame, so labels of test nodes are
  # used as relation types here — confirm this is intended.
  label_of = df.drop_duplicates(subset='node_id').set_index('node_id')[label_col].to_dict()

  unknown_relation = -1
  edges_list = []

  for source, dest in zip(edges['source_id'], edges['dest_id']):
    if source in label_of and dest in label_of:
      relation = label_of[dest]
    else:
      # At least one endpoint has no label row.
      relation = unknown_relation
    edges_list.append([source, relation, dest])

  embeds_model = getStructualEmbeddingModel(edges_list)
  node_embeddings_train = embeds_model.get_embeddings(text_df_train['node_id'].values, embedding_type='entity')
  node_embeddings_test = embeds_model.get_embeddings(text_df_test['node_id'].values, embedding_type='entity')

  return node_embeddings_train, node_embeddings_test

In [None]:
def getStructualEmbeddingModel(edges_list):
  """Fit a ComplEx model on a list of triples and return it.

  Args:
      edges_list: Sequence of ``[source, relation, dest]`` triples; it is
          converted with ``np.array`` before fitting.

  Returns:
      The ``ComplEx`` instance after ``fit`` has been called on the triples.
  """
  hyperparams = {
      'k': 200,             # embedding dimension
      'epochs': 200,
      'batches_count': 250,
      'verbose': True,
      'optimizer': "adam",
      'optimizer_params': {'lr' :  0.001},
  }

  triples = np.array(edges_list)

  embeds_model = ComplEx(**hyperparams)
  embeds_model.fit(triples)

  return embeds_model

# Labels

In [None]:
def getLabels(text_df_train, text_df_test, col):
  """Extract a label column from both frames as column vectors.

  Args:
      text_df_train: Training frame containing column ``col``.
      text_df_test: Test frame containing column ``col``.
      col: Name of the label column.

  Returns:
      (labels_train, labels_test): NumPy arrays reshaped to ``(-1, 1)``.
  """
  def _as_column_vector(frame):
    # One label per row, so it can later be stacked beside feature matrices.
    return frame[col].to_numpy().reshape(-1, 1)

  return _as_column_vector(text_df_train), _as_column_vector(text_df_test)

# Hybrid Embeddings

In [None]:
def getCombinedEmbeddings(text_vecs_train, node_embeddings_train, labels_train, text_vecs_test, node_embeddings_test, labels_test):
  """Stack text and node embeddings into feature frames and split off labels.

  For each split the three arrays are concatenated column-wise with
  ``np.hstack``, wrapped in a DataFrame, and the last column (the labels)
  is cast to ``int`` and separated from the feature columns.

  Args:
      text_vecs_train, text_vecs_test: Text embedding arrays.
      node_embeddings_train, node_embeddings_test: Node embedding arrays.
      labels_train, labels_test: Label arrays stacked as the last column(s);
          presumably of shape (n, 1) as produced by getLabels — TODO confirm.

  Returns:
      (X_train, y_train, X_test, y_test): feature DataFrames and label
      Series. The Series keep the integer name of the last stacked column.
  """
  def _assemble(text_vecs, node_embeddings, labels):
    stacked = pd.DataFrame(np.hstack((text_vecs, node_embeddings, labels)))
    label_pos = stacked.shape[1] - 1
    # hstack promotes everything to a common dtype; restore integer labels.
    stacked[label_pos] = stacked[label_pos].astype(int)
    features = stacked.drop(columns=[label_pos])
    target = stacked[label_pos]
    return features, target

  X_train, y_train = _assemble(text_vecs_train, node_embeddings_train, labels_train)
  X_test, y_test = _assemble(text_vecs_test, node_embeddings_test, labels_test)

  return X_train, y_train, X_test, y_test

# Evaluation

In [None]:
def trainAndPredict(X_train, X_test, y_train, y_test):
  """Fit an RBF-kernel SVM and predict the test set.

  The test set is predicted exactly once. The previous version predicted it
  three times: an unused ``y_pred``, again inside ``clfi.score``, and again
  for ``preds``. Accuracy is now computed from the single set of
  predictions, which is the same quantity a classifier's ``score`` reports.

  Args:
      X_train: Training features.
      X_test: Test features.
      y_train: Training labels.
      y_test: Test labels, returned unchanged as the second result.

  Returns:
      (preds, labels): predictions for ``X_test`` and ``y_test`` itself.
  """
  # SVM classifier with an RBF kernel.
  clfi = svm.SVC(kernel='rbf', C = 10, gamma = 'auto')
  clfi.fit(X_train, y_train)

  # Single inference pass over the test set.
  preds = clfi.predict(X_test)

  accuracy = accuracy_score(y_test, preds)
  print("Accuracy:", accuracy)

  return preds, y_test

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Calculates the True Positive Rate (tpr) and the False Positive Rate (fpr) based on real and predicted observations

    Only the top-left 2x2 entries of the confusion matrix are read, with
    index 0 treated as the negative class and index 1 as the positive class.

    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series with the predicted classes

    Returns:
        tpr: The True Positive Rate of the classifier
        fpr: The False Positive Rate of the classifier
    '''
    matrix = confusion_matrix(y_real, y_pred)

    # Rows are real classes, columns are predicted classes.
    true_neg, false_pos = matrix[0, 0], matrix[0, 1]
    false_neg, true_pos = matrix[1, 0], matrix[1, 1]

    sensitivity = true_pos/(true_pos + false_neg)
    specificity = true_neg/(true_neg + false_pos)

    # fpr is reported as 1 - specificity
    return sensitivity, 1 - specificity

In [None]:
def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a threshold for the prediction of the class.

    Each element ``y_proba[i]`` is used in turn as a threshold; the predicted
    classes for that threshold are ``y_proba >= y_proba[i]``.

    Args:
        y_real: The list or series with the real classes.
        y_proba: The scores to threshold. Assumes it supports elementwise
            ``>=`` against a scalar (e.g. a NumPy array) - TODO confirm.

    Returns:
        tpr_list: The list of TPRs representing each threshold, starting with 0.
        fpr_list: The list of FPRs representing each threshold, starting with 0.
    '''
    # The curve starts at the origin.
    points = [(0, 0)]

    for idx in range(len(y_proba)):
        predicted = y_proba >= y_proba[idx]
        points.append(calculate_tpr_fpr(y_real, predicted))

    tpr_list = [tpr for tpr, _ in points]
    fpr_list = [fpr for _, fpr in points]
    return tpr_list, fpr_list

In [None]:
def getResults(preds, labels, num_classes, results, name):
  """Append one summary row and one row per class to a results table.

  The summary row carries the overall accuracy and the micro-averaged F1.
  Each class row carries that class's one-vs-rest accuracy, precision,
  recall, F1 and support. Cells that do not apply to a row hold ''.

  Args:
      preds: Predicted labels.
      labels: True labels.
      num_classes: Number of classes; class rows are named 'Class1' ..
          'Class<num_classes>' and read index 0 .. num_classes-1 of the
          per-class metric arrays.
      results: Existing results DataFrame to extend.
      name: Value of the 'Name' column for the summary row.

  Returns:
      A new DataFrame: ``results`` followed by the summary row and the class
      rows. Every appended row keeps index label 0.
  """
  accuracy = accuracy_score(labels, preds)
  cm = confusion_matrix(labels, preds)
  total = np.sum(cm)

  def _one_vs_rest_accuracy(cls):
    true_positives = cm[cls, cls]
    # True negatives: everything outside row `cls` and column `cls`. The
    # diagonal cell is removed twice by the two sums, so add it back once.
    true_negatives = total - np.sum(cm[cls, :]) - np.sum(cm[:, cls]) + true_positives
    return (true_positives + true_negatives) / total

  per_class_accuracies = {cls: _one_vs_rest_accuracy(cls) for cls in range(num_classes)}

  # Per-class precision / recall / F1 / support, then the micro-averaged F1.
  precision, recall, f1, support = precision_recall_fscore_support(labels, preds, average=None)
  f1_micro_average = f1_score(labels, preds, average='micro')
  print(accuracy, precision, recall, f1, support, f1_micro_average)

  rows = [{"Name": name, "accuracy":accuracy, "precision": '', "recall": '', "f1":'', "support":'', "f1_micro_average": f1_micro_average}]
  rows.extend(
      {"Name": "Class" + str(cls + 1), "accuracy": per_class_accuracies[cls], "precision": precision[cls], "recall": recall[cls], "f1":f1[cls], "support":support[cls], "f1_micro_average":''}
      for cls in range(num_classes)
  )

  # One single-row frame per record, so each appended row keeps index 0.
  frames = [results] + [pd.DataFrame([row]) for row in rows]
  return pd.concat(frames)

# Full Pipeline

In [None]:
def pipeline(data, results):
  """Run the full embed / train / evaluate flow for one dataset.

  Steps, in order: load text data, compute text embeddings, load the edge
  list, compute structural embeddings, extract labels, combine the
  features, train and predict, then append the metrics to ``results``.

  Args:
      data: Dataset key, 'cora' or 'citeseer'. Any value other than 'cora'
          takes the CiteSeer edge loader, but an unknown key fails earlier
          on the label-column lookup.
      results: Results DataFrame to extend.

  Returns:
      The value returned by getResults for this run.
  """
  label_columns = {'cora' : 'label', 'citeseer': 'encoded_labels'}
  target_col = label_columns[data]

  text_df_train, text_df_test = getTextData(data, label_col = target_col)
  text_vecs_train, text_vecs_test = getTextEmbeddings(
      text_df_train, text_df_test, text_col = 'text', model_name = 'bert-base-nli-mean-tokens')

  edges = getCoraEdges() if data == 'cora' else getCiteSeerEdges()

  node_embeddings_train, node_embeddings_test = getStructuralEmbeddings(
      text_df_train, text_df_test, edges, label_col = target_col)
  labels_train, labels_test = getLabels(text_df_train, text_df_test, col = target_col)

  X_train, y_train, X_test, y_test = getCombinedEmbeddings(
      text_vecs_train, node_embeddings_train, labels_train,
      text_vecs_test, node_embeddings_test, labels_test)
  preds, labels = trainAndPredict(X_train, X_test, y_train, y_test)

  # The run name is the dataset key joined directly (no separator) to the model description.
  name = data + 'BERT + ComplEx + SVM'
  num_classes = len(y_train.unique())

  return getResults(preds, labels, num_classes, results, name)

# Try it out!

In [None]:
# Run the full pipeline once per dataset, accumulating every run's metric
# rows in a single `results` table.
results = pd.DataFrame()
data_list = ['cora', 'citeseer']
# NOTE: `data` stays defined after the loop (last value: 'citeseer') and is
# read again by a later top-level cell.
for data in data_list:
  results = pipeline(data, results)

Batches:   0%|          | 0/56 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Accuracy: 0.7471783295711061
0.7471783295711061 [0.67283951 0.87878788 0.75       0.7037037  0.89473684 0.72916667
 0.88235294] [0.84496124 0.74358974 0.77142857 0.6440678  0.796875   0.68627451
 0.48387097] [0.74914089 0.80555556 0.76056338 0.67256637 0.84297521 0.70707071
 0.625     ] [129  39  70  59  64  51  31] 0.7471783295711061


Batches:   0%|          | 0/80 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Accuracy: 0.6959247648902821
0.6959247648902821 [0.5        0.77235772 0.72340426 0.74390244 0.65986395 0.62992126] [0.1875     0.84070796 0.74452555 0.62244898 0.7578125  0.70175439] [0.27272727 0.80508475 0.73381295 0.67777778 0.70545455 0.66390041] [ 48 113 137  98 128 114] 0.6959247648902821


In [None]:
# Write the accumulated results table to an Excel file in the working directory.
results.to_excel('resultsQWW.xlsx')

In [None]:
# Same sequence of calls as the body of pipeline(), executed at top level so
# the intermediate objects (embeddings, X/y splits, preds) stay in the
# notebook namespace.
# NOTE(review): `data` is not assigned in this cell; it is whatever the
# `for data in data_list` loop left behind ('citeseer' after a full run).
# NOTE(review): this appends another set of rows for that dataset to
# `results`, after the table has already been exported with to_excel.
label_col = {'cora' : 'label', 'citeseer': 'encoded_labels'}
text_df_train, text_df_test = getTextData(data, label_col = label_col[data])
text_vecs_train, text_vecs_test = getTextEmbeddings(text_df_train, text_df_test, text_col = 'text', model_name = 'bert-base-nli-mean-tokens')
# Pick the edge loader that matches the dataset.
if(data =='cora'):
      edges = getCoraEdges()
else:
      edges = getCiteSeerEdges()

node_embeddings_train, node_embeddings_test = getStructuralEmbeddings(text_df_train, text_df_test, edges, label_col = label_col[data])
labels_train, labels_test = getLabels(text_df_train, text_df_test, col = label_col[data])
X_train, y_train, X_test, y_test = getCombinedEmbeddings(text_vecs_train, node_embeddings_train, labels_train, text_vecs_test, node_embeddings_test, labels_test)
preds, labels = trainAndPredict(X_train, X_test, y_train, y_test)
name = data + 'BERT + ComplEx + SVM'
results = getResults(preds, labels, len(y_train.unique()), results, name)