In [43]:
# Root folder of the dataset; must be edited before running the notebook.
dataset_path = "Replace with path to the Dataset folder"

In [None]:
!pip install torch_geometric sentence_transformers jsonargparse catboost seaborn



In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import json
import os
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import torch
from torch_geometric.data import HeteroData
from torch_geometric.nn import GraphSAGE, GCNConv, GATConv, RGATConv, SAGEConv, GINConv, GATv2Conv, CuGraphSAGEConv, to_hetero
from torch_geometric.data import DataLoader
import torch.nn.functional as F
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import nltk
from nltk.corpus import stopwords
import torch
import torch.nn as nn

# Define training and test paths as Path objects
path_to_training = Path(dataset_path + '/training')
path_to_test = Path(dataset_path + '/test')

# Set the seed for NumPy
np.random.seed(42)

# Set the seed for PyTorch
torch.manual_seed(42)

# If you are using CUDA, seed it as well and disable cuDNN auto-tuning
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Download the NLTK resources (stop-word list and 'punkt' tokenizer data)
nltk.download('stopwords')
nltk.download('punkt')

# Load the pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
lm_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the Universal Sentence Encoder from TF Hub. The URL lives in a variable
# so that the log line can report which module was loaded; previously the
# format string had no argument and a literal "%s" was printed.
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(use_model_url)
print("module %s loaded" % use_model_url)

# Sentence encoder used later to embed the utterances
bert = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
## Define helper functions

def embed(input):
    """Return the result of calling the module-level `use_model` on `input`."""
    embeddings = use_model(input)
    return embeddings

def remove_stop_words(input_list):
    """Strip English stop words from every sentence of `input_list`.

    Each sentence is tokenized with `nltk.word_tokenize`, tokens whose
    lower-cased form is in NLTK's English stop-word list are dropped, and the
    remaining tokens are re-joined with single spaces.

    Returns a new list with one filtered sentence per input sentence.
    """
    english_stop_words = set(stopwords.words('english'))
    return [
        ' '.join(
            token
            for token in nltk.word_tokenize(sentence)
            if token.lower() not in english_stop_words
        )
        for sentence in input_list
    ]

def flatten(list_of_list):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat

## Data Loading

In [None]:
## Read the JSON files

#####
# training and test sets of transcription ids
#####
# Every meeting id is expanded into its four parts (a-d); the three ids
# listed in the filter are excluded from the training set.
training_set = [
    m_id + s_id
    for m_id in ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    for s_id in 'abcd'
    if m_id + s_id not in ('IS1002a', 'IS1005d', 'TS3012c')
]

test_set = [
    m_id + s_id
    for m_id in ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    for s_id in 'abcd'
]


## Training data
with open(dataset_path +  "/training_labels.json", "r") as file:
    training_labels = json.load(file)

# One text per utterance and the matching labels, in `training_set` order
X_training, y_training = [], []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    X_training.extend(utterance["text"] for utterance in transcription)
    y_training.extend(training_labels[transcription_id])

## Test data
# `idx` stores the number of utterances of each test transcription so the flat
# prediction vector can be split back per transcription later.
X_test, idx = [], []
cnt = 0
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    X_test.extend(utterance["text"] for utterance in transcription)
    idx.append(len(transcription))

## Data Cleaning

In [None]:
## Remove <vocalsound>
# Strip the tag with the SAME pattern on both splits. Previously the training
# texts only lost '<vocalsound> ' (with a trailing space) while the test texts
# lost '<vocalsound>', so a tag at the end of a training utterance survived and
# the two splits were preprocessed differently. Any doubled spaces left behind
# disappear below, where the sentences are tokenized and re-joined.
X_training = [text.replace('<vocalsound>', '') for text in X_training]
X_test = [text.replace('<vocalsound>', '') for text in X_test]

## Remove stop words
X_training = remove_stop_words(X_training)
X_test = remove_stop_words(X_test)

## Sentence embeddings

In [None]:
# Encode the cleaned sentences with the sentence encoder `bert`.
# NOTE(review): this rebinds X_training / X_test from cleaned text to the
# encoder output, so the cell is not re-runnable and any later code that calls
# len(X_test[i]) or len(X_val[i]) sees an embedding row, not a sentence.
X_training = bert.encode(X_training, show_progress_bar=True)
X_test = bert.encode(X_test, show_progress_bar=True)

## Normalize data

In [None]:
# Standardize the embeddings; the statistics are fitted on the training data
# only and then applied unchanged to the test data.
scaler = StandardScaler()
scaler.fit(X_training)
X_training_scaled = scaler.transform(X_training)
X_test_scaled = scaler.transform(X_test)

## Train / validation split

In [None]:
# Split at the transcription level: the first 80% of the transcriptions are
# used for training, the remaining 20% for validation.
#
# The train/validation graphs are built from the relationship files of
# `training_set_train` / `training_set_val`, so the node features and labels
# must come from exactly those transcriptions, in the same order. A random
# utterance-level shuffle (train_test_split) breaks that correspondence between
# nodes and edges.
train_set_size = int(0.8 * len(training_set))
training_set_train = training_set[:train_set_size]
training_set_val = training_set[train_set_size:]

# Features and labels were appended transcription by transcription in
# `training_set` order, so the training part is a contiguous prefix whose
# length is the number of labels of the training transcriptions.
assert len(X_training_scaled) == len(y_training), "features and labels are misaligned"
train_size = sum(len(training_labels[transcription_id]) for transcription_id in training_set_train)

X_train, X_val = X_training_scaled[:train_size], X_training_scaled[train_size:]
y_train, y_val = y_training[:train_size], y_training[train_size:]

## Knowledge Graph construction

### Compute relationships

In [None]:
def get_relationships(path, training_set):
    """Read the relationship (edge) files of the given transcriptions.

    Args:
        path: directory (a `pathlib.Path`) containing one
            `<transcription_id>.txt` file per transcription.
        training_set: iterable of transcription ids to read, in order.

    Returns:
        A flat list with one entry per non-empty line of every file, each entry
        being the whitespace-separated fields of that line (as strings).

    Blank lines are skipped: an empty edge would otherwise break the
    `edge[1]` lookup and the 3-way unpacking done by the callers.

    NOTE(review): the node ids are returned exactly as written in the files.
    If they are numbered per transcription (restarting at 0 in every file),
    they need to be offset before several transcriptions are merged into one
    graph -- confirm against the dataset format.
    """
    relationships = []
    for transcription_id in training_set:
        with open(path / f"{transcription_id}.txt", "r") as file:
            for line in file:
                fields = line.split()
                if fields:
                    relationships.append(fields)

    return relationships

def get_types(relationships):
    """Collect the distinct edge types of `relationships`.

    Args:
        relationships: iterable of edges whose second field (`edge[1]`) is the
            edge type.

    Returns:
        `(types, type_to_number)`: the distinct edge types as a sorted list and
        a dict mapping every type to its position in that list.

    The types are sorted so that their order is identical on every run;
    iterating a plain `set` of strings can yield a different order per process,
    which would make the graph metadata (and anything initialized from it)
    non-reproducible despite the fixed seeds.
    """
    types = sorted({edge[1] for edge in relationships})
    type_to_number = {edge_type: number for number, edge_type in enumerate(types)}
    return types, type_to_number

# Edge lists for, in order: the whole training data, the training subset and
# the validation subset (all read from the training folder).
relationships_X_training, relationships_X_train, relationships_X_val = (
    get_relationships(path_to_training, transcription_ids)
    for transcription_ids in (training_set, training_set_train, training_set_val)
)

# Edge types present in the training subset
types, _ = get_types(relationships_X_train)

### Construct Graph
```
Node --> Sentence
Edge --> Relationship between two sentences
```

In [None]:
def construct_graph(X, relationships, types, y=None, test=False):
    """Build a `HeteroData` graph with one "sentence" node type.

    Args:
        X: node feature matrix, converted to a float32 tensor.
        relationships: iterable of `(source, edge_type, target)` triples;
            `source` and `target` are converted with `int()`.
        types: edge types to create. Edges whose type is not listed are
            ignored.
        y: node labels, converted to a long tensor (only used when `test` is
            False). `None` is treated as an empty label list.
        test: when True, no labels are attached to the graph.

    Returns:
        The `HeteroData` object with `x` (and `y`) on the "sentence" nodes and
        one `("sentence", edge_type, "sentence")` edge index per type. A type
        without any edge gets an empty edge index of shape (2, 0).
    """
    # Group the (source, target) pairs by edge type in a single pass over the
    # edges instead of rescanning the whole edge list once per type.
    pairs_by_type = {edge_type: [] for edge_type in types}
    for source, edge_type, target in relationships:
        if edge_type in pairs_by_type:
            pairs_by_type[edge_type].append((int(source), int(target)))

    # HeteroData for different types of relationships
    data = HeteroData()

    data["sentence"].x = torch.tensor(X, dtype=torch.float32)
    if not test:
        data["sentence"].y = torch.tensor([] if y is None else y, dtype=torch.long)

    for edge_type, pairs in pairs_by_type.items():
        if pairs:
            # (num_edges, 2) -> (2, num_edges): row 0 = sources, row 1 = targets
            edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)
        data["sentence", edge_type, "sentence"].edge_index = edge_index
    return data

## Whole Data (built, then moved to the GPU)
data = construct_graph(
    X_training_scaled,
    relationships_X_training,
    get_types(relationships_X_training)[0],
    y_training,
).to('cuda:0')

## Train data
data_train = construct_graph(
    X_train,
    relationships_X_train,
    get_types(relationships_X_train)[0],
    y_train,
).to("cuda:0")

## Validation Data
data_val = construct_graph(
    X_val,
    relationships_X_val,
    get_types(relationships_X_val)[0],
    y_val,
).to("cuda:0")

## GNN Models

### GraphSAGE

In [None]:
class GraphSAGEClassifier(torch.nn.Module):
    """Two-layer GraphSAGE node classifier.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation.
        num_classes: number of output classes.
    """

    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        # `num_layers` is not a documented argument of SAGEConv (a single
        # convolution layer), so it is no longer passed: depending on the
        # installed torch_geometric version it was silently ignored or rejected.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, num_classes)

    def forward(self, x, edge_idx):
        """Return per-node log-probabilities of shape (num_nodes, num_classes)."""
        x = self.conv1(x, edge_idx)
        x = F.leaky_relu(x)
        # Dropout is only active in training mode
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_idx)

        return F.log_softmax(x, dim=1)

### GraphGATv2

In [None]:
class GraphGATv2Classifier(torch.nn.Module):
    """Two-layer GATv2 node classifier.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation.
        num_classes: number of output classes.
    """

    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        # `num_layers` is not a documented argument of GATv2Conv (a single
        # convolution layer), so it is no longer passed: depending on the
        # installed torch_geometric version it was silently ignored or rejected.
        self.conv1 = GATv2Conv(in_channels, hidden_channels)
        self.conv2 = GATv2Conv(hidden_channels, num_classes)

    def forward(self, x, edge_idx):
        """Return per-node log-probabilities of shape (num_nodes, num_classes)."""
        x = self.conv1(x, edge_idx)
        x = F.leaky_relu(x)
        # Dropout is only active in training mode
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_idx)
        return F.log_softmax(x, dim=1)

### GIN

In [None]:
class GraphGINClassifier(torch.nn.Module):
    """Two-layer GIN node classifier; each layer wraps a Linear + PReLU block.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation.
        num_classes: number of output classes.
    """

    def __init__(self, in_channels, hidden_channels, num_classes):
        super(GraphGINClassifier, self).__init__()
        # Each convolution is built right after its own MLP so the modules are
        # created (and initialized) in the same order as before.
        first_mlp = nn.Sequential(nn.Linear(in_channels, hidden_channels), nn.PReLU())
        self.conv1 = GINConv(first_mlp)
        second_mlp = nn.Sequential(nn.Linear(hidden_channels, num_classes), nn.PReLU())
        self.conv2 = GINConv(second_mlp)

    def forward(self, x, edge_idx):
        """Return per-node log-probabilities of shape (num_nodes, num_classes)."""
        hidden = F.leaky_relu(self.conv1(x, edge_idx))
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edge_idx)
        return F.log_softmax(logits, dim=1)

## Training

In [None]:
## Define hyperparameters
# lr: Adam learning rate, epochs: number of training epochs,
# batch_size: batch size handed to the data loaders.
config = dict(lr=0.0001, epochs=2500, batch_size=8)

## Definition of train and predict functions

In [None]:
def train_model(model, data):
    """Train `model` on the graph `data` and print the final training F1 score.

    The model is optimized with Adam (`config["lr"]`) for `config["epochs"]`
    epochs using a class-weighted cross-entropy (weights 1.0 / 5.0 for classes
    0 / 1) on the "sentence" nodes. The F1 score is computed from the outputs
    of the last epoch (produced in training mode).

    Args:
        model: heterogeneous model called as
            `model(batch.x_dict, batch.edge_index_dict)` and returning a dict
            with a "sentence" entry.
        data: graph holding "sentence" node features and labels.

    Returns:
        None. Progress is reported through the tqdm bar.
    """
    # Run on whatever device the model lives on instead of a hard-coded GPU id
    device = next(model.parameters()).device
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    # Loop-invariant, so built once rather than at every step
    class_weights = torch.tensor([1.0, 5.0], device=device)

    loader = DataLoader([data], batch_size=config["batch_size"], shuffle=True)

    # Defined up front so the function also works when config["epochs"] == 0
    train_outs = []
    train_targets = []

    # Training loop
    progress = tqdm(range(config["epochs"]))
    for _ in progress:
        model.train()
        train_outs = []
        train_targets = []
        train_losses = []
        for batch in loader:
            batch = batch.to(device)
            out = model(batch.x_dict, batch.edge_index_dict)
            loss = F.cross_entropy(out["sentence"], batch["sentence"].y, class_weights)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Keep detached outputs only, so no autograd graph is retained
            train_outs.append(out["sentence"].detach())
            # Flat list of labels: stays valid with batches of different sizes
            train_targets.extend(batch["sentence"].y.tolist())
            train_losses.append(loss.item())

        # Report the loss on the progress bar instead of printing one line per
        # step, which floods the output over thousands of epochs
        if train_losses:
            progress.set_postfix(loss=sum(train_losses) / len(train_losses))

    if not train_outs:
        print("Train F1 Score: not available (no training step was run)")
        return

    train_preds = np.argmax(torch.cat(train_outs).cpu().numpy(), axis=1)

    # Calculate F1 score
    f1 = f1_score(train_targets, train_preds)

    print("Train F1 Score:", f1)


def predict(model, data_test):
    """Predict a class index for every "sentence" node of `data_test`.

    Args:
        model: heterogeneous model called as
            `model(batch.x_dict, batch.edge_index_dict)` and returning a dict
            with a "sentence" entry.
        data_test: graph to run inference on.

    Returns:
        NumPy array with the arg-max class index of every node.
    """
    model.eval()
    # Same device handling as in training: the batch follows the model, so the
    # function does not depend on the caller having moved the graph beforehand.
    device = next(model.parameters()).device
    test_loader = DataLoader(
        [data_test], batch_size=config["batch_size"], shuffle=False
    )

    test_outs = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            test_out = model(batch.x_dict, batch.edge_index_dict)
            test_outs.append(test_out["sentence"])

    test_preds = np.argmax(torch.cat(test_outs).cpu().numpy(), axis=1)
    return test_preds

## Train model on train data only

In [None]:
# Build the three classifiers one after the other (SAGE, GATv2, GIN): each is
# instantiated, converted to a heterogeneous model with the metadata of the
# whole-data graph, and moved to the GPU.
model_SAGE, model_GATv2, model_GIN = (
    to_hetero(
        classifier_cls(in_channels=data["sentence"].x.shape[1], hidden_channels=64, num_classes=2),
        data.metadata(),
        aggr="mean",
    ).to('cuda:0')
    for classifier_cls in (GraphSAGEClassifier, GraphGATv2Classifier, GraphGINClassifier)
)

# Fit each model on the training sub-graph only
train_model(model_SAGE, data_train)
train_model(model_GATv2, data_train)
train_model(model_GIN, data_train)

## Evaluate model on validation data

In [None]:
# Model Evaluation
val_preds_SAGE = predict(model_SAGE, data_val)
val_preds_GAT = predict(model_GATv2, data_val)
val_preds_GIN = predict(model_GIN, data_val)

# Force the prediction of very short items to 0. The statements used to be
# comparisons (`== 0`) with no effect; they are assignments now.
# NOTE(review): X_val holds rows of the scaled embedding matrix, so
# len(X_val[i]) is presumably the embedding size and this rule never fires;
# applying it for real needs the cleaned text of each validation utterance.
# The threshold (4) also differs from the one used on the test set (5) --
# confirm which one is intended.
for i in range(len(X_val)):
    if len(X_val[i]) < 4:
        val_preds_SAGE[i] = 0
        val_preds_GAT[i] = 0
        val_preds_GIN[i] = 0


# Majority vote of the three models: the mean of the 0/1 predictions is
# thresholded at 0.5
val_preds_vote = np.mean([val_preds_SAGE, val_preds_GAT, val_preds_GIN], axis = 0)
val_preds_vote[val_preds_vote >= 0.5] = 1.
val_preds_vote[val_preds_vote < 0.5] = 0.


# Calculate F1 score for validation dataset
f1_SAGE = f1_score(y_val, val_preds_SAGE)
f1_GAT = f1_score(y_val, val_preds_GAT)
f1_GIN = f1_score(y_val, val_preds_GIN)
f1_vote = f1_score(y_val, val_preds_vote)

print("Validation F1 Score SAGE:", f1_SAGE)
print("Validation F1 Score GAT:", f1_GAT)
print("Validation F1 Score GIN:", f1_GIN)
print("Validation F1 Score Vote:", f1_vote)

conf_matrix = confusion_matrix(y_val, val_preds_vote)
print('Confusion Matrix:')
print(conf_matrix)

### Main Model Training

In [None]:
# Final models, trained from scratch on the whole training data.
# Each `.to('cuda:0')` must be applied to the freshly built `*_main` model:
# calling it on model_SAGE / model_GATv2 / model_GIN would discard the new
# model and keep training the validation-stage one instead.
model_SAGE_main = GraphSAGEClassifier(in_channels=data["sentence"].x.shape[1], hidden_channels=64, num_classes=2)
model_SAGE_main = to_hetero(model_SAGE_main, data.metadata(), aggr="mean")
model_SAGE_main = model_SAGE_main.to('cuda:0')

model_GAT_main = GraphGATv2Classifier(in_channels=data["sentence"].x.shape[1], hidden_channels=64, num_classes=2)
model_GAT_main = to_hetero(model_GAT_main, data.metadata(), aggr="mean")
model_GAT_main = model_GAT_main.to('cuda:0')

model_GIN_main = GraphGINClassifier(in_channels=data["sentence"].x.shape[1], hidden_channels=64, num_classes=2)
model_GIN_main = to_hetero(model_GIN_main, data.metadata(), aggr="mean")
model_GIN_main = model_GIN_main.to('cuda:0')

train_model(model_SAGE_main, data)
train_model(model_GAT_main, data)
train_model(model_GIN_main, data)

### Predict test labels

In [None]:
# Build the (unlabelled) test graph from the test relationship files and move
# it to the GPU.
# NOTE(review): the edge types come from the test relationships while the
# models were converted with the metadata of the training graph -- confirm
# that both sets of edge types are identical.
test_relationships = get_relationships(path_to_test, test_set)
data_test = construct_graph(X_test_scaled, test_relationships, get_types(test_relationships)[0], test = True)
data_test = data_test.to('cuda:0')

# Per-model class predictions for every test sentence
y_test_SAGE = predict(model_SAGE_main, data_test)
y_test_GAT = predict(model_GAT_main, data_test)
y_test_GIN = predict(model_GIN_main, data_test)

# Force the prediction of very short items to 0.
# NOTE(review): X_test was rebound to the output of bert.encode earlier in the
# notebook, so len(X_test[i]) presumably measures an embedding row rather than
# a sentence and this rule likely never fires -- confirm, and keep the cleaned
# text around if the rule is meant to apply. The threshold (5) also differs
# from the one used on the validation set (4).
for i in range(len(y_test_SAGE)):
  if (len(X_test[i]) < 5):
    y_test_SAGE[i] = 0
    y_test_GAT[i] = 0
    y_test_GIN[i] = 0

# Majority vote: mean of the three 0/1 predictions thresholded at 0.5
y_test_vote = np.mean([y_test_SAGE, y_test_GAT, y_test_GIN], axis = 0)
y_test_vote[y_test_vote >= 0.5] = 1.
y_test_vote[y_test_vote < 0.5] = 0.

### Generate Submission

In [None]:
# Cut the flat vote vector back into one label list per test transcription,
# using the per-transcription utterance counts stored in `idx`.
test_labels = {}
last_idx = 0
for transcription_id, n_utterances in zip(test_set, idx):
    next_idx = last_idx + n_utterances
    test_labels[transcription_id] = y_test_vote[last_idx:next_idx].tolist()
    last_idx = next_idx

def make_submission(labels=None, path="submission.csv"):
    """Write the predictions to a CSV submission file.

    The file starts with the header `id,target_feature` followed by one row
    per utterance: `<transcription_id>_<utterance index>,<label>`.

    Args:
        labels: mapping from transcription id to its list of labels. Defaults
            to the module-level `test_labels`.
        path: destination file. Defaults to "submission.csv".
    """
    if labels is None:
        labels = test_labels

    # The context manager closes the file even if writing fails midway
    with open(path, "w") as file:
        file.write("id,target_feature\n")
        for key, value in labels.items():
            for i, target in enumerate(value):
                file.write(key + "_" + str(i) + "," + str(target))
                file.write("\n")

# Write submission.csv from the labels collected in `test_labels`
make_submission()