In [2]:
import networkx as nx
from torch_geometric.data import Data
import torch

from sentence_transformers import SentenceTransformer

import json
from pathlib import Path
import numpy as np

import matplotlib.pyplot as plt
import networkx as nx

import torch.nn.functional as F
from torch_geometric.nn import GCNConv

from sklearn.metrics import f1_score


In [3]:
path_to_training = Path("training")
path_to_test = Path("test")

# Recup des ids des diaoganls
transcription_ids = []
transcripts = path_to_training.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [4]:
# les labels du training
with open("training_labels.json", 'r') as f:
    transcription_labels = json.load(f)

In [5]:
bert = SentenceTransformer('distilbert-base-uncased')

No sentence-transformers model found with name /Users/dabereabasse/.cache/torch/sentence_transformers/distilbert-base-uncased. Creating a new one with MEAN pooling.


## Définir les arêtes et les types d'arêtes

In [17]:
def get_edge_list(transcription_id, path_to_data):
    discourse_graph = [] # list, i attribute j
    with open(path_to_data / f"{transcription_id}.txt", 'r') as f:
        for line in f:
            tmp = line.split()
            discourse_graph.append((int(tmp[0]), int(tmp[2]), tmp[1]))
    return discourse_graph

def get_replique(transcription_id, path_to_data):
    with open(path_to_data / f"{transcription_id}.json", 'r') as f:
        transcription = json.load(f)
    ret = []
    for i in range(len(transcription)):
        text = transcription[i]['text']
        ret.append(text)
    return ret

def create_data_object(transcription_id, labels=None, path_to_data = path_to_training):
    edge_list = get_edge_list(transcription_id, path_to_data)
    # Créer un graphe NetworkX
    G = nx.DiGraph()
    for src, dest, edge_type in edge_list:
        G.add_edge(src, dest, relation=edge_type)

    # Encoder les types d'arêtes
    edge_types = list(set([edge[2] for edge in edge_list]))
    edge_type_index = {edge_type: i for i, edge_type in enumerate(edge_types)}

    # Préparer les données pour PyTorch Geometric
    edges = []
    edge_attrs = []
    for src, dest, edge_type in G.edges(data='relation'):
        edges.append((src, dest))
        edge_attrs.append(edge_type_index[edge_type])

    # Conversion en Tenseurs PyTorch
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attrs, dtype=torch.float)

    x = torch.tensor(bert.encode(get_replique(transcription_id, path_to_data), show_progress_bar=True))
    y = torch.tensor(labels) if labels else None
    data = Data(edge_index=edge_index, edge_attr=edge_attr, x = x, y=y)
    
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[:int(data.num_nodes*0.8)] = 1
    test_mask[int(data.num_nodes*0.8):] = 1

    data.train_mask = train_mask
    data.test_mask = test_mask
    # Créer un objet Data pour PyTorch Geometric
    return data

In [18]:
# Créer un Data object pour chaque dialogue
graphs_data = [create_data_object(dialogue, transcription_labels[dialogue]) for dialogue in transcription_ids]

Batches: 100%|██████████| 24/24 [00:02<00:00,  9.67it/s]
Batches: 100%|██████████| 41/41 [00:03<00:00, 11.60it/s]
Batches: 100%|██████████| 11/11 [00:01<00:00, 10.24it/s]
Batches: 100%|██████████| 18/18 [00:01<00:00,  9.96it/s]
Batches: 100%|██████████| 13/13 [00:01<00:00, 10.18it/s]
Batches: 100%|██████████| 24/24 [00:02<00:00, 11.81it/s]
Batches: 100%|██████████| 43/43 [00:03<00:00, 12.57it/s]
Batches: 100%|██████████| 29/29 [00:03<00:00,  9.38it/s]
Batches: 100%|██████████| 38/38 [00:03<00:00, 11.82it/s]
Batches: 100%|██████████| 19/19 [00:01<00:00, 11.31it/s]
Batches: 100%|██████████| 31/31 [00:02<00:00, 11.73it/s]
Batches: 100%|██████████| 8/8 [00:00<00:00, 10.62it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00,  8.99it/s]
Batches: 100%|██████████| 21/21 [00:01<00:00, 10.80it/s]
Batches: 100%|██████████| 21/21 [00:01<00:00, 12.88it/s]
Batches: 100%|██████████| 36/36 [00:02<00:00, 12.72it/s]
Batches: 100%|██████████| 8/8 [00:00<00:00,  9.78it/s]
Batches: 100%|██████████| 23/23 [00:0

In [19]:
graphs_train = graphs_data[:int(len(graphs_data)*0.8)]
graphs_test = graphs_data[int(len(graphs_data)*0.8):]

## Modèle GCN

In [20]:
class GCNModel(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        #print(x.shape)  # Doit être de la forme [num_nodes, num_features]
        #print(edge_index.shape)  # Doit être de la forme [2, num_edges]
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

## Entraînement du Modèle

In [21]:
NUM_FEATURES = graphs_data[0].x.shape[1]
NUM_CLASSES = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCNModel(num_features=NUM_FEATURES, num_classes=NUM_CLASSES).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(50):
    total_loss = 0
    for data in graphs_train:  # Remplacez par vos données d'entraînement
        data = data.to(device)
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch}, Loss: {total_loss / len(graphs_data)}')


Epoch 0, Loss: 0.4159202443570206
Epoch 1, Loss: 0.38784941109185367
Epoch 2, Loss: 0.38682616064228964
Epoch 3, Loss: 0.38520228217557534
Epoch 4, Loss: 0.3862647706085874
Epoch 5, Loss: 0.38541838465277684
Epoch 6, Loss: 0.3854576647281647
Epoch 7, Loss: 0.38489022758818164
Epoch 8, Loss: 0.3838904533804077
Epoch 9, Loss: 0.3857198895252857
Epoch 10, Loss: 0.3853466307994017
Epoch 11, Loss: 0.38619133154141533
Epoch 12, Loss: 0.3847476084207751
Epoch 13, Loss: 0.38492779080400763
Epoch 14, Loss: 0.38607559867740904
Epoch 15, Loss: 0.3854608462028897
Epoch 16, Loss: 0.3852760069763538
Epoch 17, Loss: 0.38563971851289885
Epoch 18, Loss: 0.38524357713374896
Epoch 19, Loss: 0.38496773697666287
Epoch 20, Loss: 0.38678804992400495
Epoch 21, Loss: 0.3853058753554354
Epoch 22, Loss: 0.38633228392945124
Epoch 23, Loss: 0.3858658133708325
Epoch 24, Loss: 0.38608175639024717
Epoch 25, Loss: 0.3846877435433496
Epoch 26, Loss: 0.3847021583429317
Epoch 27, Loss: 0.3861591505635645
Epoch 28, Loss: 

## Évaluer le Modèle

In [30]:
model.eval()
all_preds = []
all_labels = []

for data in graphs_test:
    data = data.to(device)
    out = model(data)
    pred = out.argmax(dim=1)
    all_preds.extend(pred.cpu().numpy())
    all_labels.extend(data.y.cpu().numpy())

In [31]:
f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"F1-Score: {f1}")

F1-Score: 0.7660841248951766


In [36]:
set(all_preds)

{0}

## Nouvelles Données Test

In [25]:
# Recup des ids
test_transcript_ids = []

test_transcripts = path_to_test.glob('*.json')
for test_transcript in test_transcripts:
    test_transcript_ids.append(test_transcript.name[:-5])

In [26]:
test_data = [create_data_object(dialogue, path_to_data =path_to_test) for dialogue in test_transcript_ids]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches: 100%|██████████| 20/20 [00:01<00:00, 12.50it/s]
Batches: 100%|██████████| 17/17 [00:01<00:00,  9.76it/s]
Batches: 100%|██████████| 20/20 [00:02<00:00,  9.17it/s]
Batches: 100%|██████████| 25/25 [00:02<00:00, 10.41it/s]
Batches: 100%|██████████| 24/24 [00:02<00:00, 10.87it/s]
Batches: 100%|██████████| 20/20 [00:01<00:00, 12.61it/s]
Batches: 100%|██████████| 24/24 [00:02<00:00,  9.69it/s]
Batches: 100%|██████████| 15/15 [00:01<00:00,  8.90it/s]
Batches: 100%|██████████| 10/10 [00:01<00:00,  9.97it/s]
Batches: 100%|██████████| 19/19 [00:01<00:00, 10.99it/s]
Batches: 100%|██████████| 22/22 [00:02<00:00, 10.72it/s]
Batches: 100%|██████████| 20/20 [00:02<00:00,  9.92it/s]
Batches: 100%|██████████| 14/14 [00:01<00:00, 10.89it/s]
Batches: 100%|██████████| 21/21 [00:01<00:00, 11.68it/s]
Batches: 100%|██████████| 15/15 [00:01<00:00, 11.33it/s]
Batches: 100%|██████████| 22/22 [00:02<00:00, 10.20it/s]
Batches: 100%|██████████| 35/35 [00:03<00:00, 10.28it/s]
Batches: 100%|██████████| 14/14

In [27]:
test_labels_gcn = {}
for i in range(len(test_transcript_ids)):
    transcription_id = test_transcript_ids[i]
    data = test_data[i]
    data = data.to(device)
    out = model(data)
    pred = out.argmax(dim=1)
    pred = pred.cpu().numpy()
    test_labels_gcn[transcription_id] = pred.tolist()

In [28]:
uniques = set()
for keys in test_labels_gcn.keys():
    uniques.update(test_labels_gcn[keys])

In [29]:
uniques

{0}