In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
import tensorflow as tf

import json
from pathlib import Path

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
path_to_training = Path("training")
path_to_test = Path("test")

## Combinaison Transcription+graphe pour chaque dialogue

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]*>|(?:um|uh)', '', text)

    # Tokenization des mots
    words = word_tokenize(text)

    # Supprimer les mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (réduction à la racine des mots)
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]

    # Rejoindre les mots traités en une seule chaîne de texte
    processed_text = ' '.join(words)

    return processed_text


def get_xi(transcription_id):
       discourse_graph = [] # list, i attribute j
       attributes = dict() #dict, i -> "attribute1 attribute2"
       # transcription = [] # list, idx -> dict("speaker", "text", "index")
       x_i = [] # list, speaker: attribute: text
       attr_i = [] # list, attributes

       with open(path_to_training / f"{transcription_id}.json", 'r') as f:
              transcription = json.load(f)

       with open(path_to_training / f"{transcription_id}.txt", 'r') as f:
              for line in f: discourse_graph.append(line.strip())

       for line in discourse_graph:
              tmp = line.split()
              idx = int(tmp[-1])
              attributes[idx] = attributes.get(idx, "")+ tmp[1]

       for i in range(len(transcription)):
              replique = transcription[i]
              text = preprocess_text(replique['text'])
              attr_i.append(attributes.get(i,""))
              x_i.append(text)
       
       return x_i, attr_i


## Concatenater des x_i -> X et concatenation des y_i -> y

In [4]:
# Recup des ids
transcription_ids = []

transcripts = path_to_training.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [5]:
# creation de X contenant les repliques x_i
X = [] #list des repliques
A = [] #list des attributs
for transcription_id in  transcription_ids:
    x_i, attr_i = get_xi(transcription_id)
    X.extend(x_i)
    A.extend(attr_i)

In [6]:
print(len(X))
print(len(A))

72623
72623


In [7]:
Z = np.concatenate([np.array(X).reshape(-1,1), np.array(A).reshape(-1,1)], axis=1)
Z.shape

(72623, 2)

In [8]:
# creation de y contenant les labels pour chaque x_i
y = [] # concatenation des labels
with open("training_labels.json", 'r') as f:
    labels = json.load(f)

for transcription_id in transcription_ids:
    y.extend(labels[transcription_id])

## Séperation en Train et Valid

In [9]:
Z_train, Z_valid, y_train, y_valid = train_test_split(Z, y, test_size=0.3, random_state=42)

In [10]:
A_train = Z_train[:,1]
X_train = Z_train[:,0]

A_valid = Z_valid[:,1]
X_valid = Z_valid[:,0]

y_train = np.array(y_train)
y_valid = np.array(y_valid)

## Encoder

In [11]:
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
X_train_bert = bert.encode(X_train, show_progress_bar=True)
X_valid_bert = bert.encode(X_valid, show_progress_bar=True)

Batches: 100%|██████████| 1589/1589 [00:09<00:00, 169.75it/s]
Batches: 100%|██████████| 681/681 [00:03<00:00, 179.56it/s]


In [15]:
A_train_df = pd.Series(A_train)
A_valid_df = pd.Series(A_valid)

In [16]:
attributes = list(A_train_df.unique())
dic = dict(zip(attributes, [str(i) for i in range(len(attributes))]))

def b_dummies(x):
    return dic.get(x, '-1')

In [17]:
ordered_col = [str(i) for i in range(-1,len(attributes))] # with -1

A_train_df = A_train_df.apply(b_dummies)
A_train_df = pd.get_dummies(A_train_df, dtype=float)
A_train_df = A_train_df.reindex(columns=ordered_col, fill_value=0)
A_train = A_train_df.values

A_valid_df = A_valid_df.apply(b_dummies)
A_valid_df = pd.get_dummies(A_valid_df, dtype=float)
A_valid_df = A_valid_df.reindex(columns=ordered_col, fill_value=0)
A_valid = A_valid_df.values

In [18]:
Z_train = np.concatenate([X_train_bert, A_train], axis=1)
Z_valid = np.concatenate([X_valid_bert, A_valid], axis=1)

## Modele avec DNN

In [19]:
Z_valid.shape

(21787, 402)

In [20]:
model = tf.keras.Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

2023-11-25 18:37:09.812635: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-25 18:37:09.838992: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-25 18:37:09.839200: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [21]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=20, batch_size=32, validation_data=(Z_valid, y_valid))

Epoch 1/20


2023-11-25 18:37:12.899633: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f2bac9552e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-25 18:37:12.899662: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2023-11-25 18:37:12.943602: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-25 18:37:13.027145: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
I0000 00:00:1700933833.119790 1667159 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

  1/681 [..............................] - ETA: 46s

0.5025030442429982


## Modele avec DNN avec class_weight

In [23]:
model = Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [24]:
count_class_0 = len(y_train) - np.sum(y_train)
count_class_1 = np.sum(y_train)

total = len(y_train)

frequency_class_0 = count_class_0 / total
frequency_class_1 = count_class_1 / total

inverse_weight_class_0 = 1 / frequency_class_0
inverse_weight_class_1 = 1 / frequency_class_1

class_weights = {0: inverse_weight_class_0, 1: inverse_weight_class_1}


In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=10, batch_size=32, validation_data=(Z_valid, y_valid), class_weight= class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

0.5652958152958153
