In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
import tensorflow as tf

import json
from pathlib import Path

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2023-11-25 18:44:56.729095: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-25 18:44:56.729139: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-25 18:44:56.730301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-25 18:44:56.738447: I tensorflow/core

In [2]:
path_to_training = Path("training")
path_to_test = Path("test")

## Combinaison Transcription+graphe pour chaque dialogue

In [36]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]*>|(?:um|uh)', '', text)

    # Tokenization des mots
    words = word_tokenize(text)

    # Supprimer les mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (réduction à la racine des mots)
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]

    # Rejoindre les mots traités en une seule chaîne de texte
    processed_text = ' '.join(words)

    return processed_text


def get_xi(transcription_id):
       discourse_graph = [] # list, i attribute j
       with open(path_to_training / f"{transcription_id}.txt", 'r') as f:
              for line in f: discourse_graph.append(line.strip())
       
       with open(path_to_training / f"{transcription_id}.json", 'r') as f:
              transcription = json.load(f)

       x_i = [] # list, speaker: attribute: text
       prefix_i = ["" for i in range(len(transcription))] # list des prelations pour lesquelles i est prefixes
       suffix_j = ["" for i in range(len(transcription))] # list des prelations pour lesquelles j est suffixes

       for line in discourse_graph:
              tmp = line.split()
              i = int(tmp[0])
              j = int(tmp[-1])

              if prefix_i[i]!= "": prefix_i[i]+=','
              prefix_i[i] += ("p"+tmp[1])

              if suffix_j[j]!= "": suffix_j[j]+=','
              suffix_j[j] += ("s"+tmp[1])

       for i in range(len(transcription)):
              replique = transcription[i]
              text = preprocess_text(replique['text'])
              x_i.append(text)

       return x_i, prefix_i, suffix_j


## Concatenater des: x_i -> X, prefix_i -> A, suffix_i -> B

In [37]:
# Recup des ids
transcription_ids = []

transcripts = path_to_training.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [38]:
# creation de X contenant les repliques x_i
X = [] #list des repliques
A = [] #list des prefixes
B = [] #list des suffixes

for transcription_id in  transcription_ids:
    x_i, prefix_i, suffix_j = get_xi(transcription_id)
    X.extend(x_i)
    A.extend(prefix_i)
    B.extend(suffix_j)

In [39]:
print("len(X):", len(X))
print("len(A):", len(A))
print("len(B):", len(B))

len(X): 72623
len(A): 72623
len(B): 72623


## Concatenation X+A+B -> Z, recuperation de y

In [42]:
Z = np.concatenate([np.array(X).reshape(-1,1), np.array(A).reshape(-1,1), np.array(B).reshape(-1,1)], axis=1)
Z.shape

(72623, 3)

In [48]:
# creation de y contenant les labels pour chaque x_i
y = [] # concatenation des labels
with open("training_labels.json", 'r') as f:
    labels = json.load(f)

for transcription_id in transcription_ids:
    y.extend(labels[transcription_id])

y = np.array(y).reshape(-1,1)

print(y.shape)

(72623, 1)


## Séperation en Train et Valid

In [49]:
Z_train, Z_valid, y_train, y_valid = train_test_split(Z, y, test_size=0.3, random_state=42)

## Recup de A_train, B_train et transformation en variables dummies

In [50]:
X_train = Z_train[:,0]
A_train = Z_train[:,1]
B_train = Z_train[:,2]

In [52]:
A_train = pd.DataFrame(A_train, columns=['variable'])
A_train = A_train['variable'].str.get_dummies(sep=',')

B_train = pd.DataFrame(B_train, columns=['variable'])
B_train = B_train['variable'].str.get_dummies(sep=',')

In [59]:
A_train_cols = A_train.columns
B_train_cols = B_train.columns

A_train = A_train.values
B_train = B_train.values

In [64]:
A_train[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [63]:
A_train_cols

Index(['pAcknowledgement', 'pAlternation', 'pBackground',
       'pClarification_question', 'pComment', 'pConditional', 'pContinuation',
       'pContrast', 'pCorrection', 'pElaboration', 'pExplanation',
       'pNarration', 'pParallel', 'pQ-Elab', 'pQuestion-answer_pair',
       'pResult'],
      dtype='object')

## Vectorisation de X_train et Reconstruction de Z_train = X_train + A_train + B_train

In [65]:
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

X_train_bert = bert.encode(X_train, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1589/1589 [00:20<00:00, 76.39it/s]


In [66]:
Z_train = np.concatenate([X_train_bert, A_train, B_train], axis=1)

## Recup de A_valid, B_valid et transformation en variables dummies avec les memes colonnes que A_train, B_train

In [67]:
X_valid = Z_valid[:,0]
A_valid = Z_valid[:,1]
B_valid = Z_valid[:,2]

In [68]:
# variables dummies
A_valid = pd.DataFrame(A_valid, columns=['variable'])
A_valid = A_valid['variable'].str.get_dummies(sep=',')

B_valid = pd.DataFrame(B_valid, columns=['variable'])
B_valid = B_valid['variable'].str.get_dummies(sep=',')

In [69]:
A_valid.head()

Unnamed: 0,pAcknowledgement,pAlternation,pBackground,pClarification_question,pComment,pConditional,pContinuation,pContrast,pCorrection,pElaboration,pExplanation,pNarration,pParallel,pQ-Elab,pQuestion-answer_pair,pResult
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [78]:
# rearangement des colonnes de A_valid et B_valid pour que ça soit pareil avec celles des train

A_valid = A_valid.reindex(columns=A_train_cols).fillna(0)
B_valid = B_valid.reindex(columns=B_train_cols).fillna(0)

A_valid = A_valid.values
B_valid = B_valid.values

## Vectorisation de X_valid et Reconstruction de Z_valid = X_valid + A_valid + B_valid

In [79]:
X_valid_bert = bert.encode(X_valid, show_progress_bar=True)

Batches: 100%|██████████| 681/681 [00:08<00:00, 80.67it/s]


In [80]:
Z_valid = np.concatenate([X_valid_bert, A_valid, B_valid], axis=1)

# Modele avec DNN

In [88]:
print('Z_train.shape:', Z_train.shape)
print('Z_valid.shape:', Z_valid.shape)

Z_train.shape: (50836, 416)
Z_valid.shape: (21787, 416)


In [93]:
model = tf.keras.Sequential([
    Dense(200, input_dim=416, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [100]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=10, batch_size=32, validation_data=(Z_valid, y_valid))

2023-11-25 20:19:58.569432: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 80.67MiB (rounded to 84591104)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-11-25 20:19:58.569471: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-11-25 20:19:58.569484: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 77, Chunks in use: 77. 19.2KiB allocated for chunks. 19.2KiB in use in bin. 2.7KiB client-requested in use in bin.
2023-11-25 20:19:58.569489: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 5, Chunks in use: 5. 2.5KiB allocated for chunks. 2.5KiB in use in bin. 2.4KiB client-requested in use in bin.
2023-11-25 20:19:58.5694

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

0.4777496651287394


## Modele avec DNN avec class_weight

In [None]:
model = Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

count_class_0 = len(y_train) - np.sum(y_train)
count_class_1 = np.sum(y_train)
total = len(y_train)
frequency_class_0 = count_class_0 / total
frequency_class_1 = count_class_1 / total
inverse_weight_class_0 = 1 / frequency_class_0
inverse_weight_class_1 = 1 / frequency_class_1
class_weights = {0: inverse_weight_class_0, 1: inverse_weight_class_1}

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=10, batch_size=32, validation_data=(Z_valid, y_valid), class_weight= class_weights)

In [None]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))