In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
import tensorflow as tf

import json
from pathlib import Path

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/eleves-a/2021/abasse.dabere/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2023-11-23 16:32:49.773722: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-23 16:32:49.773763: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-23 16:32:50.006350: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-23 16:32:50.466225: I tensorflow/core

In [2]:
path_to_training = Path("training")
path_to_test = Path("test")

## Combinaison Transcription+graphe pour chaque dialogue

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]*>|(?:um|uh)', '', text)

    # Tokenization des mots
    words = word_tokenize(text)

    # Supprimer les mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (réduction à la racine des mots)
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]

    # Rejoindre les mots traités en une seule chaîne de texte
    processed_text = ' '.join(words)

    return processed_text


def get_xi(transcription_id):
       discourse_graph = [] # list, i attribute j
       attributes = dict() #dict, i -> "attribute1 attribute2"
       # transcription = [] # list, idx -> dict("speaker", "text", "index")
       x_i = [] # list, speaker: attribute: text
       attr_i = [] # list, attributes

       with open(path_to_training / f"{transcription_id}.json", 'r') as f:
              transcription = json.load(f)

       with open(path_to_training / f"{transcription_id}.txt", 'r') as f:
              for line in f: discourse_graph.append(line.strip())

       for line in discourse_graph:
              tmp = line.split()
              idx = int(tmp[-1])
              attributes[idx] = attributes.get(idx, "")+ tmp[1]

       for i in range(len(transcription)):
              replique = transcription[i]
              text = preprocess_text(replique['text'])
              attr_i.append(attributes.get(i,""))
              x_i.append(text)
       
       return x_i, attr_i


## Concatenater des x_i -> X et concatenation des y_i -> y

In [4]:
# Recup des ids
transcription_ids = []

transcripts = path_to_training.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [5]:
# creation de X contenant les repliques x_i
X = [] #list des repliques
A = [] #list des attributs
for transcription_id in  transcription_ids:
    x_i, attr_i = get_xi(transcription_id)
    X.extend(x_i)
    A.extend(attr_i)

In [6]:
print(len(X))
print(len(A))

72623
72623


In [12]:
Z = np.concatenate([np.array(X).reshape(-1,1), np.array(A).reshape(-1,1)], axis=1)
Z.shape

(72623, 2)

In [13]:
# creation de y contenant les labels pour chaque x_i
y = [] # concatenation des labels
with open("training_labels.json", 'r') as f:
    labels = json.load(f)

for transcription_id in transcription_ids:
    y.extend(labels[transcription_id])

## Séperation en Train et Valid

In [20]:
Z_train, Z_valid, y_train, y_valid = train_test_split(Z, y, test_size=0.3, random_state=42)

In [21]:
A_train = Z_train[:,1]
X_train = Z_train[:,0]

A_valid = Z_valid[:,1]
X_valid = Z_valid[:,0]

## Encoder

In [22]:
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

In [23]:
X_train = bert.encode(X_train, show_progress_bar=True)
X_valid = bert.encode(X_valid, show_progress_bar=True)

Batches: 100%|██████████| 1589/1589 [00:06<00:00, 237.66it/s]
Batches: 100%|██████████| 681/681 [00:02<00:00, 231.90it/s]


In [29]:
import pandas as pd

In [93]:
A_train_df = pd.Series(A_train)
A_valid_df = pd.Series(A_valid)

In [94]:
attributes = list(A_train_df.unique())
dic = dict(zip(attributes, [str(i) for i in range(len(attributes))]))

def b_dummies(x):
    return dic.get(x, '-1')

In [95]:
ordered_col = [str(i) for i in range(-1,len(attributes))] # with -1

A_train_df = A_train_df.apply(b_dummies)
A_train_df = pd.get_dummies(A_train_df, dtype=float)
A_train_df = A_train_df.reindex(columns=ordered_col, fill_value=0)
A_train = A_train_df.values

A_valid_df = A_valid_df.apply(b_dummies)
A_valid_df = pd.get_dummies(A_valid_df, dtype=float)
A_valid_df = A_valid_df.reindex(columns=ordered_col, fill_value=0)
A_valid = A_valid_df.values

In [99]:
Z_train = np.concatenate([X_train, A_train], axis=1)
Z_valid = np.concatenate([X_valid, A_valid], axis=1)

## Modele avec RandomForest

In [114]:
clf = RandomForestClassifier()
clf.fit(Z_train, y_train)

y_pred = clf.predict(Z_valid)
print("le f1_score est: "+ str(f1_score(y_valid, y_pred)))

le f1_score est: 0.26717101333864224


## Modele avec DNN

In [118]:
Z_valid.shape

(21787, 402)

In [126]:
model = tf.keras.Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [127]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [128]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=20, batch_size=32, validation_data=(Z_valid, y_valid))

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [129]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

0.5055176037834997


In [131]:
model_history.history.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

## Model 2

In [135]:
model = Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [136]:
count_class_0 = len(y_train) - np.sum(y_train)
count_class_1 = np.sum(y_train)

total = len(y_train)

frequency_class_0 = count_class_0 / total
frequency_class_1 = count_class_1 / total

inverse_weight_class_0 = 1 / frequency_class_0
inverse_weight_class_1 = 1 / frequency_class_1

class_weights = {0: inverse_weight_class_0, 1: inverse_weight_class_1}


In [137]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train, y_train, epochs=10, batch_size=32, validation_data=(Z_valid, y_valid), class_weight= class_weights)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [138]:
y_pred = model.predict(Z_valid)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

0.5651635720601238


## Model 3

In [189]:
from sklearn.preprocessing import StandardScaler

In [190]:
scaler = StandardScaler()
Z_train_scaled = scaler.fit_transform(Z_train)
Z_valid_scaled = scaler.transform(Z_valid)

In [191]:
model = Sequential([
    Dense(256, input_dim=402, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_history = model.fit(Z_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(Z_valid_scaled, y_valid), class_weight= class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [192]:
y_pred = model.predict(Z_valid_scaled)
y_pred = np.where(y_pred>=0.5, 1, 0)

print(f1_score(y_valid, y_pred))

0.5553819587205
