In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

import re
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout

import jsonk
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
path_to_training = Path("training")
path_to_test = Path("test")

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]*>|(?:um|uh)', '', text)

    # Tokenization des mots
    words = word_tokenize(text)

    # Supprimer les mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (réduction à la racine des mots)
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]

    # Rejoindre les mots traités en une seule chaîne de texte
    processed_text = ' '.join(words)

    return processed_text


def get_xi(transcription_id, path_to_data = path_to_training):
       discourse_graph = [] # list, i attribute j
       with open(path_to_data / f"{transcription_id}.txt", 'r') as f:
              for line in f: discourse_graph.append(line.strip())
       
       with open(path_to_data / f"{transcription_id}.json", 'r') as f:
              transcription = json.load(f)

       x_i = [] # list, speaker: attribute: text
       prefix_i = ["" for i in range(len(transcription))] # list des prelations pour lesquelles i est prefixes
       suffix_j = ["" for i in range(len(transcription))] # list des prelations pour lesquelles j est suffixes

       for line in discourse_graph:
              tmp = line.split()
              i = int(tmp[0])
              j = int(tmp[-1])

              if prefix_i[i]!= "": prefix_i[i]+=','
              prefix_i[i] += ("p"+tmp[1])

              if suffix_j[j]!= "": suffix_j[j]+=','
              suffix_j[j] += ("s"+tmp[1])

       for i in range(len(transcription)):
              replique = transcription[i]
              text = preprocess_text(replique['text'])
              x_i.append(text)

       return x_i, prefix_i, suffix_j

In [5]:
# Recup des ids
transcription_ids = []

transcripts = path_to_training.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [6]:
# creation de X contenant les repliques x_i
X = [] #list des repliques
A = [] #list des prefixes
B = [] #list des suffixes

for transcription_id in  transcription_ids:
    x_i, prefix_i, suffix_j = get_xi(transcription_id)
    X.extend(x_i)
    A.extend(prefix_i)
    B.extend(suffix_j)

In [7]:
Z = np.concatenate([np.array(X).reshape(-1,1), np.array(A).reshape(-1,1), np.array(B).reshape(-1,1)], axis=1)
Z.shape

(72623, 3)

In [8]:
# creation de y contenant les labels pour chaque x_i
y = [] # concatenation des labels
with open("training_labels.json", 'r') as f:
    labels = json.load(f)

for transcription_id in transcription_ids:
    y.extend(labels[transcription_id])

y = np.array(y).reshape(-1,1)

print(y.shape)

(72623, 1)


### Séparation en train_test_spli

In [9]:
Z_train, Z_valid, y_train, y_valid = train_test_split(Z, y, test_size=0.3, random_state=42)

### Recup de A_train, B_train et transformation en variables dummies

In [10]:
X_train = Z_train[:,0]
A_train = Z_train[:,1]
B_train = Z_train[:,2]

A_train = pd.DataFrame(A_train, columns=['variable'])
A_train = A_train['variable'].str.get_dummies(sep=',')

B_train = pd.DataFrame(B_train, columns=['variable'])
B_train = B_train['variable'].str.get_dummies(sep=',')

A_train_cols = A_train.columns
B_train_cols = B_train.columns

A_train = A_train.values
B_train = B_train.values

## Vectorisation de X_train et Reconstruction de Z_train = X_train + A_train + B_train

In [11]:
bert = SentenceTransformer('distilbert-base-uncased')
X_train_bert = bert.encode(X_train, show_progress_bar=True)

Z_train = np.concatenate([X_train_bert, A_train, B_train], axis=1)

No sentence-transformers model found with name /Users/dabereabasse/.cache/torch/sentence_transformers/distilbert-base-uncased. Creating a new one with MEAN pooling.
Batches: 100%|██████████| 1589/1589 [01:40<00:00, 15.74it/s]


### Recup de A_valid, B_valid et transformation en variables dummies avec les memes colonnes que A_train, B_train

In [12]:
X_valid = Z_valid[:,0]
A_valid = Z_valid[:,1]
B_valid = Z_valid[:,2]

# variables dummies
A_valid = pd.DataFrame(A_valid, columns=['variable'])
A_valid = A_valid['variable'].str.get_dummies(sep=',')

B_valid = pd.DataFrame(B_valid, columns=['variable'])
B_valid = B_valid['variable'].str.get_dummies(sep=',')

# rearangement des colonnes de A_valid et B_valid pour que ça soit pareil avec celles des train

A_valid = A_valid.reindex(columns=A_train_cols).fillna(0)
B_valid = B_valid.reindex(columns=B_train_cols).fillna(0)

A_valid = A_valid.values
B_valid = B_valid.values

In [13]:
X_valid_bert = bert.encode(X_valid, show_progress_bar=True)
Z_valid = np.concatenate([X_valid_bert, A_valid, B_valid], axis=1)

Batches: 100%|██████████| 681/681 [00:43<00:00, 15.58it/s]


## DNN

In [14]:
count_class_0 = len(y_train) - np.sum(y_train)
count_class_1 = np.sum(y_train)

total = len(y_train)

frequency_class_0 = count_class_0 / total
frequency_class_1 = count_class_1 / total

inverse_weight_class_0 = 1 / frequency_class_0
inverse_weight_class_1 = 1 / frequency_class_1

class_weights = {0: inverse_weight_class_0, 1: inverse_weight_class_1}

In [15]:
arch = [400, 200, 100]
p = [0.16496388, 0.49430627, 0.37350436]
trashold = 0.6696969696969697
# f1_score = 0.5905423347972628

model = Sequential([
    Dense(arch[0], input_dim=800, activation='relu'),
    Dropout(p[0]),
    Dense(arch[1], activation='relu'),
    Dropout(p[1]),
    Dense(arch[2], activation='relu'),
    Dropout(p[2]),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(Z_train, y_train, epochs=30, batch_size=32, class_weight=class_weights)

#y_pred_dnn = model.predict(Z_valid)
#y_pred = np.where(y_pred_dnn>=trashold, 1, 0)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2b4061730>

In [16]:
y_pred_dnn = model.predict(Z_valid)
y_pred = np.where(y_pred_dnn>=trashold, 1, 0)

f1_score(y_valid, y_pred)



0.5687980574666127

## Prediction sur le test

In [17]:
# Recup des ids
transcription_ids = []

transcripts = path_to_test.glob('*.json')
for transcript in transcripts:
    transcription_ids.append(transcript.name[:-5])

In [18]:
test_labels_model4 = {}

for transcription_id in transcription_ids:
    X, A, B = get_xi(transcription_id, path_to_test)
    Z_test = np.concatenate([np.array(X).reshape(-1,1), np.array(A).reshape(-1,1), np.array(B).reshape(-1,1)], axis=1)

    X_test = Z_test[:,0]
    A_test = Z_test[:,1]
    B_test = Z_test[:,2]

    # variables dummies
    A_test = pd.DataFrame(A_test, columns=['variable'])
    A_test = A_test['variable'].str.get_dummies(sep=',')

    B_test = pd.DataFrame(B_test, columns=['variable'])
    B_test = B_test['variable'].str.get_dummies(sep=',')

    # rearangement des colonnes de A_valid et B_valid pour que ça soit pareil avec celles des train

    A_test = A_test.reindex(columns=A_train_cols).fillna(0)
    B_test = B_test.reindex(columns=B_train_cols).fillna(0)

    A_test = A_test.values
    B_test = B_test.values

    X_test_bert = bert.encode(X_test, show_progress_bar=True)
    Z_test = np.concatenate([X_test_bert, A_test, B_test], axis=1)

    test_pred = model.predict(Z_test)
    test_pred = np.where(test_pred>=trashold, 1, 0)

    test_labels_model4[transcription_id] = test_pred.reshape(-1,).tolist()

Batches: 100%|██████████| 20/20 [00:01<00:00, 15.51it/s]




Batches: 100%|██████████| 17/17 [00:01<00:00, 12.69it/s]




Batches: 100%|██████████| 20/20 [00:01<00:00, 10.43it/s]




Batches: 100%|██████████| 25/25 [00:01<00:00, 13.04it/s]




Batches: 100%|██████████| 24/24 [00:01<00:00, 16.13it/s]




Batches: 100%|██████████| 20/20 [00:01<00:00, 16.01it/s]




Batches: 100%|██████████| 24/24 [00:01<00:00, 13.08it/s]




Batches: 100%|██████████| 15/15 [00:01<00:00, 12.62it/s]




Batches: 100%|██████████| 10/10 [00:00<00:00, 13.39it/s]




Batches: 100%|██████████| 19/19 [00:01<00:00, 13.49it/s]




Batches: 100%|██████████| 22/22 [00:01<00:00, 14.40it/s]




Batches: 100%|██████████| 20/20 [00:01<00:00, 14.32it/s]

 1/20 [>.............................] - ETA: 0s






Batches: 100%|██████████| 14/14 [00:00<00:00, 16.03it/s]




Batches: 100%|██████████| 21/21 [00:01<00:00, 16.63it/s]




Batches: 100%|██████████| 15/15 [00:00<00:00, 16.05it/s]




Batches: 100%|██████████| 22/22 [00:01<00:00, 14.92it/s]




Batches: 100%|██████████| 35/35 [00:02<00:00, 14.39it/s]




Batches: 100%|██████████| 14/14 [00:01<00:00, 12.64it/s]




Batches: 100%|██████████| 27/27 [00:02<00:00, 12.97it/s]




Batches: 100%|██████████| 30/30 [00:02<00:00, 13.60it/s]




Batches: 100%|██████████| 29/29 [00:02<00:00, 14.23it/s]




Batches: 100%|██████████| 22/22 [00:01<00:00, 15.24it/s]

 1/22 [>.............................] - ETA: 0s






Batches: 100%|██████████| 22/22 [00:01<00:00, 15.12it/s]




Batches: 100%|██████████| 52/52 [00:02<00:00, 17.35it/s]

 1/52 [..............................] - ETA: 0s






Batches: 100%|██████████| 30/30 [00:01<00:00, 15.64it/s]




Batches: 100%|██████████| 8/8 [00:00<00:00, 12.26it/s]




Batches: 100%|██████████| 12/12 [00:00<00:00, 13.17it/s]




Batches: 100%|██████████| 25/25 [00:01<00:00, 13.68it/s]




Batches: 100%|██████████| 37/37 [00:02<00:00, 16.96it/s]




Batches: 100%|██████████| 40/40 [00:02<00:00, 16.78it/s]




Batches: 100%|██████████| 24/24 [00:01<00:00, 14.58it/s]




Batches: 100%|██████████| 32/32 [00:01<00:00, 17.84it/s]




Batches: 100%|██████████| 43/43 [00:02<00:00, 17.98it/s]




Batches: 100%|██████████| 9/9 [00:00<00:00, 14.18it/s]




Batches: 100%|██████████| 10/10 [00:00<00:00, 13.25it/s]




Batches: 100%|██████████| 34/34 [00:02<00:00, 16.10it/s]




Batches: 100%|██████████| 38/38 [00:02<00:00, 15.27it/s]




Batches: 100%|██████████| 37/37 [00:02<00:00, 15.84it/s]




Batches: 100%|██████████| 24/24 [00:01<00:00, 12.88it/s]




Batches: 100%|██████████| 46/46 [00:03<00:00, 13.95it/s]

 1/46 [..............................] - ETA: 0s






## make_submission

In [19]:
def make_submission(test_labels, filename= "submission"):
    file = open(filename+".csv", "w")
    file.write("id,target_feature\n")
    for key, value in test_labels.items():
        u_id = [key + "_" + str(i) for i in range(len(value))]
        target = map(str, value) 
        for row in zip(u_id, target):
            file.write(",".join(row))
            file.write("\n")
    file.close()

In [20]:
make_submission(test_labels_model4, "submission_final")