In [7]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import spacy  # Pour la lemmatisation


In [8]:
import pandas as pd

# Load the CSV file
df_csv = pd.read_csv("questions_dataset.csv", header=None, names=["data"])

# Prepare an empty list to store transformed rows
transformed_data = []

# Iterate through each row
for _, row in df_csv.iterrows():
    # Split category into Category and Subcategory
    category_parts = row['data'].split(":")
    classe=category_parts[0]
    question_split=category_parts[1]
    # Split Question into Label and the rest
    label_question= question_split.split(' ')
    label = label_question[0]  
    question_rest =  " ".join(label_question[1:])  # The rest as the question  # Remaining part of the question
    
    # Append the transformed data as a tuple
    transformed_data.append(( question_rest,classe, label))

# Create a new DataFrame with the transformed data
df = pd.DataFrame(transformed_data, columns=["Question","Category", "Subcategory"])

# Save the transformed DataFrame to a new CSV file
df.to_csv("csv_data_file.csv", index=False)

In [9]:

df['Question'] = df['Question'].str.lower()  # Convertir le texte en minuscules
df['Question'] = df['Question'].str.replace(r'[^\w\s]', '', regex=True)  # Supprimer les ponctuations
df['Question'] = df['Question'].str.strip()  # Supprimer les espaces inutiles


In [12]:
# Charger les données
X_train, X_test, y_train, y_test = train_test_split(df['Question'], df['Category'], test_size=0.2, random_state=42)
nlp = spacy.load("en_core_web_sm")

# Tokenization et token de WH questions
def tokenize_and_process(text):
    tokens = nlp(text)
    processed_tokens = []
    for token in tokens:
        if token.is_alpha and not token.is_stop:
            if token.tag_ in ["WP", "WDT", "WP$", "WRB"]:  # WH questions POS tags
                processed_tokens.append(token.text.lower())  # Garder les WH tels quels
            else:
                lemmatized_token = token.lemma_.lower()  # Lemmatization pour les autres mots
                processed_tokens.append(lemmatized_token)
    return processed_tokens

# Formation du modèle Word2Vec
def train_word2vec(corpus, vector_size=100, window=4, min_count=5, workers=4):
    sentences = [tokenize_and_process(text) for text in corpus]
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

# Transformer les questions en vecteurs avec Word2Vec
def vectorize_with_word2vec(texts, model, size=100):
    vectorized_texts = []
    for text in texts:
        tokens = tokenize_and_process(text)
        vectors = [model.wv[token] for token in tokens if token in model.wv]  # Garde uniquement les mots présents dans le modèle
        if vectors:
            vectorized_texts.append(np.mean(vectors, axis=0))  # Moyenne des vecteurs
        else:
            vectorized_texts.append(np.zeros(size))  # Remplacer par un vecteur zéro si aucun token trouvé
    return np.array(vectorized_texts)

In [21]:
# Entraînement du modèle Word2Vec
word2vec_model = train_word2vec(X_train, vector_size=100, window=4, min_count=5, workers=4)

# Vectoriser le train et test set
X_train_vectors = vectorize_with_word2vec(X_train, word2vec_model)
X_test_vectors = vectorize_with_word2vec(X_test, word2vec_model)

# Classification avec Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train_vectors, y_train)

# Prédictions et évaluation
y_pred = classifier.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using Word2Vec: {accuracy}")

Accuracy using Word2Vec: 0.28780934922089824


In [14]:
from sklearn.preprocessing import LabelEncoder
# Encode les labels (les catégories)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [15]:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Définir le modèle ANN
def build_ann_model(input_size, hidden_layer_sizes, output_size, activation='relu'):
    model = Sequential()
    model.add(Dense(hidden_layer_sizes[0], input_dim=input_size, activation=activation))
    for units in hidden_layer_sizes[1:]:
        model.add(Dense(units, activation=activation))
    model.add(Dense(output_size, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Configuration des hyperparamètres
input_size = 100  # taille des vecteurs générés par Word2Vec
hidden_layer_sizes = [128, 64]  # Taille des couches cachées
output_size = len(label_encoder.classes_)  # Nombre d'étiquettes uniques


In [18]:
# Construction du modèle ANN
ann_model = build_ann_model(input_size, hidden_layer_sizes, output_size)

# Entraîner le modèle
history = ann_model.fit(X_train_vectors, y_train_encoded, epochs=100, batch_size=64, validation_split=0.2, verbose=1)

# Prédictions sur le jeu de test
y_pred_ann = ann_model.predict(X_test_vectors)
y_pred_classes_ann = np.argmax(y_pred_ann, axis=1)

# Évaluation du modèle
accuracy_ann = accuracy_score(y_test_encoded, y_pred_classes_ann)
print(f"Test set accuracy with ANN: {accuracy_ann:.4f}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2235 - loss: 1.7540 - val_accuracy: 0.2222 - val_loss: 1.6560
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2538 - loss: 1.6436 - val_accuracy: 0.2314 - val_loss: 1.6313
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2603 - loss: 1.6302 - val_accuracy: 0.2658 - val_loss: 1.6132
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2903 - loss: 1.6061 - val_accuracy: 0.2955 - val_loss: 1.5979
Epoch 5/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3185 - loss: 1.5838 - val_accuracy: 0.2864 - val_loss: 1.5871
Epoch 6/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3296 - loss: 1.5704 - val_accuracy: 0.3013 - val_loss: 1.5752
Epoch 7/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━

In [20]:
from sklearn.metrics import classification_report, accuracy_score

# Prédictions sur le jeu de test
y_pred_ann = ann_model.predict(X_test_vectors)
y_pred_classes_ann = np.argmax(y_pred_ann, axis=1)

# Evaluation du modèle ANN
accuracy_ann = accuracy_score(y_test_encoded, y_pred_classes_ann)
print(f"Test set accuracy with ANN: {accuracy_ann:.4f}")

# Générer le rapport de classification
target_names = label_encoder.classes_  # Obtenir les noms des catégories
report_ann = classification_report(y_test_encoded, y_pred_classes_ann, target_names=target_names, output_dict=True)

# Rapport final
print("Classification Report:\n")
report_ann

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 592us/step
Test set accuracy with ANN: 0.5133
Classification Report:



{'ABBR': {'precision': 0.7692307692307693,
  'recall': 0.43478260869565216,
  'f1-score': 0.5555555555555556,
  'support': 23.0},
 'DESC': {'precision': 0.5060728744939271,
  'recall': 0.5506607929515418,
  'f1-score': 0.5274261603375527,
  'support': 227.0},
 'ENTY': {'precision': 0.4954545454545455,
  'recall': 0.44308943089430897,
  'f1-score': 0.4678111587982833,
  'support': 246.0},
 'HUM': {'precision': 0.43125,
  'recall': 0.575,
  'f1-score': 0.4928571428571429,
  'support': 240.0},
 'LOC': {'precision': 0.6447368421052632,
  'recall': 0.5903614457831325,
  'f1-score': 0.6163522012578616,
  'support': 166.0},
 'NUM': {'precision': 0.5755395683453237,
  'recall': 0.42328042328042326,
  'f1-score': 0.4878048780487805,
  'support': 189.0},
 'accuracy': 0.5132905591200734,
 'macro avg': {'precision': 0.5703807666049715,
  'recall': 0.5028624502675099,
  'f1-score': 0.5246345161425294,
  'support': 1091.0},
 'weighted avg': {'precision': 0.5258991407800067,
  'recall': 0.51329055912