In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import spacy  # Pour la lemmatisation


In [11]:
import pandas as pd

# Load the CSV file
df_train_csv = pd.read_csv("questions_dataset.csv", header=None, names=["data"])

# Prepare an empty list to store transformed rows
transformed_data = []

# Iterate through each row
for _, row in df_train_csv.iterrows():
    # Split category into Category and Subcategory
    category_parts = row['data'].split(":")
    classe=category_parts[0]
    question_split=category_parts[1]
    # Split Question into Label and the rest
    label_question= question_split.split(' ')
    label = label_question[0]  
    question_rest =  " ".join(label_question[1:])  # The rest as the question  # Remaining part of the question
    
    # Append the transformed data as a tuple
    transformed_data.append(( question_rest,classe, label))

# Create a new DataFrame with the transformed data
df_train = pd.DataFrame(transformed_data, columns=["Question","Category", "Subcategory"])

# Save the transformed DataFrame to a new CSV file
df_train.to_csv("csv_data_file_for_test3.csv", index=False)

In [12]:
import pandas as pd

# Open the text file and read line by line
with open("test_dataset.txt", "r") as file:
    lines = file.readlines()

# Prepare an empty list to store transformed rows
transformed_data = []

# Iterate through each line in the file
for line in lines:
    line = line.strip()  # Remove any extra spaces or newlines at the beginning and end
    
    # Split the line into category and question based on the first ':'
    if ":" in line:
        category_parts = line.split(":", 1)  # Split into category and the rest of the question
        classe = category_parts[0].strip()  # The category part
        question = category_parts[1].strip()  # The rest as the question
        
        # Split the question into label and the rest of the question
        label_question = question.split(' ', 1)
        label = label_question[0].strip() if len(label_question) > 0 else ""
        question_rest = label_question[1].strip() if len(label_question) > 1 else ""
        
        # Append the transformed data as a tuple
        transformed_data.append((question_rest, classe, label))

# Create a new DataFrame with the transformed data
df_test = pd.DataFrame(transformed_data, columns=["Question", "Category", "Subcategory"])

# Save the transformed DataFrame to a new CSV file (still can save as CSV even if original was TXT)
df_test.to_csv("test_txt_for_test3.csv", index=False)


In [13]:
import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")  # Load spaCy model for text processing
stemmer = PorterStemmer()  # Use the Porter stemming algorithm

def preprocess_text_keep_significant_words(text):
    doc = nlp(text)  # Analyze the text using spaCy
    stemmed_tokens = []
    
    for token in doc:
        # Skip stop words, except for WH-questions
        if token.is_stop and token.tag_ not in ["WP", "WDT", "WP$", "WRB"]:
            continue  # Skip the stop words that are not WH-questions
        
        # Stemming of WH-questions
        elif token.tag_ in ["WP", "WDT", "WP$", "WRB"]:  # WH-question POS tags
            lemmatized_token = token.lemma_  # Stemming and lemma WH-questions
            stemmed_tokens.append(lemmatized_token)  # Add the stemmed WH-question
        else:
            lemmatized_token = token.lemma_  # Stemming and lemma for other words
            stemmed_tokens.append(lemmatized_token)  # Add the stemmed word
    
    return ' '.join(stemmed_tokens)  # Return the preprocessed sentence as a string


In [14]:
X_train = df_train["Question"]
X_test = df_test["Question"]

# Step 1: Classify into Category
y_train_category = df_train["Category"]
y_test_category = df_test["Category"]

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialiser le TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2),max_features=900)


In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np


# Data preparation (same as before)
df_train["Combined_Label"] = df_train["Category"] + "_" + df_train["Subcategory"]
df_test["Combined_Label"] = df_test["Category"] + "_" + df_test["Subcategory"]

# Text preprocessing
X_train_processed = df_train["Question"].apply(preprocess_text_keep_significant_words)
X_test_processed = df_test["Question"].apply(preprocess_text_keep_significant_words)

# Tokenize the processed text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_processed)
X_train_seq = tokenizer.texts_to_sequences(X_train_processed)
X_test_seq = tokenizer.texts_to_sequences(X_test_processed)

# Pad sequences to ensure equal input length
max_sequence_length = 100  # Adjust based on your dataset
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Convert labels to categorical
y_train_combined = df_train["Combined_Label"]
y_test_combined = df_test["Combined_Label"]

# Encode labels (you can also use label encoding or one-hot encoding)
labels = list(set(y_train_combined))
label_to_idx = {label: idx for idx, label in enumerate(labels)}
y_train_encoded = np.array([label_to_idx[label] for label in y_train_combined])
y_test_encoded = np.array([label_to_idx[label] for label in y_test_combined])

# One-hot encoding (optional)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=len(labels))
y_test_one_hot = to_categorical(y_test_encoded, num_classes=len(labels))

# RNN model with LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(len(labels), activation='softmax'))  # Softmax for multi-class classification

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train_one_hot, epochs=20, batch_size=32, validation_data=(X_test_pad, y_test_one_hot))

# Make predictions
y_pred_prob = model.predict(X_test_pad)
y_pred_encoded = np.argmax(y_pred_prob, axis=1)

# Convert predictions back to labels
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
y_pred_combined = [idx_to_label[idx] for idx in y_pred_encoded]

# Add predictions to the test dataframe
df_test["Predicted_Combined_Label"] = y_pred_combined

# Split combined predictions back into Category and Subcategory
df_test["Predicted_Category"] = df_test["Predicted_Combined_Label"].apply(lambda x: x.split("_")[0])
df_test["Predicted_Subcategory"] = df_test["Predicted_Combined_Label"].apply(lambda x: x.split("_")[1])

# Evaluation
print("Final Classification Results:")
print(df_test[["Question", "Category", "Subcategory", "Predicted_Category", "Predicted_Subcategory"]])

# Classification report
print("Combined Label Classification Report:")
print(classification_report(y_test_combined, y_pred_combined))


Epoch 1/20




[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.1569 - loss: 3.4171 - val_accuracy: 0.4200 - val_loss: 2.6904
Epoch 2/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.3724 - loss: 2.4825 - val_accuracy: 0.5560 - val_loss: 1.9642
Epoch 3/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.5691 - loss: 1.6227 - val_accuracy: 0.6180 - val_loss: 1.6415
Epoch 4/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.7120 - loss: 1.0524 - val_accuracy: 0.6540 - val_loss: 1.5161
Epoch 5/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.8265 - loss: 0.6644 - val_accuracy: 0.6800 - val_loss: 1.4452
Epoch 6/20
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.8986 - loss: 0.4176 - val_accuracy: 0.6900 - val_loss: 1.4685
Epoch 7/20
[1m171/171[0m [32m━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
