In [1]:
import warnings
# Silence every warning for the whole kernel session to keep cell outputs short.
# NOTE(review): with no category given, this also hides deprecation and convergence
# warnings — consider narrowing or removing the filter while debugging.
warnings.filterwarnings('ignore')

# Approche classique: embeddings de mots

In [2]:
import pandas as pd

In [3]:
# Load both splits and drop the first CSV column by position
# (presumably the index written at export time — TODO confirm).
train_df = pd.read_csv("./data/train_df.csv").iloc[:, 1:]
test_df = pd.read_csv("./data/test_df.csv").iloc[:, 1:]

In [4]:
train_df.shape

(3397, 14)

In [5]:
test_df.shape

(1457, 14)

In [6]:
# Binary labels of each split, taken from the "target" column
y_train, y_test = train_df["target"], test_df["target"]

### Embeddings de comptage des mots

En ajustant le vectoriseur uniquement sur le jeu de train, puis en l'appliquant tel quel au jeu de test, on s'assure qu'il n'y a pas de fuite de données et que l'appréciation de la performance du modèle ne sera pas biaisée. Cependant, il y a un fort risque d'avoir des mots hors vocabulaire (OOV) lors du test : ces mots sont alors ignorés par le vectoriseur.

In [7]:
# Preview the first rows; missing preprocessed texts can be counted with
# train_df["preprocessed_text"].isna().sum()
train_df.head()

Unnamed: 0,target,ids,date,flag,user,text,sentiment_score,cleaned_text,tokenized,preprocessed_text,preprocessed_tokenized,length_tokenized,length_preprocessed_tokenized,sia_sentiment
0,1,2177492915,Mon Jun 15 06:12:09 PDT 2009,NO_QUERY,Ayyaya,"@bradhfh well, I hope you don't even if you do...",0,"<mention> well, I hope you do not even if you ...","['<mention>', 'well', ',', 'I', 'hope', 'you',...",<mention> well hope even think sleep airport ...,"['<', 'mention', '>', 'well', 'hope', 'even', ...",27,14,0
1,1,2046797519,Fri Jun 05 12:42:25 PDT 2009,NO_QUERY,ShyHustla,@MamaMisfit Yea but im still grounded for life...,0,<mention> Yea but im still grounded for life a...,"['<mention>', 'Yea', 'but', 'im', 'still', 'gr...",<mention> yea im still grounded life going mak...,"['<', 'mention', '>', 'yea', 'im', 'still', 'g...",20,12,1
2,1,1558166592,Sun Apr 19 07:29:47 PDT 2009,NO_QUERY,shirlise,Driving Alex to the airport Then back to work...,0,Driving Alex to the airport Then back to work ...,"['Driving', 'Alex', 'to', 'the', 'airport', 'T...",driving alex airport back work finish report ...,"['driving', 'alex', 'airport', 'back', 'work',...",31,16,0
3,0,1827398448,Sun May 17 10:37:37 PDT 2009,NO_QUERY,ceibner,big fan of the Morrisey PJ's we got - was so t...,4,Big fan of the Morrisey PJ's we got - was so t...,"['Big', 'fan', 'of', 'the', 'Morrisey', 'PJ', ...",big fan morrisey pj 's got - tempted stay & sn...,"['big', 'fan', 'morrisey', 'pj', ""'s"", 'got', ...",33,15,0
4,1,1553668748,Sat Apr 18 14:53:30 PDT 2009,NO_QUERY,firemanlv,PLEASE tell me they can put me on another earl...,0,PLEASE tell me they can put me on another earl...,"['PLEASE', 'tell', 'me', 'they', 'can', 'put',...",please tell put another earlier flight since m...,"['please', 'tell', 'put', 'another', 'earlier'...",27,14,1


#### CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vocabulary is learned on the training texts only and
# re-used unchanged on the test texts, so nothing from the test split leaks into it.
count_vectorizer = CountVectorizer()

# A text that became empty after preprocessing is read back from CSV as NaN, and the
# vectorizer raises on NaN documents; map such rows to "" (an all-zero count row).
# With no missing value this is a no-op.
train_count_sparse = count_vectorizer.fit_transform(train_df['preprocessed_text'].fillna(""))

test_count_sparse = count_vectorizer.transform(test_df['preprocessed_text'].fillna(""))


In [9]:
# # Precaution for sparse matrices: with_mean=False keeps them sparse.
# # Fit the scaler on train only and re-use it on test (refitting on test would leak).
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler(with_mean=False)
# train_count_scaled = scaler.fit_transform(train_count_sparse)
# test_count_scaled = scaler.transform(test_count_sparse)

In [10]:
import scipy.sparse

# Densify both count matrices into DataFrames whose columns are the vocabulary terms.
# (A sparse-backed alternative is pd.DataFrame.sparse.from_spmatrix with the same columns.)
train_count, test_count = (
    pd.DataFrame(matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
    for matrix in (train_count_sparse, test_count_sparse)
)


In [11]:
from scipy.sparse import csr_matrix, hstack

# Append the label as a "target" column, by position (raw array, no index alignment).
# NOTE(review): if the vocabulary itself contains the token "target", that feature
# column is overwritten by the label — verify.
test_count["target"] = y_test.to_numpy()
train_count["target"] = y_train.to_numpy()

### Modélisation des embeddings : tests rapides avec PyCaret

In [12]:
# Important: pycaret supports Python 3.9 to 3.11 — print the interpreter version to check
import sys
print(sys.version)


3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]


In [None]:
# ! pip install pycaret
# ! pip install pycaret[full]

In [13]:
import pycaret

In [None]:
# Import explicitly the PyCaret functional-API helpers this notebook calls
# (instead of `import *`), so the origin of each name is visible.
from pycaret.classification import (
    compare_models,
    finalize_model,
    plot_model,
    predict_model,
    save_model,
    setup,
    stack_models,
    tune_model,
)

# Initialise the experiment: train_count is the training data, test_count the hold-out
# set; session_id is the seed that makes runs repeatable.
s = setup(data=train_count, target='target', test_data=test_count, session_id = 123, index=False, use_gpu=True)

In [15]:
# import ClassificationExperiment and init the class
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()

In [16]:
# Check the type of exp.
# TODO: switch to this object-oriented API for MLflow; the setup is then done through
# the experiment object, as in the commented line below.
type(exp)
# exp.setup(data, target = 'Class variable', session_id = 123)

pycaret.classification.oop.ClassificationExperiment

In [18]:
# Evaluate the baseline classifiers, rank them by Recall and keep the 6 best.
# The candidate list can be restricted with e.g. include=["nb", "lr"].
best_models = compare_models(n_select=6, sort='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5013,0.5,1.0,0.5013,0.6678,0.0,0.0,0.55
nb,Naive Bayes,0.5558,0.5542,0.7992,0.5389,0.6435,0.1105,0.1256,0.834
gbc,Gradient Boosting Classifier,0.6759,0.7606,0.727,0.6606,0.6916,0.3516,0.3543,12.591
lr,Logistic Regression,0.7115,0.7838,0.6947,0.7201,0.7064,0.4231,0.4241,1.148
svm,SVM - Linear Kernel,0.6821,0.7521,0.6818,0.6864,0.6823,0.3642,0.366,1.315
ridge,Ridge Classifier,0.6906,0.7565,0.6782,0.6975,0.6869,0.3813,0.3823,1.124
lightgbm,Light Gradient Boosting Machine,0.6862,0.767,0.6671,0.6951,0.6798,0.3725,0.3737,1.021
et,Extra Trees Classifier,0.7109,0.7875,0.6507,0.7426,0.6916,0.4221,0.427,2.053
rf,Random Forest Classifier,0.6997,0.7862,0.6225,0.7389,0.6743,0.3998,0.406,1.338
dt,Decision Tree Classifier,0.6456,0.6515,0.6102,0.6583,0.6325,0.2914,0.2927,1.756


In [19]:
# Hand-pick the stacking candidates by their rank in the Recall-sorted list:
# rank 1 = Naive Bayes, rank 2 = Gradient Boosting Classifier, rank 3 = Logistic Regression
# (ranks as shown by the comparison table; they change if the ranking changes).
nb, gbc, lr = (best_models[rank] for rank in (1, 2, 3))

In [20]:
# Build the stacked ensemble from the three hand-picked models
# (they could also be created directly, e.g. lr = create_model("lr")).
stacked_model = stack_models(estimator_list=[nb, lr, gbc])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7353,0.8213,0.6901,0.7613,0.7239,0.4709,0.473
1,0.6941,0.7789,0.6725,0.7055,0.6886,0.3884,0.3888
2,0.6706,0.7696,0.6199,0.6928,0.6543,0.3416,0.3435
3,0.7088,0.8116,0.6941,0.7152,0.7045,0.4176,0.4178
4,0.7176,0.7886,0.7353,0.7102,0.7225,0.4353,0.4356
5,0.7441,0.818,0.7353,0.7485,0.7418,0.4882,0.4883
6,0.7176,0.7958,0.7412,0.7079,0.7241,0.4353,0.4358
7,0.6785,0.7499,0.6412,0.6943,0.6667,0.3571,0.3581
8,0.705,0.7759,0.7059,0.7059,0.7059,0.41,0.41
9,0.7404,0.8017,0.7647,0.7303,0.7471,0.4807,0.4813


In [22]:
stacked_model

Le stacking combine les forces des différents modèles en donnant des performances homogènes. Pour cet embedding nous allons utiliser le modèle composite.

In [25]:
# Hyper-parameter tuning of best_models[2], optimising Recall.
# NOTE(review): this cell was originally labelled "best model (logistic regression)",
# but index 2 of the Recall-sorted list is the Gradient Boosting Classifier (bound to
# `gbc` earlier); logistic regression is index 3 — confirm which model was intended.
# choose_better=True asks PyCaret to keep the untuned model when tuning does not help.
tuned_model = tune_model(estimator=best_models[2], 
                         optimize="Recall", choose_better=True,
                         verbose=False)

KeyboardInterrupt: 

In [None]:
tuned_model
# ou plot_model(tuned_model, plot="parameter")

In [None]:
# plot confusion matrix
plot_model(tuned_model, plot = 'confusion_matrix')

In [None]:
plot_model(tuned_model, plot = 'auc') # Author's note: error on sparse matrices, even after scaling.

In [None]:
plot_model(tuned_model, plot = 'class_report') # Author's note: plot='boundary' errors on sparse matrices, even after scaling.

In [None]:
pred_holdouts = predict_model(tuned_model)
pred_holdouts.head()

In [None]:
pred_holdouts.shape

In [None]:
# Finalize the model (train on the entire dataset, hold-out included).
# finalize_model returns the refitted pipeline instead of updating its argument, so the
# result must be captured — otherwise the model saved afterwards is the non-finalized
# one. Rebinding `tuned_model` lets the following cells pick up the finalized pipeline.
tuned_model = finalize_model(tuned_model)
tuned_model

In [None]:
# Persist the model under <cwd>/models with a timestamped file name.
# Fixed-name alternative: save_model(best, model_name='CountVectorizer_Best_Model')

import os
from datetime import date, datetime

# os.chdir("C:/users/Cecil/Documents/oc_aiep7")
PATH = os.getcwd()+os.sep
current_time = datetime.now().strftime("%m-%d-%Y_%H-%M")

# Make sure the target folder exists, so saving does not fail on a fresh checkout
# where "models" has not been created yet.
models_dir = os.path.join(PATH, "models")
os.makedirs(models_dir, exist_ok=True)
save_model(tuned_model, os.path.join(models_dir, "best_model_CountVectorizer_" + current_time))


# loaded_bestmodel = load_model('CountVectorizer_Best_Model')

In [None]:
# Generate predictions for the training set (features + target passed as `data`)
predictions = predict_model(tuned_model, data=train_count)

In [None]:
df = predictions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA

# Error analysis on `df` (the predictions computed on the training set):
# project the features to 2-D with NCA, plot true vs. predicted labels,
# then print the tweets that were misclassified.

# Split the features from the label / prediction columns
features = df.drop(columns=['target', 'prediction_label', 'prediction_score'])
true_labels = df['target']
predicted_labels = df['prediction_label']

# Neighborhood Components Analysis (supervised) down to 2 dimensions
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca_transformed = nca.fit_transform(features, true_labels)

# Plotting frame. It gets a fresh 0..n-1 index, so the labels are attached by position
# (to_numpy) rather than by index alignment, which would silently yield NaN labels if
# `df` ever carried a different index.
nca_df = pd.DataFrame(nca_transformed, columns=['NCA1', 'NCA2'])
nca_df['True Labels'] = true_labels.to_numpy()
nca_df['Predicted Labels'] = predicted_labels.to_numpy()

# Flag the misclassified points
nca_df['Misclassified'] = nca_df['True Labels'] != nca_df['Predicted Labels']

# Flag false positives and false negatives
nca_df['False Positive'] = (nca_df['True Labels'] == 0) & (nca_df['Predicted Labels'] == 1)
nca_df['False Negative'] = (nca_df['True Labels'] == 1) & (nca_df['Predicted Labels'] == 0)

# Fetch the original tweets with positional boolean masks.
# Assumes the rows of `df` are in the same order as the rows of `train_df` — TODO confirm.
false_positive_texts = train_df.loc[nca_df['False Positive'].to_numpy(), 'text']
false_negative_texts = train_df.loc[nca_df['False Negative'].to_numpy(), 'text']

# Explicit figure/axes so that each call targets a known panel
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(10, 5))

# Panel 1: NCA projection coloured by the true labels
sns.scatterplot(x='NCA1', y='NCA2', hue='True Labels', data=nca_df,
                palette={0: 'green', 1: 'orange'}, s=100, ax=ax_true)
ax_true.set_title('NCA Projection with True Labels')

# Panel 2: predicted labels — correctly classified points as circles...
sns.scatterplot(x='NCA1', y='NCA2', hue='Predicted Labels', data=nca_df[~nca_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='o', label='Correctly Classified',
                ax=ax_pred)

# ...and misclassified points as "x" crosses
sns.scatterplot(x='NCA1', y='NCA2', hue='Predicted Labels', data=nca_df[nca_df['Misclassified']],
                palette={0: 'red', 1: 'black'}, s=60, marker='x', edgecolor='red', linewidth=2,
                label='Misclassified', ax=ax_pred)

ax_pred.set_title('NCA Projection with Predicted Labels and Misclassified Points')

# Legend of the right-hand panel
ax_pred.legend(title='Classification à partir de CountVectorizer', loc='upper left')

# Render the figure
fig.tight_layout()
plt.show()

# Print the misclassified tweets, grouped by error type

print("\nFaux Positifs (Prédit comme 1, mais vrai label 0) :")
for text in false_positive_texts:
    print(f"- {text}")
    
print("\nFaux Négatifs (Prédit comme 0, mais vrai label 1) :")
for text in false_negative_texts:
    print(f"- {text}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler

# Error analysis on `df` (predictions with their feature columns) using a 2-D
# Kernel PCA projection; `train_df` provides the original tweets in its 'text' column.

# Split the features from the label / prediction columns
features = df.drop(columns=['target', 'prediction_label', 'prediction_score'])
true_labels = df['target']
predicted_labels = df['prediction_label']

# Standardise the features before Kernel PCA
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Kernel PCA with a Gaussian (RBF) kernel to capture non-linear structure
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=0.001, random_state=42)
kpca_transformed = kpca.fit_transform(features_scaled)

# Plotting frame. It gets a fresh 0..n-1 index, so the labels are attached by position
# (to_numpy) rather than by index alignment, which would silently yield NaN labels if
# `df` ever carried a different index.
kpca_df = pd.DataFrame(kpca_transformed, columns=['KPCA1', 'KPCA2'])
kpca_df['True Labels'] = true_labels.to_numpy()
kpca_df['Predicted Labels'] = predicted_labels.to_numpy()

# Flag the misclassified points
kpca_df['Misclassified'] = kpca_df['True Labels'] != kpca_df['Predicted Labels']

# Flag false positives and false negatives
kpca_df['False Positive'] = (kpca_df['True Labels'] == 0) & (kpca_df['Predicted Labels'] == 1)
kpca_df['False Negative'] = (kpca_df['True Labels'] == 1) & (kpca_df['Predicted Labels'] == 0)

# Fetch the original tweets with positional boolean masks.
# Assumes the rows of `df` are in the same order as the rows of `train_df` — TODO confirm.
false_positive_texts = train_df.loc[kpca_df['False Positive'].to_numpy(), 'text']
false_negative_texts = train_df.loc[kpca_df['False Negative'].to_numpy(), 'text']

# Explicit figure/axes so that each call targets a known panel
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(10, 5))

# Panel 1: KPCA projection coloured by the true labels
sns.scatterplot(x='KPCA1', y='KPCA2', hue='True Labels', data=kpca_df,
                palette={0: 'blue', 1: 'red'}, s=100, ax=ax_true)
ax_true.set_title('Kernel PCA Projection with True Labels')

# Panel 2: predicted labels — correctly classified points as circles...
sns.scatterplot(x='KPCA1', y='KPCA2', hue='Predicted Labels', data=kpca_df[~kpca_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='o', ax=ax_pred)

# ...and misclassified points as "x" crosses
sns.scatterplot(x='KPCA1', y='KPCA2', hue='Predicted Labels', data=kpca_df[kpca_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='x', edgecolor='red', linewidth=2,
                ax=ax_pred)

ax_pred.set_title('Kernel PCA Projection with Predicted Labels and Misclassified Points')

# Render the figure
fig.tight_layout()
plt.show()

# Print the misclassified tweets, grouped by error type

print("\nFaux Négatifs (Prédit comme 0, mais vrai label 1) :")
for text in false_negative_texts:
    print(f"- {text}")

print("\nFaux Positifs (Prédit comme 1, mais vrai label 0) :")
for text in false_positive_texts:
    print(f"- {text}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Error analysis on `df` (predictions with their feature columns): Kernel PCA down to
# 50 dimensions, then t-SNE down to 2 for plotting; `train_df` provides the tweets.

# Split the features from the label / prediction columns
features = df.drop(columns=['target', 'prediction_label', 'prediction_score'])
true_labels = df['target']
predicted_labels = df['prediction_label']

# Standardise the features before Kernel PCA
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Kernel PCA with a Gaussian (RBF) kernel, reducing to 50 dimensions
kpca = KernelPCA(n_components=50, kernel='rbf', gamma=0.001, random_state=42)
kpca_transformed = kpca.fit_transform(features_scaled)

# t-SNE from the 50 Kernel PCA dimensions down to 2.
# The iteration count is left at the library default (1000, the value used before):
# the `n_iter` keyword was renamed `max_iter` in recent scikit-learn releases, so
# passing neither keeps this cell working across versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
tsne_transformed = tsne.fit_transform(kpca_transformed)

# Plotting frame. It gets a fresh 0..n-1 index, so the labels are attached by position
# (to_numpy) rather than by index alignment, which would silently yield NaN labels if
# `df` ever carried a different index.
tsne_df = pd.DataFrame(tsne_transformed, columns=['tSNE1', 'tSNE2'])
tsne_df['True Labels'] = true_labels.to_numpy()
tsne_df['Predicted Labels'] = predicted_labels.to_numpy()

# Flag the misclassified points
tsne_df['Misclassified'] = tsne_df['True Labels'] != tsne_df['Predicted Labels']

# Flag false positives and false negatives
tsne_df['False Positive'] = (tsne_df['True Labels'] == 0) & (tsne_df['Predicted Labels'] == 1)
tsne_df['False Negative'] = (tsne_df['True Labels'] == 1) & (tsne_df['Predicted Labels'] == 0)

# Fetch the original tweets with positional boolean masks.
# Assumes the rows of `df` are in the same order as the rows of `train_df` — TODO confirm.
false_positive_texts = train_df.loc[tsne_df['False Positive'].to_numpy(), 'text']
false_negative_texts = train_df.loc[tsne_df['False Negative'].to_numpy(), 'text']

# Explicit figure/axes so that each call targets a known panel
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(10, 5))

# Panel 1: t-SNE projection coloured by the true labels
sns.scatterplot(x='tSNE1', y='tSNE2', hue='True Labels', data=tsne_df,
                palette={0: 'blue', 1: 'red'}, s=100, ax=ax_true)
ax_true.set_title('t-SNE Projection with True Labels')

# Panel 2: predicted labels — correctly classified points as circles...
sns.scatterplot(x='tSNE1', y='tSNE2', hue='Predicted Labels', data=tsne_df[~tsne_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='o', ax=ax_pred)

# ...and misclassified points as "x" crosses
sns.scatterplot(x='tSNE1', y='tSNE2', hue='Predicted Labels', data=tsne_df[tsne_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='x', edgecolor='red', linewidth=2,
                ax=ax_pred)

ax_pred.set_title('t-SNE Projection with Predicted Labels and Misclassified Points')

# Render the figure
fig.tight_layout()
plt.show()

# Print the misclassified tweets, grouped by error type

print("\nFaux Négatifs (Prédit comme 0, mais vrai label 1) :")
for text in false_negative_texts:
    print(f"- {text}")

print("\nFaux Positifs (Prédit comme 1, mais vrai label 0) :")
for text in false_positive_texts:
    print(f"- {text}")


#### TFIdF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams; terms present in fewer than 2 documents or in more
# than half of them are dropped. Vocabulary and IDF weights are learned on the training
# texts only and re-used unchanged on the test texts.
tfidf = TfidfVectorizer(min_df = 2,max_df = 0.5,ngram_range = (1,2))

# A text that became empty after preprocessing is read back from CSV as NaN, and the
# vectorizer raises on NaN documents; map such rows to "" (an all-zero row).
# With no missing value this is a no-op.
train_tfidf_sparse = tfidf.fit_transform(train_df['preprocessed_text'].fillna(""))
test_tfidf_sparse = tfidf.transform(test_df['preprocessed_text'].fillna(""))

In [None]:
import scipy.sparse

# Densify both TF-IDF matrices into DataFrames whose columns are the vocabulary terms
train_tfidf, test_tfidf = (
    pd.DataFrame(matrix.toarray(), columns=tfidf.get_feature_names_out())
    for matrix in (train_tfidf_sparse, test_tfidf_sparse)
)

In [None]:
from scipy.sparse import csr_matrix, hstack

# Append the label as a "target" column, by position (raw array, no index alignment).
# NOTE(review): if the vocabulary itself contains the token "target", that feature
# column is overwritten by the label — verify.
test_tfidf["target"] = y_test.to_numpy()
train_tfidf["target"] = y_train.to_numpy()

In [None]:
# Import explicitly the PyCaret functional-API helpers used by the TF-IDF section
# (instead of `import *`), so the origin of each name is visible.
from pycaret.classification import (
    compare_models,
    finalize_model,
    plot_model,
    predict_model,
    save_model,
    setup,
    tune_model,
)

# Initialise a new experiment on the TF-IDF features: train_tfidf is the training data,
# test_tfidf the hold-out set; session_id is the seed that makes runs repeatable.
s_tf = setup(data=train_tfidf, target='target', test_data=test_tfidf, session_id = 123, index=False, use_gpu=True)

In [None]:
# Evaluate the baseline classifiers on the TF-IDF features, rank them by Precision
# and keep the 6 best.
best_models_tf = compare_models(n_select=6, sort='Precision')

In [None]:
# Hyper-parameter tuning of the top-ranked model returned by compare_models
# (best_models_tf[0], ranked by Precision), optimising Precision.
# NOTE(review): this cell was originally labelled "logistic regression", but nothing
# here shows which model ranks first — confirm from the comparison table.
# choose_better=True asks PyCaret to keep the untuned model when tuning does not help.
tuned_model_tf = tune_model(estimator=best_models_tf[0], 
                         optimize="Precision", choose_better=True,
                         verbose=False)

In [None]:
tuned_model_tf

In [None]:
# plot confusion matrix
plot_model(tuned_model_tf, plot = 'confusion_matrix')

In [None]:
plot_model(tuned_model_tf, plot = 'class_report')

In [None]:
pred_holdouts_tf = predict_model(tuned_model_tf)
pred_holdouts_tf.head()

In [None]:
# Finalize the model (train on the entire dataset, hold-out included).
# finalize_model returns the refitted pipeline instead of updating its argument, so the
# result must be captured — otherwise the model saved afterwards is the non-finalized
# one. Rebinding `tuned_model_tf` lets the following cells pick up the finalized pipeline.
tuned_model_tf = finalize_model(tuned_model_tf)
tuned_model_tf

In [None]:
# Persist the TF-IDF model under <cwd>/models with a timestamped file name.
# Fixed-name alternative: save_model(best, model_name='TFIdF_Best_Model')

import os
from datetime import date, datetime

# os.chdir("C:/users/Cecil/Documents/oc_aiep7")
PATH = os.getcwd()+os.sep
current_time = datetime.now().strftime("%m-%d-%Y_%H-%M")

# Make sure the target folder exists, so saving does not fail on a fresh checkout
# where "models" has not been created yet.
models_dir = os.path.join(PATH, "models")
os.makedirs(models_dir, exist_ok=True)
save_model(tuned_model_tf, os.path.join(models_dir, "best_model_TFIdF_" + current_time))


# loaded_bestmodel = load_model('TFIdF_Best_Model')

In [None]:
# Generate predictions for the training set (features + target passed as `data`)
predictions_tf = predict_model(tuned_model_tf, data=train_tfidf)

In [None]:
df = predictions_tf

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA

# Error analysis on `df` (predictions with their feature columns): project the features
# to 2-D with NCA, plot true vs. predicted labels, then print the misclassified tweets.
# `train_df` provides the original tweets in its 'text' column.

# Split the features from the label / prediction columns
features = df.drop(columns=['target', 'prediction_label', 'prediction_score'])
true_labels = df['target']
predicted_labels = df['prediction_label']

# Neighborhood Components Analysis (supervised) down to 2 dimensions
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca_transformed = nca.fit_transform(features, true_labels)

# Plotting frame. It gets a fresh 0..n-1 index, so the labels are attached by position
# (to_numpy) rather than by index alignment, which would silently yield NaN labels if
# `df` ever carried a different index.
nca_df = pd.DataFrame(nca_transformed, columns=['NCA1', 'NCA2'])
nca_df['True Labels'] = true_labels.to_numpy()
nca_df['Predicted Labels'] = predicted_labels.to_numpy()

# Flag the misclassified points
nca_df['Misclassified'] = nca_df['True Labels'] != nca_df['Predicted Labels']

# Flag false positives and false negatives
nca_df['False Positive'] = (nca_df['True Labels'] == 0) & (nca_df['Predicted Labels'] == 1)
nca_df['False Negative'] = (nca_df['True Labels'] == 1) & (nca_df['Predicted Labels'] == 0)

# Fetch the original tweets with positional boolean masks.
# Assumes the rows of `df` are in the same order as the rows of `train_df` — TODO confirm.
false_positive_texts = train_df.loc[nca_df['False Positive'].to_numpy(), 'text']
false_negative_texts = train_df.loc[nca_df['False Negative'].to_numpy(), 'text']

# Explicit figure/axes so that each call targets a known panel
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(10, 5))

# Panel 1: NCA projection coloured by the true labels
sns.scatterplot(x='NCA1', y='NCA2', hue='True Labels', data=nca_df,
                palette={0: 'green', 1: 'orange'}, s=100, ax=ax_true)
ax_true.set_title('NCA Projection with True Labels')

# Panel 2: predicted labels — correctly classified points as circles...
sns.scatterplot(x='NCA1', y='NCA2', hue='Predicted Labels', data=nca_df[~nca_df['Misclassified']],
                palette={0: 'green', 1: 'orange'}, s=100, marker='o', label='Correctly Classified',
                ax=ax_pred)

# ...and misclassified points as "x" crosses
sns.scatterplot(x='NCA1', y='NCA2', hue='Predicted Labels', data=nca_df[nca_df['Misclassified']],
                palette={0: 'red', 1: 'black'}, s=60, marker='x', edgecolor='red', linewidth=2,
                label='Misclassified', ax=ax_pred)

ax_pred.set_title('NCA Projection with Predicted Labels and Misclassified Points')

# Legend of the right-hand panel
ax_pred.legend(title='Classification', loc='upper left')

# Render the figure
fig.tight_layout()
plt.show()

# Print the misclassified tweets, grouped by error type

print("\nFaux Positifs (Prédit comme 1, mais vrai label 0) :")
for text in false_positive_texts:
    print(f"- {text}")
    
print("\nFaux Négatifs (Prédit comme 0, mais vrai label 1) :")
for text in false_negative_texts:
    print(f"- {text}")