## Data Loading

In [None]:
import pandas as pd
import numpy as np


# Read the semicolon-separated, Latin-1 encoded reference file and keep its
# raw values as a 2-D array for positional indexing.
data = pd.read_csv(
    'Referentiel_Skilit_Nov-2023.csv',
    sep=';',
    encoding='latin1',
)
data_array = data.values

# The third column drives the split: strictly positive values go to the
# annotated set, values equal to 0 go to the non-annotated set.
axis_column = data_array[:, 2]
data_annotated = data_array[axis_column > 0]
data_non_annotated = data_array[axis_column == 0]

print("Données annotées:", data_annotated, sep="\n")
print("\nDonnées non annotées:", data_non_annotated, sep="\n")


In [None]:
import numpy as np
# Distinct values present in the third column of the annotated rows.
annotated_axis_column = data_annotated[:, 2]
unique_axes = np.unique(annotated_axis_column)

# Map each distinct value to the annotated rows that carry it.
data_by_axis = {
    axis_value: data_annotated[annotated_axis_column == axis_value]
    for axis_value in unique_axes
}

print("Données pour l'axe 16:", data_by_axis[16], sep="\n")

print("\nDonnées pour l'axe 2:", data_by_axis[2], sep="\n")


## Word Embedding

#### Computing an Embedding for Each Term with CamemBERT

In [None]:
from transformers import CamembertModel, CamembertTokenizer
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
CAMEMBERT_CHECKPOINT = 'camembert-base'
tokenizer = CamembertTokenizer.from_pretrained(CAMEMBERT_CHECKPOINT)
model = CamembertModel.from_pretrained(CAMEMBERT_CHECKPOINT)
def calculate_embeddings(text):
    """Return the mean-pooled CamemBERT embedding of ``text``.

    The text is tokenized with the module-level ``tokenizer``, fed to the
    module-level ``model`` with gradient tracking disabled, and the last
    hidden state is averaged over dimension 1.

    Args:
        text: Text to embed. Callers in this notebook pass one cell of the
            annotated data at a time (presumably a single string -- TODO
            confirm). NOTE(review): if a list of texts is ever passed,
            padded positions would presumably be included in the mean.

    Returns:
        The averaged last hidden state with a leading dimension of size 1
        squeezed out, or ``None`` when the text could not be tokenized.
    """
    try:
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    except (RuntimeError, ValueError, TypeError) as e:
        # Inputs the tokenizer cannot handle (for example a non-string cell such
        # as NaN) are expected to surface as ValueError/TypeError rather than
        # RuntimeError, so all three are treated as "cannot embed this text".
        print(f"Error When Transforming Text'{text}' into Tensor : {str(e)}")
        return None

    with torch.no_grad():
        output = model(**encoded_input)

    # Average over dimension 1, then drop the leading dimension when it is 1.
    return output.last_hidden_state.mean(dim=1).squeeze(0)

# Group the embedding of every annotated row under its axis. Each stored entry
# is an (embedding, axis) pair, which is the shape the later cells unpack.
transformed_embeddings_by_axis = {}
for original_text, transformed_text, original_axis in data_annotated:
    transformed_embedding = calculate_embeddings(transformed_text)
    if transformed_embedding is None:
        # calculate_embeddings signals failure with None; storing it would break
        # the later emb.cpu() and torch.stack calls, so the row is skipped.
        print(f"Skipping row without embedding: {transformed_text!r}")
        continue
    transformed_embeddings_by_axis.setdefault(original_axis, []).append(
        (transformed_embedding, original_axis)
    )


## Visualization
#### Here, we reduce the 768-dimensional CamemBERT embeddings to 2D with PCA for visualization.

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import torch


# Flatten the per-axis lists into one embedding matrix and a parallel label vector.
embeddings = []
labels = []

for axis, emb_list in transformed_embeddings_by_axis.items():
    for emb, _ in emb_list:
        embeddings.append(emb.cpu().numpy())
        labels.append(axis)

embeddings = np.array(embeddings)
labels = np.array(labels)

# Project every embedding onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)

# One colour per axis, sampled evenly from the tab20 colormap.
unique_axes = np.unique(labels)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_axes)))

legend_patches = [mpatches.Patch(color=colors[i], label=axis) for i, axis in enumerate(unique_axes)]

# Exactly one figure is created: opening a second one with plt.figure() would
# leave an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(37.5, 30))
for i, axis in enumerate(unique_axes):
    axis_data = pca_result[labels == axis]
    ax.scatter(axis_data[:, 0], axis_data[:, 1], color=colors[i], label=axis)

ax.legend(handles=legend_patches, title='Axe')
ax.set_title('Projection of CamemBERT Embeddings Using PCA')
ax.set_xlabel('PCA component 1')
ax.set_ylabel('PCA component 2')

plt.show()

#### Here, we reduce the 768-dimensional CamemBERT embeddings to 2D with t-SNE for visualization.

In [None]:

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import torch
from sklearn.manifold import TSNE


# Flatten the per-axis lists into one embedding matrix and a parallel label vector.
embeddings = []
labels = []

for axis, emb_list in transformed_embeddings_by_axis.items():
    for emb, _ in emb_list:
        embeddings.append(emb.cpu().numpy())
        labels.append(axis)

embeddings = np.array(embeddings)
labels = np.array(labels)

# 2-D t-SNE embedding; the fixed random_state keeps the layout reproducible.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(embeddings)

# One colour per axis, sampled evenly from the tab20 colormap.
unique_axes = np.unique(labels)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_axes)))

legend_patches = [mpatches.Patch(color=colors[i], label=axis) for i, axis in enumerate(unique_axes)]

# Exactly one figure is created: opening a second one with plt.figure() would
# leave an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(37.5, 30))
for i, axis in enumerate(unique_axes):
    axis_data = tsne_result[labels == axis]
    ax.scatter(axis_data[:, 0], axis_data[:, 1], color=colors[i], label=axis)

ax.legend(handles=legend_patches, title='Axe')
ax.set_title('Projection of CamemBERT Embeddings Using TSNE')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')

plt.show()

## Centroids
#### Calculation of the centroids for each of the 16 axes.

In [None]:
# Centroid of an axis = element-wise mean of all embeddings stored for it.
centroids = {}
for axis, embeddings_list in transformed_embeddings_by_axis.items():
    axis_vectors = [pair[0] for pair in embeddings_list]
    stacked = torch.stack(axis_vectors)
    print(f"Axe {axis}: {stacked.size()}")
    centroids[axis] = stacked.mean(dim=0)


In [None]:
# Print each centroid vector under a header naming its axis.
for axis_key, axis_centroid in centroids.items():
    print(f"Centroid of : {axis_key}:", axis_centroid, sep="\n")

#### Visualizing the centroids in relation to other terms in each axis.

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


pca = PCA(n_components=2)

# Size the grid from the number of axes (4 columns, 12.5 inches per cell) so the
# cell neither overflows with more than 16 axes nor depends on exactly 16;
# 16 axes still give a 4x4 grid on a 50x50 figure.
n_axes = len(transformed_embeddings_by_axis)
n_cols = 4
n_rows = max(1, (n_axes + n_cols - 1) // n_cols)

fig, axs = plt.subplots(n_rows, n_cols, figsize=(12.5 * n_cols, 12.5 * n_rows), squeeze=False)
axs = axs.flatten()

for idx, (axis, embeddings_list) in enumerate(transformed_embeddings_by_axis.items()):
    embeddings_for_axis = torch.stack([emb[0] for emb in embeddings_list])
    axs[idx].set_title(f"Axe {axis}")

    if embeddings_for_axis.shape[0] < 2:
        # A 2-component PCA cannot be fitted on fewer than two samples.
        axs[idx].text(0.5, 0.5, 'Not enough points for PCA',
                      ha='center', va='center', transform=axs[idx].transAxes)
        continue

    # The PCA is refitted on this axis only, so each subplot has its own
    # projection and coordinates are not comparable between subplots.
    pca_result_for_axis = pca.fit_transform(embeddings_for_axis.cpu().numpy())

    # Project the axis centroid with the projection fitted just above.
    centroid_for_axis = centroids[axis].cpu().numpy()
    centroid_pca_for_axis = pca.transform([centroid_for_axis])

    # Terms in black, centroid in red.
    axs[idx].scatter(pca_result_for_axis[:, 0], pca_result_for_axis[:, 1], color='black', alpha=0.5)
    axs[idx].scatter(centroid_pca_for_axis[0, 0], centroid_pca_for_axis[0, 1], color='red', s=100)

# Hide grid cells that have no axis to display.
for unused_subplot in axs[n_axes:]:
    unused_subplot.set_visible(False)

plt.tight_layout()
plt.show()


## Distance Calculation and Error Rate
#### Calculating the distance and error rate to determine whether this method can be used for data augmentation.

In [None]:

def cosine_distance_torch(vec1, vec2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    Each input gets a leading dimension of size 1 so that the similarity is
    computed along dimension 1; the single resulting value is then extracted.
    """
    row1 = vec1.unsqueeze(0)
    row2 = vec2.unsqueeze(0)
    similarity = torch.nn.functional.cosine_similarity(row1, row2, dim=1)
    return 1 - similarity[0]

# Nearest-centroid check: an embedding counts as an error when the centroid at
# the smallest cosine distance belongs to another axis than the embedding's own.
total_words = 0
errors = 0

for embeddings_list in transformed_embeddings_by_axis.values():
    for transformed_embedding, original_axis in embeddings_list:
        total_words += 1
        distances = {
            candidate_axis: cosine_distance_torch(transformed_embedding, candidate_centroid)
            for candidate_axis, candidate_centroid in centroids.items()
        }
        closest_axis = min(distances, key=distances.__getitem__)

        if closest_axis != original_axis:
            errors += 1

error_rate = errors / total_words
error_percentage = error_rate * 100
print(errors, total_words, sep="\n")
print(f"Pourcentage des erreurs: {error_percentage:.2f}%")


## Visualizations of Some Errors

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# This cell reuses cosine_distance_torch, transformed_embeddings_by_axis and
# centroids from the cells above. It deliberately does not touch total_words /
# errors, so the error-rate results computed earlier stay intact.
MAX_ERROR_EXAMPLES = 6

# Collect the first misclassified embeddings as
# (true axis, closest axis, distances to every centroid).
example_errors = []

for axis, embeddings_list in transformed_embeddings_by_axis.items():
    for transformed_embedding, original_axis in embeddings_list:
        distances = {ax: cosine_distance_torch(transformed_embedding, centroid) for ax, centroid in centroids.items()}
        closest_axis = min(distances, key=distances.get)

        if closest_axis != original_axis:
            example_errors.append((original_axis, closest_axis, distances))

            if len(example_errors) == MAX_ERROR_EXAMPLES:
                break
    if len(example_errors) == MAX_ERROR_EXAMPLES:
        break

selected_errors = example_errors[:MAX_ERROR_EXAMPLES]

# Distances are 0-dim tensors; convert them to plain floats before plotting.
true_distances = [float(details[2][details[0]]) for details in selected_errors]
false_distances = [float(details[2][details[1]]) for details in selected_errors]
# Named error_labels so the label array built by the projection cells is not overwritten.
error_labels = [f"True: {details[0]}, False: {details[1]}" for details in selected_errors]

fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(true_distances))
width = 0.35

# Two side-by-side bars per example: distance to the true centroid and to the
# (wrong) closest centroid.
rects1 = ax.bar(x - width/2, true_distances, width, label='True Centroid')
rects2 = ax.bar(x + width/2, false_distances, width, label='Closest Incorrect Centroid')

ax.set_ylabel('Cosine Distance')
ax.set_title('Cosine Distances to True and Closest Incorrect Centroid for Error Examples')
ax.set_xticks(x)
ax.set_xticklabels(error_labels, rotation=45, ha='right')
ax.legend()

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm  




fig, axes = plt.subplots(4, 4, figsize=(30, 30))
axes = axes.flatten()
max_plots = len(axes)

error_count = 0

# One subplot per axis that has at least one misclassified embedding: the first
# error found for the axis is drawn, then the search moves on to the next axis.
for axis, embeddings_list in transformed_embeddings_by_axis.items():
    if error_count == max_plots:
        break

    for transformed_embedding, original_axis in embeddings_list:
        distances = {ax: cosine_distance_torch(transformed_embedding, centroid) for ax, centroid in centroids.items()}
        closest_axis = min(distances, key=distances.get)

        if closest_axis != original_axis:
            ax = axes[error_count]

            # Only the first two embedding coordinates are drawn, so the plotted
            # segment lengths do not reflect the cosine distances used above.
            original_centroid_2d = centroids[original_axis][:2].cpu().numpy()
            closest_centroid_2d = centroids[closest_axis][:2].cpu().numpy()
            transformed_embedding_2d = transformed_embedding[:2].cpu().numpy()

            ax.scatter(*original_centroid_2d, color='green', label=f'Original Centroide: {original_axis}')
            ax.scatter(*closest_centroid_2d, color='red', label=f'Closest Centroide: {closest_axis}')
            ax.scatter(*transformed_embedding_2d, color='black', label=f'Mot: {axis}')

            ax.plot([transformed_embedding_2d[0], original_centroid_2d[0]],
                    [transformed_embedding_2d[1], original_centroid_2d[1]],
                    'g--', label='Distance to Original Centroide')
            ax.plot([transformed_embedding_2d[0], closest_centroid_2d[0]],
                    [transformed_embedding_2d[1], closest_centroid_2d[1]],
                    'r-', label='Distance to Closest Centroide')

            # Title the subplot with the axis actually plotted; indexing the dict
            # keys by subplot position mislabels plots once an axis has no error.
            ax.set_title(f'Error in Axis {original_axis}')
            ax.legend()

            error_count += 1
            # Every entry of this list shares the same axis, so one error is enough.
            break

# Hide the subplots that received no error example.
for unused_subplot in axes[error_count:]:
    unused_subplot.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm  

fig, axes = plt.subplots(4, 4, figsize=(30, 30))
axes = axes.flatten()
max_plots = len(axes)

error_count = 0

# One colour per centroid, sampled evenly from the rainbow colormap.
colors = cm.rainbow(np.linspace(0, 1, len(centroids)))

# Centroid positions do not depend on the plotted embedding, so they are built
# once. Only the first two embedding coordinates are used, which means plotted
# segment lengths do not reflect the cosine distances.
centroid_positions = {
    centroid_label: centroid[:2].cpu().numpy()
    for centroid_label, centroid in centroids.items()
}

# One subplot per axis that has at least one misclassified embedding: the first
# error found for the axis is drawn, then the search moves on to the next axis.
for axis, embeddings_list in transformed_embeddings_by_axis.items():
    if error_count >= max_plots:
        break

    for transformed_embedding, original_axis in embeddings_list:
        distances = {ax: cosine_distance_torch(transformed_embedding, centroid) for ax, centroid in centroids.items()}
        closest_axis = min(distances, key=distances.get)

        if closest_axis != original_axis:
            ax = axes[error_count]

            # Labelled artists are required for ax.legend() to show anything.
            for idx, (centroid_label, pos) in enumerate(centroid_positions.items()):
                ax.scatter(*pos, color=colors[idx], s=100, label=f'Centroide: {centroid_label}')

            transformed_embedding_2d = transformed_embedding[:2].cpu().numpy()
            ax.scatter(*transformed_embedding_2d, color='black', s=150, label='Mot')

            # Dashed black segment to the true centroid, solid coloured segments
            # to all the others.
            for idx, (centroid_label, pos) in enumerate(centroid_positions.items()):
                line_style = '--' if centroid_label == original_axis else '-'
                color_ = 'black' if centroid_label == original_axis else colors[idx]
                ax.plot([transformed_embedding_2d[0], pos[0]],
                        [transformed_embedding_2d[1], pos[1]],
                        color=color_, linestyle=line_style)

            # Title the subplot with the axis actually plotted; indexing the dict
            # keys by subplot position mislabels plots once an axis has no error.
            ax.set_title(f'Error in Axis {original_axis}')
            ax.legend(loc='upper right')

            error_count += 1
            # Every entry of this list shares the same axis, so one error is enough.
            break

# Hide the subplots that received no error example.
for unused_subplot in axes[error_count:]:
    unused_subplot.set_visible(False)

plt.tight_layout()
plt.show()
