In [2]:
# General imports
import re
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [3]:


# Chemin vers le dossier contenant les fichiers .txt
dossier = 'data/preprocessed/'

data = []

for fichier in os.listdir(dossier):
    if fichier.endswith('.txt'):
        chemin_fichier = os.path.join(dossier, fichier)
        with open(chemin_fichier, 'r', encoding='utf-8') as f: 
            contenu = f.read()
            data.append({
                'id': fichier,  
                'texte': contenu      
            })


df = pd.DataFrame(data)


print(df.head())


        id                                              texte
0  336.txt  center display flex var content hcekrn svg con...
1  273.txt  white paper guidelines procurement september w...
2  106.txt  big data new challenges law ethics internation...
3  100.txt  globan maincontent position relative important...
4  241.txt  echnical eport april artificial intelligence p...


In [3]:
# Or we might want to use an already-implemented tool. The NLTK package has a lot of very useful text processing tools, among them various tokenizers
# Careful, NLTK was the first well-documented NLP package, but it might be outdated for some uses. Check the documentation !
from nltk.tokenize import word_tokenize


df['tokenize'] = df['texte'].apply(lambda x: word_tokenize(x))

In [None]:
df.head()

In [4]:
def count_words(texts, voc = None):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts
    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
    """
    n_samples = len(texts)
    if voc == None:
        words = set()
        for text in texts:
            words = words.union(set(text)) # list of all words
        n_features = len(words) # number of different words
        vocabulary = dict(zip(words, range(n_features))) # vocab[wd] = index ; indexisation
    else:
        vocabulary = voc
        n_features = len(voc)
    counts = np.zeros((n_samples, n_features))
    for k, text in enumerate(texts): # enumeration a k for a text[k]
        for w in text:
            if w in vocabulary:
                counts[k][vocabulary[w]] += 1.
    return vocabulary, counts

In [None]:
voc, bow = count_words(df['tokenize'])
print(bow.shape)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Create and fit the vectorizer to the training data
vectorizer = CountVectorizer()
Bow = vectorizer.fit_transform(df['texte'])
bow_a = Bow.toarray()
print(bow_a.shape)

In [8]:
frequency = bow.sum(axis = 0)
top_words = np.argsort(frequency)[::-1]

In [None]:
rev_voc = {i: w for w, i in voc.items()}
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(range(15), frequency[top_words[:15]])
ax.set_xticks(range(15))
ax.set_xticklabels([rev_voc[i] for i in top_words[:15]], rotation='vertical')
plt.show()

In [9]:
def euclidean(u, v):
    return np.linalg.norm(u-v)

def length_norm(u):
    return u / np.sqrt(u.dot(u))

def cosine(u, v):
    return 1.0 - length_norm(u).dot(length_norm(v))

from sklearn.neighbors import NearestNeighbors

In [10]:
def print_neighbors(distance, texts, representations, index, k=5):
    """
    Parameters
    ----------
    distance : function
        The distance to use to compare documents
    texts : list of str
        The texts
    representations: 2D Array
        Vector representations of the texts, in the same order
    index: int
        Index of the document for which to return nearest neighbors
    k: int
        Number of neighbors to display    
    """
    neigh = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=distance)
    neigh.fit(representations) 
    dist, ind = neigh.kneighbors([representations[index]])
    print("Plus proches voisins de: \n '%s' \n selon la distance '%s':" % (texts[index], distance.__name__))
    print([[texts[i] for i in s[1:]]  for s in ind])
    print("\n")

In [None]:
print_neighbors(euclidean, df['texte'], bow, 24)
print_neighbors(cosine, df['texte'], bow, 24)

print_neighbors(euclidean, df['texte'], bow_a, 24)
print_neighbors(cosine, df['texte'], bow_a, 24)

In [11]:
from sklearn.preprocessing import normalize

def tfidf_transform(bow):
    """
    Inverse document frequencies applied to our bag-of-words representations
    """
    # IDF
    d = float(bow.shape[0]) + 1.0
    in_doc = bow.astype(bool).sum(axis=0) + 1.0
    idfs = np.log(d / in_doc) + 1.0
    # TF
    sum_vec = bow.sum(axis=1)
    tfs = bow / np.expand_dims(sum_vec + 1.0, axis=1)
    tf_idf = tfs * np.expand_dims(idfs,axis=0)
    return tf_idf

In [None]:
tfidf = tfidf_transform(bow)
print(tfidf.shape)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Create and fit the vectorizer to the training data
tfidf_vectorizer = TfidfVectorizer()
Tfidf = tfidf_vectorizer.fit_transform(df['texte'])
tfidf_a = Tfidf.toarray()
print(tfidf_a.shape)

In [None]:
print_neighbors(euclidean, df['texte'], tfidf_a, 24)
print_neighbors(cosine, df['texte'], tfidf_a, 24)
# Formatage
print_neighbors(euclidean, df['texte'], tfidf, 24)
print_neighbors(cosine, df['texte'], tfidf, 24)

In [15]:
import altair as alt
import pandas as pd

In [16]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, whiten=True)
docs_pca = pca.fit_transform(tfidf)

In [27]:
data = pd.DataFrame({'x': docs_pca[:,0],
                     'y': docs_pca[:,1],
                     'texte': df['texte']})
                     #'Category': categories_l})

In [None]:
alt.Chart(data[:]).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['texte']
    ).interactive().properties(
    width=500,
    height=500
)

### III - 2 With T-SNE

From the ```sklearn``` documentation: 
- t-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. **with different initializations we can get different results**.
- In particular, t-SNE has the advantage to reveal data that lie in multiple, different, manifolds or clusters.
- It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples.

From this recommendation, we will initialize ```TSNE``` with PCA (choosing the argument ```init='pca'``` when creating the class).

In [29]:
from sklearn.manifold import TSNE

In [None]:
docs_tsne = TSNE(n_components=2, learning_rate='auto',
                  init='pca').fit_transform(tfidf)
print(docs_tsne.shape)

In [31]:
data = pd.DataFrame({'x': docs_tsne[:,0],
                     'y': docs_tsne[:,1],
                     'texte': df['texte']})
                     #'Category': categories_l})

In [None]:
alt.Chart(data[:]).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['texte']
    ).interactive().properties(
    width=500,
    height=500
)

<div class='alert alert-block alert-warning'>
            Question:</div>
                        
- Is there any conclusion we can draw with respect to the lexical features and how they allow us to group the documents in this dataset ? 

### III - 3 Topic modeling

Now, the goal is to re-use the bag-of-words representations we obtained earlier - but reduce their dimension before visualization. 

The underlying idea is to **take advantage of the latent structure in the association between the set of
words and the set of documents**. Many methods have been designed to do this - the earliest being **topic models**. 

Note that this allows to obtain reduced document representations, in a **topic space, common to documents and words** - where each document is described as a vector of topics and for each topic, we have access to the importance of words. 


We will do this with two models:
- Using the ```TruncatedSVD```, we will **linearly** reduce the dimension of our BOW representations. This is called *Latent Semantic Analysis* (LSA). 
- Using a *generative model* based on several assumptions on how a document is generated through topics, which the model will retrieve: this is ```LatentDirichletAllocation``` (LDA).

We use here another dataset from this [paper](https://aclanthology.org/2024.latechclfl-1.28/) which includes quite more categories and will be more interesting to explore, as we can expect it to contain clusters clearly visible through looking at lexical features. You can find the dataset on their [git repository](https://git.unistra.fr/thealtres/stage-direction-classif-french-transfer-learning).

First, apply the same pipeline than before:
- Does the data need to be cleaned and pre-processed ?
- Obtain BOW and TF-IDF representations.
- Visualize them with T-SNE.

<div class='alert alert-block alert-info'>
            Code:</div>

In [None]:
voc_th, bow_th = count_words(df['texte'])
print(bow_th.shape)

In [None]:
tfidf_th = tfidf_transform(bow_th)
print(tfidf_th.shape)

In [None]:
docs_tsne_th = TSNE(n_components=2, learning_rate='auto',
                    init='pca').fit_transform(tfidf_th)
data_th = pd.DataFrame({'x': docs_tsne_th[:,0],
                        'y': docs_tsne_th[:,1],
                        'Text': df['texte']})
                        #'Category': df['labelGeneric']
alt.data_transformers.disable_max_rows()
alt.Chart(data_th[:]).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['Text']
    ).interactive().properties(
    width=500,
    height=500
)

In [39]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

**Latent Semantic Analysis**: let us choose an arbitrary number of topics - which will be the size of the joint *topic space*.

In [40]:
n_topics = 50
lsa = TruncatedSVD(n_components = n_topics)
lsa_topics = lsa.fit_transform(tfidf_th)

In [None]:
# Correspondances between documents and topics
print(lsa_topics.shape)
# Correspondances between topics and words
print(lsa.components_.shape)

In [42]:
# Reversing the vocabulary to retrieve words from indexes, allowing to find the most important words for each topic
rev_voc_th = {i: w for w, i in voc_th.items()}

In [43]:
def most_important_words(n, reverse_vocabulary, topic_model):
    out = []
    for i, topic in enumerate(topic_model.components_):
        out.append([reverse_vocabulary[j] for j in topic.argsort()[:-n-1:-1]])
    return out

In [None]:
words = most_important_words(8, rev_voc_th, lsa)
for i, topic in enumerate(words[:15]):
    print("Topic ", i+1, " : ", topic)

With a dataset this size, over **short texts**, it is difficult to interpret the topics (many short words, even with TF-IDF). Let's apply T-SNE ! 

In [None]:
docs_tsne_th = TSNE(n_components=2, learning_rate='auto',
                  init='pca', metric='cosine', perplexity=50.0).fit_transform(lsa_topics)
print(docs_tsne_th.shape)

data_th = pd.DataFrame({'x': docs_tsne_th[:,0],
                        'y': docs_tsne_th[:,1],
                        'Text': df['texte']})
                        #'Category': AS13_df['labelGeneric']})

alt.data_transformers.disable_max_rows()
alt.Chart(data_th[:]).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['Text']
    ).interactive().properties(
    width=500,
    height=500
)


**Latent Dirichlet Allocation**: 

In [49]:
lda = LatentDirichletAllocation(n_components = n_topics)
lda_topics_th = lda.fit_transform(bow_th)

In [None]:
words = most_important_words(8, rev_voc_th, lda)
for i, topic in enumerate(words[:15]):
    print("Topic ", i+1, " : ", topic)

In [None]:
lda_topics_th.shape

In [None]:
docs_tsne_th = TSNE(n_components=2, learning_rate='auto',
                  init='pca', metric='cosine', perplexity=50.0).fit_transform(lda_topics_th)
print(docs_tsne_th.shape)

data_th = pd.DataFrame({'x': docs_tsne_th[:,0],
                        'y': docs_tsne_th[:,1],
                        'Text': df['texte']})
                        #'Category': AS13_df['labelGeneric']

alt.data_transformers.disable_max_rows()
alt.Chart(data_th[:]).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['Text']
    ).interactive().properties(
    width=500,
    height=500
)

<div class='alert alert-block alert-warning'>
            Further question:</div>
  
- Are there any other features that we could consider with the same tools (Second dataset) ?

<div class='alert alert-block alert-info'>
            Code:</div>
            
- Apply the pipeline to obtain a t-sne visualisation over these proposed features. Did it work as expected ? 

### III - 4 Take away

**Idea**: the key to improving representations is to embed data capturing text statistics in a compact space.

But how ? 
Let's look at how a compact **modern (deep learning based) model** can better capture what's happening in our dataset:

In [18]:
%%capture output

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")
model.eval()

In [30]:
# This code is very inefficient: it will take documents one by one and make them go through the model
# We can usually process several of them together to gain time: this is called batching
# Batching may require a large quantity of memory, and to avoid any issue when running this locally,
# we will keep this (very slow and) inefficient solution. 
vectors = []
for i, example in enumerate(df['texte'].tolist()):
    inputs = tokenizer(example, truncation=True, padding=True, return_tensors="pt")
    outputs = model(**inputs)
    vectors.append(outputs.last_hidden_state[0,0,:].detach().numpy()[np.newaxis, :])

In [None]:
# The model outputs vectors of size 768
cam_rep = np.concatenate(vectors, axis=0)
print(cam_rep.shape)

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0)


kmeans.fit(cam_rep)


labels = kmeans.labels_


print(labels)

In [None]:
docs_tsne_th = TSNE(n_components=2, learning_rate='auto',
                    init='random', metric='cosine',
                    perplexity=50.0).fit_transform(cam_rep)
print(docs_tsne_th.shape)

data_th = pd.DataFrame({'x': docs_tsne_th[:,0],
                        'y': docs_tsne_th[:,1],
                        'Text': df['texte'],
                        'labels': labels
                        })

alt.data_transformers.disable_max_rows()
alt.Chart(data_th[:]).mark_circle(size=200).encode(
    x="x", y="y", color='labels',
    tooltip=['Text']
    ).interactive().properties(
    width=500,
    height=500
)

We will see how such a model (*CamemBERT*) works (relatively) soon ! 