In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import torch
import altair as alt

# Load the preprocessed corpus.
df = pd.read_csv('data_preprocessed.csv')

df.head(10)
# Keep only rows that have a sector. read_csv parses "NaN" cells as missing
# values, so comparing against the string 'NaN' alone never drops anything;
# notna() removes the real missing values, and the string comparison is kept
# in case the column still holds the literal text 'NaN'.
df_ex = df[df['Sector'].notna() & (df['Sector'] != 'NaN')]
df_ex.shape[0]

%%capture output

from transformers import AutoModel, AutoTokenizer

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained tokenizer and encoder; the encoder is moved onto `device`.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base").to(device)

# Switch the module to evaluation mode.
model.eval()

# Silhouette scores; appended to by the k-means loop further down.
score_tableau = []

# Encode each document separately and keep, as its embedding, the last hidden
# state at token position 0 of the (single) sequence in the batch.
vectors = []
# Inference only: without no_grad() every forward pass would record an
# autograd graph that is never used.
with torch.no_grad():
    for example in df['text_processed'].tolist():
        inputs = tokenizer(example, truncation=True, padding=True, return_tensors="pt")
        # Move every tensor produced by the tokenizer to the model's device.
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model(**inputs)
        # Add a leading axis so the per-document vectors can be concatenated row-wise.
        vectors.append(outputs.last_hidden_state[0, 0, :].detach().cpu().numpy()[np.newaxis, :])
# One row per document.
cam_rep = np.concatenate(vectors, axis=0)
print(cam_rep.shape)
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# Sweep the number of clusters from 2 to 9 and append the silhouette score of
# each k-means partition of the embeddings to score_tableau (one entry per k).
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(cam_rep)
    score = silhouette_score(cam_rep, labels)
    score_tableau.append(score)
    # Label the progress line with the k that was just scored.
    print(f"k={k}: silhouette={score:.4f}")

# The t-SNE projection below is disabled: it sits inside a bare string
# literal, so none of it is executed.
"""
docs_tsne_th = TSNE(n_components=1, learning_rate='auto',
                    init='random', metric='cosine',
                    perplexity=50.0).fit_transform(cam_rep)
print(docs_tsne_th.shape)
"""

# Two-component, whitened PCA; fitted further down on the dense TF-IDF matrix.
pca = PCA(
    whiten=True,
    n_components=2,
)

# TF-IDF representation of the processed texts.
tfidf_vectorizer = TfidfVectorizer()
Tfidf = tfidf_vectorizer.fit_transform(df['text_processed'])

# Convert the TF-IDF result to a dense array and report its shape.
tfidf_a = Tfidf.toarray()
print(tfidf_a.shape)

# Project every document onto the two retained components.
docs_pca = pca.fit_transform(tfidf_a)

# Plot table: the two PCA coordinates of each document plus its text, which
# is shown as the tooltip.
data = pd.DataFrame({'x': docs_pca[:, 0],
                     'y': docs_pca[:, 1],
                     'texte': df['text_processed']})
                     #'Category': categories_l})

# Interactive Altair scatter plot. No matplotlib figure is opened here: the
# chart is rendered by Altair, so a plt.figure() call would only produce an
# empty, unrelated figure. The chart is left as the final expression so a
# notebook displays it.
alt.Chart(data).mark_circle(size=200).encode(
    x="x", y="y",# color='Category',
    tooltip=['texte']
    ).interactive().properties(
    width=500,
    height=500
)