In [2]:
!pip install pandas





In [3]:
import pandas as pd

# On charge les données depuis un fichier csv

# Elon Musk tweets dataset
https://www.kaggle.com/datasets/yasirabdaali/elon-musk-tweets-dataset-17k
Ce dataset contient 17 000 tweets d'Elon Musk, le fondateur de Tesla et SpaceX. Notre objectif est de faire un topic modeling sur ces tweets afin de voir quels sont les sujets qui sont abordés dans les tweets d'Elon Musk.
Les features sont les suivantes:
* Date Created
* Number of Likes
* Source of Tweet
* Tweets

In [4]:
datas = pd.read_csv("elonmusk.csv")

# Nettoyage des données
* On commence par supprimer les colonnes qui ne nous intéressent pas pour notre analyse(la date du tweet, le nombre de likes et la source du tweet)

In [5]:
datas = datas.drop(columns=['Date Created', 'Number of Likes', 'Source of Tweet'])
datas

Unnamed: 0,Tweets
0,@teslaownersSV @cb_doge @Tesla @mayemusk I gue...
1,@cb_doge @Tesla @mayemusk Still doing same thi...
2,Looks good to roll out to all Tesla owners wit...
3,@Tesla__Mania @WholeMarsBlog That is probably ...
4,@WholeMarsBlog Real-world validation &amp; bil...
...,...
17432,That was a total non sequitur btw
17433,"Great Voltaire quote, arguably better than Twa..."
17434,I made the volume on the Model S http://t.co/w...
17435,Went to Iceland on Sat to ride bumper cars on ...


In [6]:
!pip install spacy





* Suppression des mots vides

In [7]:
import spacy

 On charge les stopwords en anglais et on y ajoute les expressions qu'il utilise souvent dans ses tweets (exemple woohoo, cool, etc...) 

In [8]:
from spacy.lang.en import English

nlp = English()
# liste des mots vides en anglais
stop_words = spacy.lang.en.stop_words.STOP_WORDS
stop_words.add("woohoo")
stop_words.add("co")
stop_words.add("true")
stop_words.add("amp")
stop_words.add("rt")
stop_words.add("ok")
stop_words.add("yes")
stop_words.add("um")
stop_words.add("yup")
stop_words.add("maybe")
stop_words.add("good")
stop_words.add("thanks")
stop_words.add("awesome")
stop_words.add("yeah")
stop_words.add("like")
stop_words.add("haha")
stop_words.add("cool")


In [9]:
stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'amp',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'awesome',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'co',
 'cool',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly'

In [10]:
import re

# Fonction de nettoyage des tweets
* On supprime les mots commençant par un @ car ce sont des mentions à d'autres utilisateurs
* On supprime les liens
* On supprime les caractères spéciaux

In [11]:
# Fonction pour le pré-traitement des textes
def preprocess(list_text):
    clean_text = []
    for text in list_text:
        text = str(text).strip().lower()
        doc = nlp(text)
        token_list = []
        for word in doc:
            # (str(word).isdigit() == False):
            if (str(word).lower() not in stop_words) and (word.is_punct == False) and (re.search('@+.|[0-9]|https*|http*', str(word)) == None) and  (str(word).isascii() == True):
                token_list.append(word.text)
        clean_text.append(' '.join(token_list))
    return clean_text


On applique la fonction de nettoyage sur les tweets

In [12]:
clean_text = preprocess(datas['Tweets'])

## Résultat du nettoyage

In [13]:
clean_text

['guess joe mode quieter',
 'thing bigger',
 'looks roll tesla owners cars',
 'probably right order magnitude',
 'real world validation billions miles real world training fsd superhuman',
 '',
 'sigh',
 'fsd beta rolling note priority safety expect overly cautious especially pedestrians',
 'falcon arching orbit',
 'obvious limit rocket reflight far',
 'instagram envy amplifier',
 '',
 'team found bug causing delay detecting pedestrians m. fixed rolling tonight',
 '',
 'pretty accurate',
 '',
 'astronomy',
 'internal beta rollout tonight wider tomorrow',
 'coming',
 'materials science wo regret',
 'major fed rate hike risks deflation',
 'lmk happens',
 'absolutely tesla fleet australia growing rapidly need ramp service general',
 'tesla north america aiming hour service',
 '',
 'anubis',
 'nice dressing sink knocking random doors choice let sink',
 'complex missions',
 'bot block party',
 'assuming efficient pv cells surface coverage',
 'exactly ecosystem entirely dependent sun \n\n civ

# On transforme les mots en matrice de fréquence

In [14]:
!pip install sklearn





In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [16]:
tfidf_vect = TfidfVectorizer()
tf = CountVectorizer()

# * LDA est une méthode probabiliste, on utilise donc CountVectorizer (nombre d'apparition des mots) 

# * NMF est une méthode non probabiliste, on utilise donc TfidfVectorizer (poids des mots)

In [17]:
text_lda = tf.fit_transform(clean_text) # LDA est une méthode probabiliste, on utilise donc CountVectorizer
text_nmf = tfidf_vect.fit_transform(clean_text) # NMF est une méthode non probabiliste, on utilise donc TfidfVectorizer

On récupère la liste des features ou mots

In [18]:
terms_nmf = tfidf_vect.get_feature_names()
terms_lda = tf.get_feature_names()

Matrice TF des mots. On utilise les 15000 premiers tweets, le reste sera utilisé pour mesurer le perplexité et la cohérence du modèle.

In [19]:
df_lda = pd.DataFrame(text_lda[:15000].toarray(), columns=terms_lda)
df_lda # Affichage d'une partie de la matrice

Unnamed: 0,aaargh,aargh,abandoned,abandoning,abdomen,aber,abide,ability,ablate,ablative,...,zooming,zooms,zootopia,zorbix,zork,zu,zuck,zukunft,zune,zx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Matrice TF-IDF des mots. On utilise les 15000 premiers tweets, le reste sera utilisé pour mesurer le perplexité et la cohérence du modèle.

In [20]:
df_nmf = pd.DataFrame(text_nmf[:15000].toarray(), columns=terms_nmf)
df_nmf


Unnamed: 0,aaargh,aargh,abandoned,abandoning,abdomen,aber,abide,ability,ablate,ablative,...,zooming,zooms,zootopia,zorbix,zork,zu,zuck,zukunft,zune,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# On entraîne 2 modèles avec les méthodes LDA (Latent Dirichlet Allocation) et NMF (Non-Negative Matrix Factorization)

In [21]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Nous allons d'abord essayer de déterminer les meilleurs paramètres pour nos modèles. Pour cela, on va utiliser la méthode GridSearchCV qui va nous permettre de tester plusieurs combinaisons de paramètres et de choisir les meilleurs.

In [25]:
from sklearn.model_selection import GridSearchCV


Evaluation des meilleurs paramètres pour LDA

In [26]:
search_params = {'n_components': [5, 10, 15], 'learning_decay': [.5, .7, .9]} # Paramètres à tester pour LDA
# on initialise LDA sans paramètres
lda = LatentDirichletAllocation()
# Initialise Grid Search 
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(df_lda)


GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 10, 15]})

In [27]:
print("Meilleurs paramètres pour LDA: ", model.best_params_)


Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}


On entraîne un modèle avec LDA

In [22]:
lda = LatentDirichletAllocation(n_components=5, max_iter=5,
                                learning_method='online', learning_offset=50., random_state=0, learning_decay=0.5).fit(text_lda)



On entraîne un modèle avec NMF

In [23]:
nmf = NMF(n_components=5, random_state=1,
          alpha=.1, l1_ratio=.5, init='nndsvd').fit(text_nmf)


# On affiche les 10 mots les plus significatifs de chaque topic

In [24]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic {}:".format(topic_idx), end=' ')
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


nb_top_words = 10
print("---------------------LDA:--------------------------------")
display_topics(lda, terms_lda, nb_top_words)
print("---------------------NMF:--------------------------------")
display_topics(nmf, terms_nmf, nb_top_words)


---------------------LDA:--------------------------------
Topic 0: cars tesla future company design looks battery life low kids
Topic 1: tesla people new soon right coming space lot work time
Topic 2: actually know think years article boring definitely mission called little
Topic 3: launch rocket high landing test exactly flight want air change
Topic 4: model tesla great car falcon spacex year dragon love rocket
---------------------NMF:--------------------------------
Topic 0: exactly love said question point looking sigh makes monday money
Topic 1: sure hope way pretty sounds year probably need early favorite
Topic 2: great idea work game team sounds thread pretty song shot
Topic 3: soon coming real improvements year update fun software feature sorry
Topic 4: tesla team model car work year cars spacex production time


# Evaluation des modèles

Perplexité du modèle LDA.

In [25]:
test_lda = pd.DataFrame(text_lda[15000:].toarray(), columns=terms_lda)
test_nmf = pd.DataFrame(text_nmf[15000:].toarray(), columns=terms_nmf)

* Perplexité du modèle par rapport à la base test

In [26]:
lda.perplexity(test_lda)

31354.474622092715

* Perplexité du modèle par rapport à la base d'entraînement

In [27]:
lda.perplexity(text_lda)

7672.461000310403