In [None]:

# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install gensim
# !pip install spacy
# !pip install nltk
# re, logging and warnings ship with the Python standard library: no pip install needed
# !pip install pyLDAvis


In [None]:
import sys
# !{sys.executable} -m spacy download en_core_web_sm
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Function to lemmatize the text using SpaCy
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of token lists. Each list is joined with single spaces
            and passed through the module-level ``nlp`` pipeline.
        allowed_postags: values of ``token.pos_`` that are kept.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]

In [None]:
# Load the three tweet exports and stack them into a single frame.
data = pd.read_csv("cancer_sport_analyse.csv")
data2 = pd.read_csv("cancer_fasting_analyse.csv")
data3 = pd.read_csv("cancer_cannabis.csv")

# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it each
# file keeps its own 0-based labels, so labels repeat across files and any later
# label-based access by document number (e.g. tweets_df.at[i, ...]) hits several
# rows at once or silently appends new ones.
tweets_df = pd.concat([data, data2, data3], ignore_index=True)

tweets_df.shape

In [None]:
# Preview the first rows of the combined tweets frame.
tweets_df.head()

In [None]:
import re

# Strip links that start with http: or https:
def remove_url(text):
    """Return ``text`` with every ``http:`` / ``https:`` link removed.

    A link is the scheme followed by all non-whitespace characters up to the
    next whitespace. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'https?:\S*', '', text)
# Clean the corrected-text column in place; remove_url returns non-string cells unchanged.
tweets_df['Texte corrigé']=tweets_df['Texte corrigé'].apply(remove_url)
def remove_url1(text):
    """Return ``text`` with every ``http:`` / ``https:`` link removed.

    The previous pattern was ``http?:``, where ``?`` made the letter ``p``
    optional: it matched ``htt:`` and ``http:`` but never ``https:``. The
    pattern now makes the ``s`` optional, so both schemes are handled and
    unrelated ``htt:`` tokens are left alone.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'https?:\S*', '', text)
# Second link-removal pass over the same column, using remove_url1.
tweets_df['Texte corrigé']=tweets_df['Texte corrigé'].apply(remove_url1)

In [None]:
def sent_to_words(sentences):
    """Lazily tokenise raw texts, yielding one token list per input item.

    For each string the generator:
      1. removes every whitespace-delimited token containing ``@`` (e-mail
         addresses; this also removes @mentions),
      2. collapses runs of whitespace, newlines included, to a single space,
      3. strips single quotes,
      4. tokenises with ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Non-string items yield an empty list, so the output keeps one entry per
    input item.

    Args:
        sentences: iterable of raw texts.

    Yields:
        list of tokens for each item, in input order.
    """
    # Raw strings: in a plain literal "\S" and "\s" are invalid escape sequences,
    # which recent Python versions warn about. Compiled once, outside the loop.
    at_token = re.compile(r'\S*@\S*\s?')
    whitespace_run = re.compile(r'\s+')

    for sent in sentences:
        if not isinstance(sent, str):
            yield []
            continue
        sent = at_token.sub('', sent)          # remove emails / @-tokens
        sent = whitespace_run.sub(' ', sent)   # remove newline chars
        sent = sent.replace("'", "")           # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Convert the cleaned text column to a plain Python list.
# Note: this rebinds `data`, which previously held the first CSV's DataFrame.
data = tweets_df['Texte corrigé'].values.tolist()
# Materialise the generator: one token list per row of the column.
data_words = list(sent_to_words(data))
# Sanity check: show the tokens of the first document.
print(data_words[:1])

In [None]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenised documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed documents, so it can join
# an already-merged pair with a neighbouring token.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser wrappers used to apply the trained models to documents
# (presumably lighter-weight frozen versions of the models; confirm against the gensim docs).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# One-time setup (run in a terminal): python3 -m spacy download en_core_web_sm
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams/trigrams, lemmatize, then remove stopwords again.

    Args:
        texts: iterable of documents (token lists). Each document is passed
            through ``str()`` and ``simple_preprocess`` before filtering.
        stop_words: words to drop, both before phrase detection and again
            after lemmatization. The argument is not modified.
        allowed_postags: POS tags kept by the lemmatization step.

    Returns:
        A list with one list of cleaned, lemmatized tokens per document.
    """
    # A set makes each membership test O(1); the caller's list is left untouched.
    stop_set = set(stop_words)

    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]
    # NOTE(review): the bigram model is applied here and again inside the trigram
    # step below. Kept as-is to preserve results; confirm whether one pass suffices.
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Reuse the shared lemmatization() helper and the module-level spaCy pipeline
    # (same model and disabled components) instead of reloading the model on every call.
    texts_out = lemmatization(texts, allowed_postags=allowed_postags)

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts_out]
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!

In [None]:
# Vocabulary built from the processed documents
id2word = corpora.Dictionary(data_ready)

# Term-document frequency corpus: every processed document converted with doc2bow
corpus = list(map(id2word.doc2bow, data_ready))

# Train the LDA topic model (8 topics, fixed random_state)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    alpha='symmetric',
    passes=10,
    iterations=100,
    chunksize=10,
    update_every=1,
    per_word_topics=True,
    random_state=100,
)

pprint(lda_model.print_topics())


In [None]:
# Compute coherence score for the trained model on the processed documents.
# No `coherence=` argument is passed, so the library's default measure is used.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

# Print topics (same call as at the end of the model-training cell)
pprint(lda_model.print_topics())

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a per-document table of dominant topic, its weight and its keywords.

    Args:
        ldamodel: trained topic model. It is indexed with ``corpus`` and queried
            through ``show_topic`` and ``per_word_topics``. Required despite the
            ``None`` default.
        corpus: bag-of-words corpus fed to the model. The default is bound when
            the function is defined, so pass it explicitly.
        texts: one entry per document, appended as the last column. The default
            is likewise bound at definition time.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords``, followed by the ``texts``
        column (unnamed, so it appears as column ``0``).

    Raises:
        TypeError: if ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained model for 'ldamodel'")

    keywords_by_topic = {}  # topic id -> "kw1, kw2, ..." (computed once per topic)
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first element
        # is the list of (topic, weight) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic reported for this document: add no row (unchanged behaviour).
            continue
        # Dominant topic = highest weight; on ties the first pair wins, which is
        # what a stable descending sort would also put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_id = int(topic_num)
        if topic_id not in keywords_by_topic:
            keywords_by_topic[topic_id] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_id, round(float(prop_topic), 4), keywords_by_topic[topic_id]))

    # Build the frame once, rather than concatenating one row at a time (quadratic).
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format: expose the row number as an explicit document-number column
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)


In [None]:
# Let text columns display up to 100 characters
pd.options.display.max_colwidth = 100

# For every dominant topic, keep the single document with the highest contribution
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
sent_topics_sorteddf_mallet = pd.concat(
    [
        topic_rows.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, topic_rows in sent_topics_outdf_grpd
    ],
    axis=0,
)

# Renumber the rows from zero
sent_topics_sorteddf_mallet = sent_topics_sorteddf_mallet.reset_index(drop=True)

# Friendlier column names
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
# Dominant topic and topic weights for a slice of the corpus
def topics_per_document(model, corpus, start=0, end=1):
    """Collect the dominant topic and the topic weights of ``corpus[start:end]``.

    Args:
        model: object indexable by a document; ``model[doc]`` must unpack into
            exactly three values, of which only the first (a sequence of
            ``(topic, weight)`` pairs) is used.
        corpus: sliceable sequence of documents.
        start, end: slice bounds applied to ``corpus``.

    Returns:
        ``(dominant_topics, topic_percentages)`` where ``dominant_topics`` is a
        list of ``(position_in_slice, topic)`` pairs and ``topic_percentages``
        is the list of per-document ``(topic, weight)`` sequences.
    """
    dominant_topics, topic_percentages = [], []
    for position, bow in enumerate(corpus[start:end]):
        doc_topics, _word_topics, _word_phis = model[bow]
        # Highest-weighted topic first
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        dominant_topics.append((position, ranked[0][0]))
        topic_percentages.append(doc_topics)
    return dominant_topics, topic_percentages

# Run over the whole corpus. Note: end=-1 would slice corpus[0:-1] and silently
# leave the last document out of every statistic below.
dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=len(corpus))

# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

# Total Topic Distribution by actual weight (sum of each topic's weight over all documents)
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

# Top 3 Keywords for each Topic
topic_top3words = [(topic_id, word)
                   for topic_id, words in lda_model.show_topics(formatted=False)
                   for word, _ in words[:3]]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
# One row per topic, its three keywords joined with ', \n'
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words = df_top3words.reset_index(level=0)

In [None]:
import pandas as pd
# Record the pandas version in use for this run.
print(pd.__version__)

In [None]:
import pyLDAvis
try:
    # Newer pyLDAvis releases (3.x) renamed the gensim adapter module to `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualisation from the trained model and its corpus.
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

In [None]:
# Export the prepared visualisation to an HTML file next to the notebook.
pyLDAvis.save_html(vis, 'lda_visualization.html')


In [None]:
# Add a new column for the topic labels
tweets_df['Topic_Label'] = ""

# Map topic ids to human-readable topic names.
# NOTE(review): these labels are tied to the topic ids of one specific trained
# model; re-check them whenever the model or its inputs change.
topic_labels_mapping = {
    0 : "Traitement et recherche sur le cancer",
    1 : "Combat contre le cancer et activisme",
    2 : "Santé et bien-être liés au cancer",
    3 : "Études cliniques et croissance tumorale",
    4 : "Effets des cannabinoïdes sur le cancer",
    5 : "Utilisation médicale du cannabis dans le traitement du cancer",
    6 : "Impact environnemental du traitement du cancer",
    7 : "Potentiel thérapeutique du cannabis dans la lutte contre le cancer"
}

# Assign labels according to each document's dominant topic.
# Document_No is the document's POSITION in the corpus (0..N-1), so the write is
# positional (.iloc). A label-based write (.at) is wrong whenever tweets_df's index
# is not exactly 0..N-1: duplicate labels would update several rows at once, and
# missing labels would append new, otherwise-empty rows.
label_column = tweets_df.columns.get_loc('Topic_Label')
row_positions = df_dominant_topic['Document_No'].to_numpy()
# Strict lookup: an unknown topic id raises KeyError instead of being skipped.
topic_labels = [topic_labels_mapping[topic_id] for topic_id in df_dominant_topic['Dominant_Topic']]
tweets_df.iloc[row_positions, label_column] = topic_labels


In [None]:
# Path and file name used to save the labelled DataFrame
output_file_path = "tweets_with_labels.csv"

# Write the labelled DataFrame to a semicolon-separated CSV, without the index column
tweets_df.to_csv(output_file_path, sep=';', index=False)

# Report where the file was written
print("Le DataFrame avec les étiquettes a été sauvegardé dans :", output_file_path)


In [None]:
# Importez pandas
import pandas as pd

# Read the labelled CSV back into a DataFrame (same ';' separator used when saving)
t_df = pd.read_csv("tweets_with_labels.csv", sep = ";", skip_blank_lines=True)
# Show the DataFrame's shape (rows, columns)
print(t_df.shape)

In [None]:
# Rows whose Topic_Label is missing after the CSV round-trip.
null_values = t_df[t_df['Topic_Label'].isnull()]


In [None]:
# Rows whose Topic_Label value also appears on another row (keep=False flags every occurrence).
# NOTE(review): only a handful of distinct labels exist, so this presumably selects
# almost every row. Confirm this is the intended duplicate check.
duplicates = t_df[t_df.duplicated(subset=['Topic_Label'], keep=False)]


In [None]:
# Drop rows with null values in the "Topic_Label" column (currently disabled)
#t_df = t_df.dropna(subset=['Topic_Label'])

# Keep only the first row for each value of the 'Fichier' column.
# NOTE(review): the original comment said this de-duplicated "Topic_Label", but the
# code keys on 'Fichier'. Confirm which column is intended.
t_df = t_df.drop_duplicates(subset=['Fichier'], keep='first')


In [None]:
# Shape after de-duplication, for comparison with the shape printed right after loading.
print(t_df.shape)