### Topic Modeling per Grazia

In questo notebook ci sono le utility che ho usato per costruire il modello. Nella repo, invece, trovi:
- nodes-with... : il tuo csv
- LDA_15: modello con 15 topics
- LDA_25: modello con 25 topics
- mallet-2.0.8: una versione del modello LDA, ottimizzata in Java. [http://mallet.cs.umass.edu/topics.php]
- corpus/topic_corpus.mm: il corpus serializzato
- wiki_txt: i testi (in utf-8 encoding) delle pagine che mi hai indicato
- links.json: un json (dizionario) chiave-valore/titolo-links nella pagina
- phrasers/bigram_model.pkl: un phraser (custom sul corpus in questione) che considera i bigrammi
- phrasers/trigram_model.pkl: un phraser (custom sul corpus in questione) che considera i trigrammi
- wikitdm: ho modificato il progetto di un tizio che permetteva, a partire da una directory di file txt, di creare direttamente una matrice tf-idf, ma poi non l'ho usato perché è troppo basilare. Lo aggiungo per completezza.

#### Links

- Coherence Topic Model: [http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf]

In [13]:
import pandas as pd
import wikipediaapi, os
import matplotlib.colors as mcolors
import gensim, pyLDAvis

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from pyLDAvis import gensim as gm
from gensim.corpora import TextCorpus, Dictionary
from gensim.models.phrases import Phraser, Phrases
from gensim.models import TfidfModel
from gensim.models.coherencemodel import CoherenceModel

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Directory that is scanned below for the page text files.
save_dir = "/home/nicolo/Documenti/projects/wiki/wiki_txt/"

# Wikipedia section headings; not referenced by any other visible cell.
# NOTE(review): presumably meant for stripping boilerplate sections - confirm.
remove_title = [
    "See also",
    "References",
    "Further reading",
    "External links",
    "History",
    "Critiques",
    "Notes",
    "Publications",
    "Controversies",
]

# NLTK English stop-word list, held as a set.
stop_words = set(stopwords.words('english'))

# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Full path of every entry of save_dir whose name ends in "txt",
# in the order returned by os.listdir.
path = [
    os.path.join(save_dir, entry)
    for entry in os.listdir(save_dir)
    if entry.endswith("txt")
]

# Location of the Mallet launcher script handed to the LdaMallet wrapper.
mallet_path = "/home/nicolo/Scaricati/mallet-2.0.8/bin/mallet"

In [None]:
# Load the node table from CSV and keep the values of its `name` attribute
# as a plain Python list.
# NOTE(review): presumably `name` holds Wikipedia page titles - confirm.
wiki_data = pd.read_csv("/home/nicolo/Documenti/projects/wiki/nodes-with-modularity-02122019csv.csv")
wiki_data_names = list(wiki_data.name)

In [None]:
# Wikipedia client configured for the English edition and the WIKI extract format.
wiki = wikipediaapi.Wikipedia(language="en", extract_format=wikipediaapi.ExtractFormat.WIKI)

In [None]:
def extract_text_from_pages(list_of_names:list, wiki_object, save_dir:str):
    """Save the text of each requested page to disk and collect its links.

    For every name in ``list_of_names`` the page is requested from
    ``wiki_object``, its text is written to ``save_dir`` as a UTF-8 file
    named after the page title (spaces and slashes replaced by underscores,
    ``.txt`` appended), and its links are recorded.

    Args:
        list_of_names: Page names to request.
        wiki_object: Object exposing ``page(name)``; the returned page must
            provide ``title``, ``links`` (a mapping keyed by link title) and
            ``text``.
        save_dir: Existing directory the text files are written to.

    Returns:
        A ``(links_by_title, fail)`` tuple. ``links_by_title`` maps each
        successfully processed page title to the list of its links whose
        title contains no ``":"``. ``fail`` lists the requested names that
        raised an error.
    """
    wiki_json_data, fail = {}, []

    for wiki_page in list_of_names:
        try:
            page = wiki_object.page(wiki_page)
            # Titles may contain spaces and slashes; neither is wanted in a
            # file name.
            format_name = (page.title.replace(" ", "_") + '.txt').replace("/", "_")
            # Skip links whose title contains ":".
            links = [link for link in page.links if ":" not in link]
            with open(os.path.join(save_dir, format_name), "w", encoding="utf-8") as wiki_txt:
                wiki_txt.write(page.text)
        except Exception as e:
            # Best effort: one failing page must not abort the whole batch.
            fail.append(wiki_page)
            print("error {} in {}".format(e, wiki_page))
        else:
            # One entry per page. Updating fixed "name"/"links" keys would
            # overwrite the previous page on every iteration.
            wiki_json_data[page.title] = links

    return wiki_json_data, fail

def get_token(text_path: str, tokenizer, stop_words):
    """Read a UTF-8 text file and return its filtered, lower-cased tokens.

    Args:
        text_path: Path of the text file to read.
        tokenizer: Object exposing ``tokenize(text)`` that returns the raw
            tokens.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The tokens in document order, lower-cased, excluding stop words,
        purely numeric tokens and tokens shorter than three characters.
    """
    with open(text_path, encoding="utf-8") as file:
        text = file.read()

    token = []
    for raw in tokenizer.tokenize(text):
        # Lower-case before the stop-word test, otherwise capitalised stop
        # words ("The", "And") are not recognised and end up in the output.
        tok = raw.lower()
        if len(tok) >= 3 and not tok.isdigit() and tok not in stop_words:
            token.append(tok)
    return token

def compute_coherence_values(mallet_path, dictionary, corpus, texts, limit, start, step):
    """Train one Mallet LDA model per topic count and score its coherence.

    Args:
        mallet_path: Path handed to the ``LdaMallet`` wrapper.
        dictionary: Dictionary used both as the models' ``id2word`` mapping
            and by the coherence model.
        corpus: Corpus the models are trained on.
        texts: Texts handed to the coherence model.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple with one entry per topic
        count in ``range(start, limit, step)``; coherence is ``u_mass``.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the dictionary passed in. Relying on a module-level `dic`
        # fails (or silently picks a stale dictionary) when the caller keeps
        # its dictionary in a local variable.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: Topic model; ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic_id, weight)`` pairs (or a tuple whose first
            element is that list when ``ldamodel.per_word_topics`` is true),
            and ``ldamodel.show_topic(topic_id)`` a list of
            ``(word, weight)`` pairs.
        corpus: Corpus used to index ``ldamodel``.
        texts: One entry per document, appended as the last column.

    Returns:
        A DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords`` (comma-separated) and an
        unnamed column (label ``0``) holding ``texts``. A document with an
        empty topic list gets a row of missing values so that rows stay
        aligned with ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    rows = []

    # Collect plain tuples and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas 2.x.
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so later rows still line up with `texts`.
            rows.append((None, None, None))
            continue
        # max() returns the first of equally weighted topics, like a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(rows, columns=columns)
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

def create_models(trigrams_phraser,
                  mallet_path,
                  docs_path, 
                  tokenizer, 
                  stop_words, 
                  thres=20,
                  num_topics_start=5,
                  num_topics_end=25,
                  step=5):
    """Tokenize documents, build the corpus and sweep Mallet LDA topic counts.

    Args:
        trigrams_phraser: Path of the saved phraser loaded with
            ``Phraser.load``.
        mallet_path: Path handed to the ``LdaMallet`` wrapper.
        docs_path: Paths of the text files to tokenize.
        tokenizer: Tokenizer forwarded to ``get_token``.
        stop_words: Stop words forwarded to ``get_token``.
        thres: Minimum number of tokens a document needs to be kept.
        num_topics_start: First topic count to try.
        num_topics_end: Exclusive upper bound of the topic counts.
        step: Increment between consecutive topic counts.

    Returns:
        The ``(model_list, coherence_values)`` tuple produced by
        ``compute_coherence_values``.
    """
    trigrams = Phraser.load(trigrams_phraser)
    docs = [get_token(doc, tokenizer, stop_words) for doc in docs_path]
    # Drop short documents, then apply the phraser to the ones that remain.
    corpus_tok = [trigrams[doc] for doc in docs if len(doc) >= thres]
    dic = Dictionary(corpus_tok)
    corpus_ = [dic.doc2bow(text) for text in corpus_tok]
    # Score coherence on the phrased, filtered documents the models were
    # trained on. The raw `docs` contain neither the phrase tokens that
    # appear in the topics nor the same set of documents.
    model_list, coherence_values = compute_coherence_values(mallet_path,
                                                            dictionary=dic,
                                                            corpus=corpus_,
                                                            texts=corpus_tok,
                                                            start=num_topics_start,
                                                            limit=num_topics_end,
                                                            step=step)
    return model_list, coherence_values

In [None]:
# Tokenize every file in `path`; docs[i] holds the tokens of path[i].
docs = [get_token(doc, tokenizer, stop_words) for doc in path]

In [None]:
# Load the saved phraser from disk.
# NOTE(review): the notebook header lists it under phrasers/ - confirm this path.
trigrams = Phraser.load("/home/nicolo/Documenti/projects/wiki/trigram_model.pkl")

In [None]:
# Keep only documents with at least 20 tokens; each entry pairs the phrased
# document with its index in `docs` (and therefore in `path`).
corpus = [[trigrams[doc], i] for i, doc in enumerate(docs) if len(doc)>=20]

In [None]:
# Split the [tokens, index] pairs into two parallel lists.
corpus_tok = [c[0] for c in corpus]  # phrased token lists of the kept documents
ids = [c[1] for c in corpus]  # positions of the kept documents in `docs` / `path`

In [None]:
# Build the dictionary from the kept documents and convert each of them to
# its bag-of-words representation.
dic = Dictionary(corpus_tok)
corpus_ = [dic.doc2bow(text) for text in corpus_tok]

In [None]:
# Train Mallet LDA models for 20 and 25 topics (range(20, 30, 5)) and score
# their coherence. Coherence is computed on `corpus_tok`, the phrased and
# length-filtered documents the models are trained on; the raw `docs` contain
# neither the phrase tokens nor the same set of documents.
model_list, coherence_values = compute_coherence_values(mallet_path, dictionary=dic, corpus=corpus_, texts=corpus_tok, start=20, limit=30, step=5)

In [None]:
# Pick the second model of the sweep and persist it. Given the sweep
# parameters of the previous cell (start=20, limit=30, step=5) that is the
# 25-topic model.
optimal_model = model_list[1]
optimal_model.save("/home/nicolo/Documenti/projects/wiki/LDAmallet_25")
# Disabled inspection helpers, kept for manual debugging:
#model_topics = optimal_model.show_topics(formatted=False)
#print(optimal_model.print_topics(num_words=10))

In [None]:
# Convert the Mallet wrapper model with gensim's malletmodel2ldamodel helper.
# NOTE(review): presumably yields a native gensim LdaModel - confirm against
# the installed gensim version.
ldamodel_25 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(optimal_model)

In [None]:
# Persist the converted model.
ldamodel_25.save("/home/nicolo/Documenti/projects/wiki/LDA_25.model")

In [None]:
# One row per kept document: dominant topic, its weight, the topic keywords
# and the document tokens (unnamed column 0).
tab = format_topics_sentences(ldamodel_25, corpus_, corpus_tok)

In [None]:
# Paths of the documents that survived the length filter, in corpus order.
# Despite its name this list holds the KEPT paths. Indexing `path` directly
# avoids scanning the `ids` list once per path.
remove_path = [path[i] for i in ids]

In [None]:
# Drop the token column (label 0) and, when the cell is re-run after the
# "Page" column has been added, that column too. errors="ignore" keeps the
# first run from failing with a KeyError while "Page" does not exist yet.
tab = tab.drop(columns=[0, "Page"], errors="ignore")

In [None]:
# Attach to every row the path of the document it was computed from. Rows of
# `tab` follow the filtered corpus, so the paths must be picked through `ids`.
# Using all of `path` would shift the pages as soon as one short document had
# been dropped, and would add empty rows at the end.
contents = pd.Series([path[i] for i in ids], name="Page")
tab_ = pd.concat([tab, contents], axis=1)

In [None]:
# Rename the unnamed token column (label 0) to "Tokens".
# NOTE(review): an earlier cell drops column 0 from `tab`, so when the cells
# are run in order this rename likely has nothing to act on - confirm.
tab_ = tab_.rename(columns={0:"Tokens"})

In [None]:
# For every dominant topic keep the 15 rows with the highest contribution
# and stack the per-topic slices into a single table.
sent_topics_sort = pd.DataFrame()
sent_topics_outdf_grpd = tab_.groupby('Dominant_Topic')

# `i` is the group key (the topic) and is not used; `grp` holds that topic's rows.
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sort = pd.concat([sent_topics_sort, 
                                 grp.sort_values(['Perc_Contribution'], ascending=False).head(15)], 
                                 axis=0)
    
# Renumber the rows and assign display names; the assignment requires the
# table to have exactly four columns.
sent_topics_sort.reset_index(drop=True, inplace=True)
sent_topics_sort.columns = ['Topic_Num', "Topic_Perc_Contrib", "Topic_Keywords", "Page"]

# Bare expression: shows the table as the cell output.
sent_topics_sort

In [None]:
# Export the full per-document table (not the top-15 view) with a header row
# and without the index.
tab_.to_csv("/home/nicolo/Documenti/projects/wiki/topic_pages_25.csv", header=True, index=False)

In [3]:
# Reload a saved LDA model from disk.
# NOTE(review): this path is under wiki/LDA_25/, while the save cell earlier
# in this notebook writes to wiki/LDA_25.model - confirm the file was moved.
lda_model = gensim.models.LdaModel.load("/home/nicolo/Documenti/projects/wiki/LDA_25/LDA_25.model")

In [8]:
# Open the serialized corpus. This rebinds the global `corpus`, which earlier
# cells use for the list of [tokens, index] pairs.
corpus = gensim.corpora.MmCorpus("/home/nicolo/Documenti/projects/wiki/corpus/topic_corpus.mm")

In [9]:
# Load the saved dictionary; this rebinds the global `dic` built in an earlier cell.
dic = Dictionary.load("/home/nicolo/Documenti/projects/wiki/dict")

In [15]:
# Build the pyLDAvis visualisation from the reloaded model, corpus and
# dictionary, then display it. Without this assignment `vis` is undefined
# when the notebook is run top to bottom.
vis = gm.prepare(lda_model, corpus, dic)
vis

In [16]:
# Write the visualisation held in `vis` to an HTML file.
pyLDAvis.save_html(vis, '/home/nicolo/Documenti/projects/wiki/visual.html')