### Topic Modeling per Grazia

In questo notebook ci sono delle utility che ho usato per costruire il modellino. Nella repo, invece, trovi:
- nodes-with... : il tuo csv
- mallet-2.0.8: una versione del modello LDA modificata.
- wiki_txt: i testi (in utf-8 encoding) delle pagine che mi hai indicato
- links.json: un json (dizionario) chiave-valore in cui la chiave è il titolo della pagina e il valore è la lista dei link presenti nella pagina
- bigram_model.pkl: un phraser (custom sul corpus in questione) che considera i bigrammi
- trigram_model.pkl: un phraser (custom sul corpus in questione) che considera i trigrammi
- wikitdm: ho modificato il progetto di un tizio che permetteva, a partire da una directory di file txt, di creare direttamente una matrice tf-idf, ma poi non l'ho usato perché è troppo basilare. Lo aggiungo per completezza.

In [1]:
import pandas as pd
import wikipediaapi, os
import gensim

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from pyLDAvis import gensim as gm
from gensim.corpora import TextCorpus, Dictionary
from gensim.models.phrases import Phraser, Phrases
from gensim.models import TfidfModel
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Directory that holds the UTF-8 ".txt" dump of each Wikipedia page.
save_dir = "/home/nicolo/Documenti/projects/wiki/wiki_txt/"

# NOTE(review): presumably section titles meant to be stripped from the pages;
# no code visible in this notebook reads this list — confirm it is still needed.
remove_title = ["See also", 
                "References", 
                "Further reading", 
                "External links", 
                "History", 
                "Critiques",
                "Notes",
                "Publications",
                "Controversies"]

stop_words = set(stopwords.words('english'))
# Tokens are maximal runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# indices derived from this list stable across runs and machines.
path = sorted(
    os.path.join(save_dir, el)
    for el in os.listdir(save_dir)
    if el.endswith(".txt")
)
# Launcher script of the local Mallet installation used by the LDA wrapper.
mallet_path = "/home/nicolo/Scaricati/mallet-2.0.8/bin/mallet"

In [3]:
# Load the node table and keep the values of its "name" column as a plain list.
# NOTE(review): presumably these are the Wikipedia page titles to download — confirm.
wiki_data = pd.read_csv(
    "/home/nicolo/Documenti/projects/wiki/nodes-with-modularity-02122019csv.csv"
)
wiki_data_names = list(wiki_data["name"])

In [4]:
# Wikipedia client for the English edition, using the ExtractFormat.WIKI text format.
# NOTE(review): recent wikipedia-api releases appear to require a user_agent
# argument — confirm against the installed version.
wiki = wikipediaapi.Wikipedia(language="en", extract_format=wikipediaapi.ExtractFormat.WIKI)

In [18]:
def extract_text_from_pages(list_of_names:list, wiki_object, save_dir:str):
    """Save the text of each requested page to disk and collect its links.

    For every title in ``list_of_names`` the page is fetched through
    ``wiki_object.page``; its text is written, UTF-8 encoded, to
    ``<save_dir>/<title>.txt`` (spaces and slashes in the title are replaced
    by underscores) and its outgoing links are recorded.

    Args:
        list_of_names: Page titles to fetch.
        wiki_object: Object exposing ``page(title)``; the returned page must
            provide ``title``, ``text`` and a ``links`` mapping keyed by
            link title.
        save_dir: Existing directory where the ``.txt`` files are written.

    Returns:
        A ``(links_by_title, fail)`` tuple. ``links_by_title`` maps the title
        of every successfully processed page to the list of its link titles
        that do not contain ``":"``. ``fail`` lists the requested names whose
        processing raised an exception.
    """
    wiki_json_data, fail = {}, []

    for wiki_page in list_of_names:
        try:
            page = wiki_object.page(wiki_page)
            format_name = page.title.replace(" ", "_").replace("/", "_") + ".txt"
            # Titles containing ":" are skipped (namespaced links such as
            # "File:..." contain one).
            links = [link for link in page.links if ":" not in link]
            with open(os.path.join(save_dir, format_name), "w", encoding="utf-8") as wiki_txt:
                wiki_txt.write(page.text)
        except Exception as e:
            # Deliberate best-effort: one failing page must not abort the
            # whole batch; the failure is reported and returned to the caller.
            fail.append(wiki_page)
            print("error {} in {}".format(e, wiki_page))
        else:
            # One entry per page, so earlier pages are not overwritten.
            wiki_json_data[page.title] = links

    return wiki_json_data, fail

def get_token(text_path: str, tokenizer, stop_words):
    """Read a UTF-8 text file and return its cleaned, lower-cased tokens.

    Args:
        text_path: Path of the text file to read.
        tokenizer: Object exposing ``tokenize(text)`` that returns the raw
            tokens of a string.
        stop_words: Container of tokens to discard. Assumed to hold
            lower-case entries (as NLTK's English list does) — confirm if a
            different list is supplied.

    Returns:
        The lower-cased tokens, in document order, that are at least three
        characters long, are not made only of digits and are not stop words.
    """
    with open(text_path, encoding="utf-8") as file:
        text = file.read()

    # Lower-case BEFORE the stop-word test, otherwise capitalised stop words
    # ("The", "And") slip through and end up in the output as "the", "and".
    lowered = (tok.lower() for tok in tokenizer.tokenize(text))
    return [
        tok
        for tok in lowered
        if len(tok) >= 3 and not tok.isdigit() and tok not in stop_words
    ]

def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Train one LDA Mallet model per candidate topic count and score each one.

    A model is trained for every ``num_topics`` in ``range(start, limit, step)``
    and its ``u_mass`` coherence is computed.

    Args:
        dictionary: gensim ``Dictionary`` used both as the model's ``id2word``
            and for the coherence computation.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised documents handed to ``CoherenceModel``; they should
            be the same token lists ``dictionary`` was built from.
        limit: Exclusive upper bound of the topic counts.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, in the same order as the topic counts.

    Note:
        The Mallet launcher is read from the module-level ``mallet_path``.
        NOTE(review): ``gensim.models.wrappers`` appears to have been removed
        in gensim 4.x — confirm the pinned gensim version.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # id2word must be the dictionary received as argument so that training
        # and coherence scoring share the same vocabulary.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [6]:
# Tokenise every page dump: docs[i] holds the tokens of the file path[i].
docs = [
    get_token(txt_file, tokenizer, stop_words)
    for txt_file in path
]

In [7]:
# Load the phraser stored in trigram_model.pkl.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# that come from a trusted source.
trigrams = Phraser.load("/home/nicolo/Documenti/projects/wiki/trigram_model.pkl")

In [8]:
# Keep only documents with at least 20 tokens; each entry pairs the
# phraser-transformed tokens with the document's position in `docs`.
corpus = [
    [trigrams[tokens], doc_idx]
    for doc_idx, tokens in enumerate(docs)
    if len(tokens) >= 20
]

In [15]:
# Drop the document index and keep only the token sequences.
corpus_tok = [tokens for tokens, _ in corpus]

In [17]:
# Vocabulary built from the filtered token sequences, then one
# bag-of-words vector per document.
dic = Dictionary(corpus_tok)
corpus_ = list(map(dic.doc2bow, corpus_tok))

In [None]:
# Candidate topic counts are range(5, 20, 5), i.e. 5, 10 and 15.
# `texts` must be the token lists that `dic` and `corpus_` were built from
# (corpus_tok: phrase-merged and length-filtered), not the raw `docs`.
model_list, coherence_values = compute_coherence_values(
    dictionary=dic, corpus=corpus_, texts=corpus_tok, start=5, limit=20, step=5
)

In [70]:
# Index 2 is the third candidate; with start=5, limit=20, step=5 that is the
# 15-topic model. The index is hard-coded, not derived from coherence_values.
optimal_model = model_list[2]
#model_topics = optimal_model.show_topics(formatted=False)
#print(optimal_model.print_topics(num_words=10))

In [81]:
# Convert the selected Mallet model into a native gensim LDA model.
# Reuses `optimal_model` so the choice of model lives in a single place.
model_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(optimal_model)

In [None]:
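#import pyLDAvis  # needed for enable_notebook(); only pyLDAvis.gensim is imported above
#pyLDAvis.enable_notebook()
#vis = gm.prepare(model_lda, corpus_, dic)  # corpus_ is the bag-of-words corpus
#vis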