<a href="https://colab.research.google.com/github/cbsobral/python/blob/master/topic_nltk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data

In [None]:
import nltk
from nltk.corpus import PlaintextCorpusReader
url = (r'/content/drive/My Drive/data/')
corpus_list = PlaintextCorpusReader(url, '.*txt')  # all files ending in 'txt'

file_ids = corpus_list.fileids()

## LDA


### Model

In [None]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import gensim
import pandas as pd
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


wordnet = nltk.WordNetLemmatizer()

stoplist = stopwords.words('english')
additional_stopwords = """question impact professor school dissertation paper take following http nuffield
                          title school session study work topics project partner practice happy plan see supervise
                          research thesis issue design student topic supervision university lab mia mpp"""  # define additional stopwords in a string
stoplist += additional_stopwords.split()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def normalize_token(token):
    """
    Convert token to lowercase, and stem using the Porter algorithm.
    """
    return wordnet.lemmatize(token.lower())

def filter_token(token):
    """
    Evaluate whether or not to retain ``token``.
    """
    token = token.lower()
    return token not in stoplist and token.isalpha() and len(token) > 2

In [None]:
documents=[[normalize_token(token) 
            for token in corpus_list.words(fileids=[fileid])
            if filter_token(token)]
            for fileid in corpus_list.fileids()]

dictionary = gensim.corpora.Dictionary(documents)         
documents_bow = [dictionary.doc2bow(document) for document in documents]

In [None]:
model = gensim.models.LdaModel(documents_bow, 
                               id2word=dictionary,
                               num_topics=15, 
                               update_every=0,
                               random_state=123,
                               passes=500)

In [None]:
for i, topic in enumerate(model.print_topics(num_topics=8, num_words=7)):
    print (i, ':', topic)

0 : (6, '0.031*"health" + 0.013*"data" + 0.010*"social" + 0.010*"http" + 0.009*"inequality" + 0.008*"org" + 0.008*"policy"')
1 : (1, '0.020*"electricity" + 0.019*"market" + 0.018*"energy" + 0.014*"review" + 0.013*"system" + 0.011*"generation" + 0.010*"emission"')
2 : (0, '0.000*"partisan" + 0.000*"observational" + 0.000*"polish" + 0.000*"persuading" + 0.000*"postdoctoral" + 0.000*"motivate" + 0.000*"postwar"')
3 : (12, '0.030*"law" + 0.018*"right" + 0.014*"international" + 0.014*"human" + 0.012*"european" + 0.010*"legal" + 0.009*"migration"')
4 : (7, '0.000*"partisan" + 0.000*"observational" + 0.000*"polish" + 0.000*"persuading" + 0.000*"postdoctoral" + 0.000*"motivate" + 0.000*"postwar"')
5 : (11, '0.033*"public" + 0.020*"management" + 0.014*"innovation" + 0.012*"government" + 0.012*"social" + 0.012*"sector" + 0.010*"administration"')
6 : (5, '0.016*"policy" + 0.016*"political" + 0.014*"analysis" + 0.012*"social" + 0.009*"data" + 0.009*"comparative" + 0.008*"governance"')
7 : (13, '0.

### LDA Comparison

In [None]:
#@title Add Text { run: "auto", vertical-output: true }

yr_text = "Family Policy, Health Policy, Education Policy, Social Inequality, Child Outcomes, Poverty 2.\tWhat is your research question?  What are the effects of social inequalities on child outcomes? What are the effects of social inequalities on child development? What are the effects of social inequalities on child well-being? To define the specific question the definition of \"child outcomes\", \"child development\" and child \"well-being\" still need to be defined. 3.\tWhy would you like to explore this topic and question? Please let us know why you are curious about your research topic, and why you think your question(s) are worth exploring. I have always been interested in social policy, especially topics related to Child, Youth, and Family Policies. In the past years, I worked with public education and early childhood development in Brazil. Now, I want to expand my view beyond the education area but still analyze the current situation of marginalized children and teenagers. However, I am still not sure if this will be related to the Brazilian or an International context. " #@param {type:"string"}



In [None]:
#@title Add Document
path = "/content/drive/My Drive/docs/ana.txt" #@param {type:"string"}
yr_p = open(path)
yr_path = yr_p.read()

In [None]:
yr_tokens = nltk.word_tokenize(yr_path)
yr_bow_vector = dictionary.doc2bow(yr_tokens)

In [None]:
# model comparison result
#print(model[yr_bow_vector])

# pd data frame
results = pd.DataFrame(model[yr_bow_vector])
results.columns = ['topic', 'proximity']
results.sort_values(['proximity'], ascending=False, inplace=True)
print(results.nlargest(3,['proximity']))


   topic  proximity
6     10   0.273964
8     12   0.192131
7     11   0.156727


In [None]:
# table with documents and topic probability
topics = [model[documents_bow[i]] for i in range(len(documents))]
num_topics = 8

def topics_document_to_dataframe(topics_document, num_topics):
    res = pd.DataFrame(columns=range(num_topics))
    for topic_weight in topics_document:
        res.loc[0, topic_weight[0]] = topic_weight[1]
    return res

document_topic = \
pd.concat([topics_document_to_dataframe(topics_document, num_topics=num_topics) for topics_document in topics]) \
  .reset_index(drop=True).fillna(0)

#document_topic

In [None]:
document_topic.sort_values(5, ascending=False)[5].head(10)

8     0.997619
17    0.996725
0     0.996631
7     0.996354
24    0.996028
14    0.994900
11    0.989630
22    0.927352
13    0.263976
27    0.242323
Name: 5, dtype: float64

In [None]:
documents_lda = model[documents_bow]

topic_dt = pd.DataFrame(documents_lda)
doc_dt = pd.DataFrame(file_ids)
conc = pd.concat([doc_dt, topic_dt], axis=1)
conc.columns = ['doc_id', 'topic1', 'topic2', 'topic3']
conc.sort_values(['topic1'], ascending=True, inplace=True)
conc

Unnamed: 0,doc_id,topic1,topic2,topic3
12,Hirth.txt,"(1, 0.9987719)",,
27,Snower.txt,"(3, 0.7528624)","(5, 0.24232279)",
13,Hustedt.txt,"(5, 0.26397434)","(10, 0.4501205)","(11, 0.2823809)"
22,Munzert.txt,"(5, 0.9273515)","(10, 0.070632994)",
11,Hassel.txt,"(5, 0.9896296)",,
14,Iacovone.txt,"(5, 0.9948998)",,
24,Reh.txt,"(5, 0.99602836)",,
7,GohdesHW.txt,"(5, 0.99635416)",,
0,Anheier.txt,"(5, 0.9966306)",,
17,Kayser.txt,"(5, 0.9967252)",,
