In [47]:
import numpy as np
import pandas as pd
import spacy
import pickle
import os
import sys

# gensim
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents


# sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Plotting tools
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim


%matplotlib inline
print('Python Version: %s' % (sys.version))

Python Version: 3.7.3 | packaged by conda-forge | (default, Mar 27 2019, 23:01:00) 
[GCC 7.3.0]


In [48]:
df = pd.read_excel('riae-bibs-2014-2018-pt.xlsx')
df.head()

Unnamed: 0,CATEGORY,TITLE,AUTHOR,JOURNAL,YEAR,VOLUME,NUMBER,PAGES,ABSTRACT,URL,ISSN,TYPE,DOI
0,ARTICLE,Observação Participante e Não Participante: Co...,"Marietto, M. L.",Revista Ibero-Americana de Estratégia,2018,17,4,5-18,OBJETIVO DO ESTUDO: Fornecer explicações sobre...,http://www.spell.org.br/documentos/ver/51414/o...,2176-0756,Journal Article,10.5585/ ijsm.v17i4.2717
1,ARTICLE,Capacidades Dinâmicas em um Ambiente de Crise:...,"Barcelos, R. and Lopes, D. P. T. and Gonçalv...",Revista Ibero-Americana de Estratégia,2018,17,4,19-37,OBJETIVO DO TRABALHO: explorar evidências sobr...,http://www.spell.org.br/documentos/ver/51415/c...,2176-0756,Journal Article,10.5585/ ijsm.v17i4.2615
2,ARTICLE,A Competitividade do Setor de Telecomunicações...,"Muylder, C. F. and Falce, J. L. and Rodrigue...",Revista Ibero-Americana de Estratégia,2018,17,4,38-54,OBJETIVO: O objetivo deste trabalho consiste e...,http://www.spell.org.br/documentos/ver/51416/a...,2176-0756,Journal Article,10.5585/ ijsm.v17i4.2619
3,ARTICLE,Um Modelo Conceitual para a Caracterização da ...,"Carmona, V. C. and Martens, C. D. P. and Fre...",Revista Ibero-Americana de Estratégia,2018,17,4,55-70,OBJETIVO DO ESTUDO: O objetivo do presente art...,http://www.spell.org.br/documentos/ver/51417/u...,2176-0756,Journal Article,10.5585/ ijsm.v17i4.2627
4,ARTICLE,O Efeito Moderador das Capabilidades da Manufa...,"Vitorino Filho, V. A. and Moori, R. G.",Revista Ibero-Americana de Estratégia,2018,17,4,71-89,OBJETIVO DO ESTUDO: o objetivo deste artigo é ...,http://www.spell.org.br/documentos/ver/51418/o...,2176-0756,Journal Article,10.5585/ ijsm.v17i4.2634


In [49]:
print(df.info())
print(df.CATEGORY.value_counts())
print(df.TYPE.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 13 columns):
CATEGORY    172 non-null object
TITLE       172 non-null object
AUTHOR      172 non-null object
JOURNAL     172 non-null object
YEAR        172 non-null int64
VOLUME      172 non-null int64
NUMBER      172 non-null int64
PAGES       170 non-null object
ABSTRACT    172 non-null object
URL         172 non-null object
ISSN        172 non-null object
TYPE        172 non-null object
DOI         160 non-null object
dtypes: int64(3), object(10)
memory usage: 17.5+ KB
None
ARTICLE    164
MISC         8
Name: CATEGORY, dtype: int64
Journal Article    164
Generic              8
Name: TYPE, dtype: int64


In [50]:
# WTF is MISC?
df[df['CATEGORY']=='MISC']

Unnamed: 0,CATEGORY,TITLE,AUTHOR,JOURNAL,YEAR,VOLUME,NUMBER,PAGES,ABSTRACT,URL,ISSN,TYPE,DOI
19,MISC,Comentário Editorial: - Normas APA para a Reda...,"Serra, F. A. R. and Ferreira, M. A. S. P. V.",Revista Ibero-Americana de Estratégia,2018,17,2,1-4,A maioria dos pesquisadores têm dificuldade co...,http://www.spell.org.br/documentos/ver/51000/c...,2176-0756,Generic,10.5585/riae.v17i2.2687
45,MISC,Resenha: - William P. Barnett e a Evolução da ...,"Ribeiro, T. L. S. and Teixeira, J. E. V.",Revista Ibero-Americana de Estratégia,2017,16,4,133-153,O livro “The Red Queen among organizations: ho...,http://www.spell.org.br/documentos/ver/47925/r...,2176-0756,Generic,10.5585/riae.v16i4.2591
63,MISC,A Behavioral Theory of the Firm: Uma Análise C...,"Bernardo, E. G. and Foresto, A. M. and Ribei...",Revista Ibero-Americana de Estratégia,2017,16,2,141-150,"O livro “A Behavioral Theory of The Firm”, rep...",http://www.spell.org.br/documentos/ver/45800/a...,2176-0756,Generic,10.5585/riae.v16i2.2545
64,MISC,Editorial: - Escolha do Periódico para Submiss...,"Serra, F. A. R. and Ferreira, M. A. S. P. V. ...",Revista Ibero-Americana de Estratégia,2017,16,1,1-7,A publicação de artigos científicos em periódi...,http://www.spell.org.br/documentos/ver/45006/e...,2176-0756,Generic,10.5585/riae.v16i1.2526
81,MISC,Resenha: - Cambridge Handbook of Strategy-as-P...,"Marietto, M. L.",Revista Ibero-Americana de Estratégia,2016,15,4,118-125,Desde o artigo seminal de Whittington (1996) p...,http://www.spell.org.br/documentos/ver/43991/r...,2176-0756,Generic,10.5585/riae.v15i4.2469
96,MISC,Resenha: - The Dynamics of Local Learning in G...,"Avrichir, I. and Araujo, B. H. and Ramiro, W.",Revista Ibero-Americana de Estratégia,2016,15,2,130-143,O livro The dynamics of local learning in glob...,http://www.spell.org.br/documentos/ver/41882/r...,2176-0756,Generic,10.5585/riae.v15i2.2346
105,MISC,Leadership B.S. – Jefrey Pfeffer e a Indústria...,"Vils, L. and Rodrigues, G. V.",Revista Ibero-Americana de Estratégia,2016,15,1,147-154,“Desenvolver a capacidade de liderança” é uma ...,http://www.spell.org.br/documentos/ver/40920/l...,2176-0756,Generic,10.5585/riae.v15i1.2840
123,MISC,Resenha: - Opción Para Ganar - (Jim Collins & ...,"Pereira, M. F. and Boa, H. M. C.",Revista Ibero-Americana de Estratégia,2015,14,3,145-157,Com o aumento das pesquisas sobre como algumas...,http://www.spell.org.br/documentos/ver/38153/r...,2176-0756,Generic,10.5585/riae.v14i3.2264


In [51]:
# remove misc - 8 
df = df[df['CATEGORY']!='MISC']
print(df.info())
print(df.CATEGORY.value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164 entries, 0 to 171
Data columns (total 13 columns):
CATEGORY    164 non-null object
TITLE       164 non-null object
AUTHOR      164 non-null object
JOURNAL     164 non-null object
YEAR        164 non-null int64
VOLUME      164 non-null int64
NUMBER      164 non-null int64
PAGES       162 non-null object
ABSTRACT    164 non-null object
URL         164 non-null object
ISSN        164 non-null object
TYPE        164 non-null object
DOI         152 non-null object
dtypes: int64(3), object(10)
memory usage: 17.9+ KB
None
ARTICLE    164
Name: CATEGORY, dtype: int64


In [52]:
df.to_excel('riae-bibs-164.xlsx', index=False)

In [24]:
nlp = spacy.load('pt_core_news_sm')
print('spaCy Version: %s' % (spacy.__version__))

spaCy Version: 2.1.4


In [25]:
# Convert to list
data = df.ABSTRACT.values.tolist()

In [26]:
# Remove Structured Abstracts
data = [s.replace('OBJETIVO DO ESTUDO: ', '') for s in data]
data = [s.replace('METODOLOGIA/ABORDAGEM: ', '') for s in data]
data = [s.replace('PRINCIPAIS RESULTADOS: ', '') for s in data]
data = [s.replace('CONTRIBUIÇÕES TEÓRICAS/METODOLÓGICAS: ','') for s in data]
data = [s.replace('RELEVÂNCIA/ORIGINALIDADE: ','') for s in data]

In [27]:
print(data[:1])

['Fornecer explicações sobre os métodos, sugerir um roteiro, não exaustivo, e exemplos de utilização dos métodos na etapa de coleta de dados para estimular a utilização dos métodos. Revisão teórica sobre os métodos; Proposta de roteiro de procedimentos para etapa de coleta de dados; Exemplos empíricos de aplicação do método. Localização contextual e longitudinal sobre a evolução do método e o emprego da TI na observação não participante; caracterização dos tipos de pesquisadores; detalhamento didático por meio de roteiro (passo a passo) para utilização dos métodos; exemplos empíricos para facilitar a compreensão de utilização dos métodos; A principal contribuição foi a proposição de um roteiro (passo a passo) para orientar e estimular os pesquisadores a utilizarem os métodos; A contribuição teórica deita-se sobre a distinção do método de observação participante sobre o pressuposto da etnografia, além da caracterização do método de observação não participante com auxilio da TI. Também, 

In [247]:
# remove punctuation and digits
import re
# punctuation and tokenize
data = [re.sub(r'[.!?\\-]','', word) for word in data] 
data = [re.sub(r',','', word) for word in data]
data = [re.sub(r';','', word) for word in data]
data = [re.sub(r'/','', word) for word in data] 
data = [re.sub(r':','', word) for word in data]
data = [re.sub(r'\(','', word) for word in data]
data = [re.sub(r'\)','', word) for word in data] 
data = [re.sub(r'-','', word) for word in data]
data = [re.sub(r'“','', word) for word in data] 
data = [re.sub(r'”','', word) for word in data]
# digits
data = [re.sub(r'[0-9]','', word) for word in data]

## Stop Words
```spaCy``` has some Stop Words

In [248]:
spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 465
First ten stop words: ['aumenta', 'nos', 'do', 'isto', 'lhe', 'des', 'dezanove', 'passo', 'pelos', 'questão']


In [7]:
nlp = spacy.load("pt_core_news_sm")

In [251]:
# Break data into documents
documents = []
for line in data:
    temp = nlp(line)
    documents.append(temp)

In [252]:
print('Length of data: %d' % len(data))
print('Length of documents: %d' % len(documents))

Length of data: 164
Length of documents: 164


In [253]:
import pickle
with open('documents', 'wb') as f: #save
    pickle.dump(documents, f)

#with open('documents', 'rb') as f: #load
#    documents = pickle.load(f)

In [23]:
# Adding custom stopwords
nlp.Defaults.stop_words.add("a")
nlp.Defaults.stop_words.add("e")
nlp.Defaults.stop_words.add("o")
nlp.Defaults.stop_words.add("ti")
nlp.Defaults.stop_words.add("(passo")
nlp.Defaults.stop_words.add("passo")
nlp.Defaults.stop_words.add("passo)")
nlp.Defaults.stop_words.add("passo a passo")
nlp.Defaults.stop_words.add("e/ou")
nlp.Defaults.stop_words.add("objetivo")
nlp.Defaults.stop_words.add("ainda")
nlp.Defaults.stop_words.add("além")
nlp.Defaults.stop_words.add("análise")
nlp.Defaults.stop_words.add("análisou")
nlp.Defaults.stop_words.add("análisamos")
nlp.Defaults.stop_words.add("analisando")
nlp.Defaults.stop_words.add("analisados")
nlp.Defaults.stop_words.add("analisado")
nlp.Defaults.stop_words.add("análises")
nlp.Defaults.stop_words.add("artigo")
nlp.Defaults.stop_words.add("abordagem")
nlp.Defaults.stop_words.add("aborda")
nlp.Defaults.stop_words.add("método")
nlp.Defaults.stop_words.add("métodos")
nlp.Defaults.stop_words.add("pesquisa")
nlp.Defaults.stop_words.add("pesquisas")
nlp.Defaults.stop_words.add("literatura")
nlp.Defaults.stop_words.add("academia")
nlp.Defaults.stop_words.add("investigação")
nlp.Defaults.stop_words.add("investiga")
nlp.Defaults.stop_words.add("amostra")
nlp.Defaults.stop_words.add("aumenta")
nlp.Defaults.stop_words.add("aumentou")
nlp.Defaults.stop_words.add("utilização")
nlp.Defaults.stop_words.add("utiliza")
nlp.Defaults.stop_words.add("estudo")
nlp.Defaults.stop_words.add("estudos")
nlp.Defaults.stop_words.add("resultado")
nlp.Defaults.stop_words.add("resultados")
nlp.Defaults.stop_words.add("apresenta")
nlp.Defaults.stop_words.add("apresentou")
nlp.Defaults.stop_words.add("apresentação")
nlp.Defaults.stop_words.add("proposta")
nlp.Defaults.stop_words.add("propostas")
nlp.Defaults.stop_words.add("diferentes")
nlp.Defaults.stop_words.add("diferem")
nlp.Defaults.stop_words.add("varia")
nlp.Defaults.stop_words.add("variou")
nlp.Defaults.stop_words.add("variam")
nlp.Defaults.stop_words.add("exemplos")
nlp.Defaults.stop_words.add("metodologia")
nlp.Defaults.stop_words.add("metodologias")
nlp.Defaults.stop_words.add("realização")
nlp.Defaults.stop_words.add("coleta")

In [24]:
print('Length of stopwords: %d' % len(nlp.Defaults.stop_words))

Length of stopwords: 465


In [3]:
removal=['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']

In [11]:
# tokenizing and removing stop words and punctiation
temp=[]
removal=['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']
for line in documents:
    tokens = [token.text for token in line if not token.is_stop and 
              token.is_alpha and len(token)>2 and 
              token.pos_ not in removal]
    temp.append(tokens)

In [12]:
print('Length of documents tokenized: %d' % len(temp))

Length of documents tokenized: 164


In [13]:
# remove common words
texts = [[word for word in document if word not in nlp.Defaults.stop_words]
         for document in temp]

In [14]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

In [15]:
import pickle
with open('texts', 'wb') as f: #save
    pickle.dump(texts, f)

In [16]:
# lemmatize with Portuguese RSLP
import nltk
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()
lemmatized_output = [[stemmer.stem(token) for token in text] for text in texts]

[nltk_data] Downloading package rslp to /home/storopoli/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [38]:
# lemmatize with Portuguese Snowball
import nltk
stemmer_snowball = nltk.stem.snowball.PortugueseStemmer()
lemmatized_output_snowball = [[stemmer_snowball.stem(token) for token in text] for text in texts]

In [41]:
# Adding lemmatized stop words
nlp.Defaults.stop_words.add("explic")
nlp.Defaults.stop_words.add("suger")
nlp.Defaults.stop_words.add("rot")
nlp.Defaults.stop_words.add("etap")
nlp.Defaults.stop_words.add("colet")
nlp.Defaults.stop_words.add("dad")
nlp.Defaults.stop_words.add("etap")
nlp.Defaults.stop_words.add("tip")
nlp.Defaults.stop_words.add("facil")
nlp.Defaults.stop_words.add("compreens")
nlp.Defaults.stop_words.add("princip")
nlp.Defaults.stop_words.add("contribu")
nlp.Defaults.stop_words.add("orient")
nlp.Defaults.stop_words.add("estimul")
nlp.Defaults.stop_words.add("pesquis")
nlp.Defaults.stop_words.add("eou")
nlp.Defaults.stop_words.add("sid")
nlp.Defaults.stop_words.add("long")
nlp.Defaults.stop_words.add("ano")
nlp.Defaults.stop_words.add("silv")
nlp.Defaults.stop_words.add("originalidaderelev")
nlp.Defaults.stop_words.add("ferraz")
nlp.Defaults.stop_words.add("set")
nlp.Defaults.stop_words.add("realiz")
nlp.Defaults.stop_words.add("cri")
nlp.Defaults.stop_words.add("metodolog")
nlp.Defaults.stop_words.add("necess")
nlp.Defaults.stop_words.add("identific")
nlp.Defaults.stop_words.add("uso")
nlp.Defaults.stop_words.add("relev")
nlp.Defaults.stop_words.add("document")
nlp.Defaults.stop_words.add("técn")
nlp.Defaults.stop_words.add("melhor")
nlp.Defaults.stop_words.add("element")
nlp.Defaults.stop_words.add("analis")
nlp.Defaults.stop_words.add("apresent")
nlp.Defaults.stop_words.add("exist")
nlp.Defaults.stop_words.add("relaç")
nlp.Defaults.stop_words.add("influenc")
nlp.Defaults.stop_words.add("influ")
nlp.Defaults.stop_words.add("qual")
nlp.Defaults.stop_words.add("vist")
nlp.Defaults.stop_words.add("colet")
nlp.Defaults.stop_words.add("atu")
nlp.Defaults.stop_words.add("trat")
nlp.Defaults.stop_words.add("red")
nlp.Defaults.stop_words.add("vis")
nlp.Defaults.stop_words.add("compreend")
nlp.Defaults.stop_words.add("há")
nlp.Defaults.stop_words.add("entend")
nlp.Defaults.stop_words.add("utiliz")
nlp.Defaults.stop_words.add("trabalh")
nlp.Defaults.stop_words.add("tem")
nlp.Defaults.stop_words.add("bas")
nlp.Defaults.stop_words.add("tem")
nlp.Defaults.stop_words.add("autor")
nlp.Defaults.stop_words.add("refer")
nlp.Defaults.stop_words.add("obje")
nlp.Defaults.stop_words.add("ger")
nlp.Defaults.stop_words.add("bibliográf")
nlp.Defaults.stop_words.add("&")
nlp.Defaults.stop_words.add("científ")
nlp.Defaults.stop_words.add("foc")
nlp.Defaults.stop_words.add("revis")
nlp.Defaults.stop_words.add("busc")
nlp.Defaults.stop_words.add("aplic")
nlp.Defaults.stop_words.add("cade")
nlp.Defaults.stop_words.add("-")
nlp.Defaults.stop_words.add("permit")
nlp.Defaults.stop_words.add("empír")
nlp.Defaults.stop_words.add("atual")
nlp.Defaults.stop_words.add("categor")
nlp.Defaults.stop_words.add("projet")
nlp.Defaults.stop_words.add("fundament")
nlp.Defaults.stop_words.add("discuss")
nlp.Defaults.stop_words.add("propost")
nlp.Defaults.stop_words.add("soc")
nlp.Defaults.stop_words.add("caracter")
nlp.Defaults.stop_words.add("centr")
nlp.Defaults.stop_words.add("detalh")
nlp.Defaults.stop_words.add("adequ")
nlp.Defaults.stop_words.add("obtenç")
nlp.Defaults.stop_words.add("áre")
nlp.Defaults.stop_words.add("soc")
nlp.Defaults.stop_words.add("port")
nlp.Defaults.stop_words.add("moment")
nlp.Defaults.stop_words.add("auxili")
nlp.Defaults.stop_words.add("respond")
nlp.Defaults.stop_words.add("levant")
nlp.Defaults.stop_words.add("lev")
nlp.Defaults.stop_words.add("realizous")
nlp.Defaults.stop_words.add("confirm")
nlp.Defaults.stop_words.add("journ")
nlp.Defaults.stop_words.add("concluis")
nlp.Defaults.stop_words.add("discut")
nlp.Defaults.stop_words.add("verific")
nlp.Defaults.stop_words.add("entr")
nlp.Defaults.stop_words.add("quest")
nlp.Defaults.stop_words.add("garant")
nlp.Defaults.stop_words.add("rsc")
nlp.Defaults.stop_words.add("descri")
nlp.Defaults.stop_words.add("mostr")
nlp.Defaults.stop_words.add("sug")
nlp.Defaults.stop_words.add("represent")
nlp.Defaults.stop_words.add("escolh")
nlp.Defaults.stop_words.add("fort")
nlp.Defaults.stop_words.add("simil")
nlp.Defaults.stop_words.add("conduz")
nlp.Defaults.stop_words.add("evid")
nlp.Defaults.stop_words.add("entant")
nlp.Defaults.stop_words.add("prim")
nlp.Defaults.stop_words.add("cort")
nlp.Defaults.stop_words.add("ecm")
nlp.Defaults.stop_words.add("avali")
nlp.Defaults.stop_words.add("pres")
nlp.Defaults.stop_words.add("propos")
nlp.Defaults.stop_words.add("possu")
nlp.Defaults.stop_words.add("evidenci")
nlp.Defaults.stop_words.add("evidenc")
nlp.Defaults.stop_words.add("iii")
nlp.Defaults.stop_words.add("import")
nlp.Defaults.stop_words.add("investig")
nlp.Defaults.stop_words.add("sej")
nlp.Defaults.stop_words.add("utilizous")
nlp.Defaults.stop_words.add("observas")
nlp.Defaults.stop_words.add("poss")
nlp.Defaults.stop_words.add("dist")
nlp.Defaults.stop_words.add("volt")
nlp.Defaults.stop_words.add("bop")
nlp.Defaults.stop_words.add("emm")
nlp.Defaults.stop_words.add("cond")
nlp.Defaults.stop_words.add("diferenç")
nlp.Defaults.stop_words.add("vari")
nlp.Defaults.stop_words.add("cen")
nlp.Defaults.stop_words.add("possibilit")
nlp.Defaults.stop_words.add("vid")
nlp.Defaults.stop_words.add("camp")
nlp.Defaults.stop_words.add("artig")
nlp.Defaults.stop_words.add("base")
nlp.Defaults.stop_words.add("obr")
nlp.Defaults.stop_words.add("traz")
nlp.Defaults.stop_words.add("livr")
nlp.Defaults.stop_words.add("cas")
nlp.Defaults.stop_words.add("ocorr")
nlp.Defaults.stop_words.add("estud")
nlp.Defaults.stop_words.add("demonstr")
nlp.Defaults.stop_words.add("únic")
nlp.Defaults.stop_words.add("elabor")
nlp.Defaults.stop_words.add("tratas")
nlp.Defaults.stop_words.add("espec")
nlp.Defaults.stop_words.add("continu")
nlp.Defaults.stop_words.add("aument")
nlp.Defaults.stop_words.add("send")
nlp.Defaults.stop_words.add("suport")
nlp.Defaults.stop_words.add("melh")
nlp.Defaults.stop_words.add("softw")
nlp.Defaults.stop_words.add("obt")
nlp.Defaults.stop_words.add("ampli")
nlp.Defaults.stop_words.add("classific")
nlp.Defaults.stop_words.add("avanç")
nlp.Defaults.stop_words.add("oei")
nlp.Defaults.stop_words.add("alt")
nlp.Defaults.stop_words.add("últ")
nlp.Defaults.stop_words.add("tom")
nlp.Defaults.stop_words.add("conclus")

In [42]:
# remove common words
lemmatized_output = [[word for word in document if word not in nlp.Defaults.stop_words]
         for document in lemmatized_output]

In [43]:
import pickle
with open('lemmatized_output', 'wb') as f: #save
    pickle.dump(lemmatized_output, f)

In [44]:
dictionary = corpora.Dictionary(lemmatized_output)
dictionary.save('dictionary.dict')  # store the dictionary, for future reference
print(dictionary)

Dictionary(1025 unique tokens: ['compet', 'context', 'didá', 'empreg', 'estratég']...)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [45]:
# get dictionary to add more stopwords
import csv
with open('dictionary.csv', 'w') as f:
    for key in dictionary.keys():
        f.write("%s,%s\n"%(key,dictionary[key]))
dictionary_df = pd.DataFrame.from_dict(dictionary, orient='index')
dictionary_df.to_excel('dictionary.xlsx')

In [39]:
dictionary_snowball = corpora.Dictionary(lemmatized_output_snowball)
dictionary_snowball.save('dictionary_snowball.dict')  # store the dictionary, for future reference
print(dictionary_snowball)

Dictionary(1263 unique tokens: ['aplic', 'caracteriz', 'central', 'colet', 'competent']...)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [40]:
# get dictionary to add more stopwords
import csv
with open('dictionary_snowball.csv', 'w') as f:
    for key in dictionary_snowball.keys():
        f.write("%s,%s\n"%(key,dictionary_snowball[key]))
dictionary_snowball_df = pd.DataFrame.from_dict(dictionary_snowball, orient='index')
dictionary_snowball_df.to_excel('dictionary_snowball.xlsx')

In [46]:
# get corpus vectors
corpus = [dictionary.doc2bow(text) for text in lemmatized_output]
corpora.MmCorpus.serialize('corpus.mm', corpus)  # store to disk, for later use
print(len(corpus))

164


## Load Everything
If you ever need to load dictionary, corpus and documents

In [2]:
import pickle
with open('documents', 'rb') as f: #load
    documents = pickle.load(f)
with open('stoplist', 'rb') as f: #load
    stoplist = pickle.load(f)
with open('texts', 'rb') as f: #load
    texts = pickle.load(f)
with open('lemmatized_output', 'rb') as f: #load
    lemmatized_output = pickle.load(f)

dictionary = gensim.corpora.Dictionary.load('dictionary.dict')
corpus = gensim.corpora.MmCorpus('corpus.mm')

print(dictionary)
print(corpus)
print(len(documents))
print(len(lemmatized_output))
print(len(stoplist))

Dictionary(1252 unique tokens: ['aplic', 'caracter', 'centr', 'colet', 'compet']...)
MmCorpus(164 documents, 1252 features, 8582 non-zero entries)
164
164
456


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [62]:
temppp = []
for i in lemmatized_output:
    temp2 = ''
    for line in i:
        temp2 = temp2 + line
        temp2 = temp2 + ' '
    temppp.append(temp2)

In [64]:
len(temppp)

164

In [71]:
np.savetxt("export_texts_lemmatize.csv", temppp, delimiter=",", fmt='%s', header='documents')