### Topics modeling test

In [1]:
import pandas as pd
import nltk

In [2]:
def cleanup(df):
    category_cols = ['abstract','acm_class','arxiv_id','author_text','categories',
                 'comments','created','doi','num_authors','num_categories','primary_cat',
                 'title','updated']
    df[['num_authors', 'num_categories']] = df[['num_authors', 'num_categories']].apply(pd.to_numeric)
    df[['created','updated']] =df[['created', 'updated']].apply(pd.to_datetime)
    return df

In [3]:
df = pd.read_table("processed_data/20200101/per_category/cs.AI.tsv.xz", index_col=0)

In [4]:
df = cleanup(df)

In [5]:
df['year'] = df['created'].dt.year

In [6]:
df.head()

Unnamed: 0,abstract,acm_class,arxiv_id,author_text,categories,comments,created,doi,num_authors,num_categories,primary_cat,title,updated,categories_list,year
1236770,Because of their occasional need to return to ...,,cs/9308101,M. L. Ginsberg,cs.AI,See http://www.jair.org/ for an online appendi...,1993-07-31,,1,1,cs.AI,Dynamic Backtracking,NaT,['cs.AI'],1993
1236771,Market price systems constitute a well-underst...,,cs/9308102,M. P. Wellman,cs.AI,See http://www.jair.org/ for any accompanying ...,1993-07-31,,1,1,cs.AI,A Market-Oriented Programming Environment and ...,NaT,['cs.AI'],1993
1236772,We describe an extensive study of search in GS...,,cs/9309101,"I. P. Gent, T. Walsh",cs.AI,See http://www.jair.org/ for any accompanying ...,1993-08-31,,2,1,cs.AI,An Empirical Analysis of Search in GSAT,NaT,['cs.AI'],1993
1236773,As real logic programmers normally use cut (!)...,,cs/9311101,"F. Bergadano, D. Gunetti, U. Trinchero",cs.AI,See http://www.jair.org/ for any accompanying ...,1993-10-31,,3,1,cs.AI,The Difficulties of Learning Logic Programs wi...,NaT,['cs.AI'],1993
1236774,To support the goal of allowing users to recor...,,cs/9311102,"J. C. Schlimmer, L. A. Hermens",cs.AI,See http://www.jair.org/ for an online appendi...,1993-10-31,,2,1,cs.AI,Software Agents: Completing Patterns and Const...,NaT,['cs.AI'],1993


In [7]:
from nltk.corpus import stopwords

In [8]:
import nltk
from nltk import word_tokenize

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrinal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/mrinal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
stop_words = stopwords.words('english')

In [12]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [13]:
from collections import Counter, defaultdict

In [14]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora"))

rocks : rock
corpora : corpus


## Generate frequency counts from abstracts

In [39]:
def process_abstract(stream):
    return [lemmatizer.lemmatize(word.lower()) for word in stream if word.isalnum()]

In [40]:
import itertools

In [41]:
df['abstract'].iteritems()

<zip at 0x7fbd44a7ec30>

In [42]:
stream_1 = (word_tokenize(txt) for _, txt in df['abstract'].iteritems())

In [43]:
stream = itertools.chain.from_iterable(stream_1)

In [44]:
lemma_stream = (lemmatizer.lemmatize(word.lower()) for word in stream if word.isalnum())

In [45]:
# for w in lemma_stream:
#     h[w] += 1

In [46]:
c = Counter(lemma_stream)

In [47]:
c['random']

1362

In [48]:
#h = defaultdict(int)

In [49]:
import json

In [50]:
with open("dictionary.json", "w") as f:
    json.dump(c, f)

In [51]:
encoding = {w: i for i, w in enumerate(c)}

In [52]:
with open("encoding.json", "w") as f:
    json.dump(encoding, f)

In [53]:
word_counts_df = pd.DataFrame.from_dict(c, orient='index', columns=['count']).sort_values('count', ascending=False)

In [60]:
word_counts_df_1 = word_counts_df.loc[~word_counts_df.index.isin(stop_words),:]

In [61]:
word_counts_df_1.reset_index().to_csv("word_counts.csv", index=False)

In [52]:
'the' in stop_words

True

## Topic modeling with GenSim

In [16]:
def lemmatize(doc):
    return [lemmatizer.lemmatize(word.lower()) for word in doc]

In [17]:
def filter_stop_words(doc):
    return [word for word in doc if word not in stop_words]

In [18]:
def filter_short_words(doc):
    return [word for word in doc if len(word) > 4]

In [19]:
docs_test = df['abstract'].iloc[:10]

In [20]:
docs_all = (txt for _, txt in df['abstract'].iteritems())

In [21]:
df_1998 = df.loc[df['year'] == 1998,:]

In [22]:
df_2018 = df.loc[df['year'] == 2018,:]

In [23]:
docs_1998 = (txt for _, txt in df_1998['abstract'].iteritems())

In [24]:
docs_2018 = (txt for _, txt in df_2018['abstract'].iteritems())

In [25]:
docs=docs_all

In [26]:
def process_docs(docs):
    corpus = list(map(filter_stop_words, map(filter_short_words, map(lemmatize, map(word_tokenize, docs)))))
    return corpus

In [27]:
corpus_1998 = process_docs(docs_1998)

In [28]:
corpus_2018 = process_docs(docs_2018)

In [30]:
corpus = process_docs(docs_all)

In [31]:
from gensim import corpora
dictionary = corpora.Dictionary(corpus)
corpus_bow = [dictionary.doc2bow(text) for text in corpus]

In [37]:
dict(dictionary)

{0: 'approach',
 1: 'avoiding',
 2: 'backtrack',
 3: 'backtracking',
 4: 'completeness',
 5: 'control',
 6: 'deeper',
 7: 'dependency-directed',
 8: 'developed',
 9: 'difficulty',
 10: 'earlier',
 11: 'erase',
 12: 'existing',
 13: 'guarantee',
 14: 'information',
 15: 'meaningful',
 16: 'method',
 17: 'moved',
 18: 'occasional',
 19: 'paper',
 20: 'point',
 21: 'polynomial',
 22: 'present',
 23: 'problem',
 24: 'progress',
 25: 'provided',
 26: 'providing',
 27: 'retaining',
 28: 'return',
 29: 'search',
 30: 'shallow',
 31: 'solving',
 32: 'sometimes',
 33: 'space',
 34: 'still',
 35: 'technique',
 36: 'thereby',
 37: 'toward',
 38: 'useful',
 39: 'variant',
 40: 'according',
 41: 'activity',
 42: 'agent',
 43: 'allocation',
 44: 'analyzed',
 45: 'artificial',
 46: 'basic',
 47: 'behavior',
 48: 'careful',
 49: 'certain',
 50: 'class',
 51: 'communication',
 52: 'competitive',
 53: 'computational',
 54: 'computing',
 55: 'condition',
 56: 'constitute',
 57: 'construct',
 58: 'constru

In [32]:
import pickle
pickle.dump(corpus_bow, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [144]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus_bow, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [147]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.019*"model" + 0.017*"learning" + 0.011*"network" + 0.009*"method" + 0.009*"system"')
(1, '0.022*"learning" + 0.017*"algorithm" + 0.015*"method" + 0.015*"policy" + 0.014*"model"')
(2, '0.033*"problem" + 0.028*"algorithm" + 0.015*"search" + 0.012*"solution" + 0.010*"constraint"')
(3, '0.019*"knowledge" + 0.018*"language" + 0.015*"model" + 0.012*"graph" + 0.012*"logic"')
(4, '0.019*"system" + 0.011*"decision" + 0.010*"agent" + 0.010*"paper" + 0.010*"intelligence"')


In [168]:
from gensim import corpora
dictionary_1998 = corpora.Dictionary(corpus_1998)
corpus_bow_1998 = [dictionary.doc2bow(text) for text in corpus_1998]

In [169]:
import pickle
pickle.dump(corpus_bow_1998, open('corpus_1998.pkl', 'wb'))
dictionary_1998.save('dictionary_1998.gensim')

In [171]:
import gensim
NUM_TOPICS = 5
ldamodel_1998 = gensim.models.ldamodel.LdaModel(corpus_bow_1998, num_topics = NUM_TOPICS, id2word=dictionary_1998, passes=15)
ldamodel_1998.save('model5_1998.gensim')

In [172]:
topics_1998 = ldamodel.print_topics(num_words=5)
for topic in topics_1998:
    print(topic)

(0, '0.014*"control" + 0.014*"daydreaming" + 0.009*"chess" + 0.007*"strategy" + 0.007*"problem"')
(1, '0.023*"logic" + 0.022*"problem" + 0.017*"model" + 0.013*"programming" + 0.013*"macro"')
(2, '0.017*"structure" + 0.014*"approach" + 0.014*"number" + 0.012*"algorithm" + 0.012*"provide"')
(3, '0.039*"theory" + 0.019*"problem" + 0.013*"algorithm" + 0.013*"domain" + 0.013*"patching"')
(4, '0.015*"query" + 0.012*"domain" + 0.010*"system" + 0.010*"language" + 0.009*"agent"')


In [173]:
from gensim import corpora
dictionary_2018 = corpora.Dictionary(corpus_2018)
corpus_bow_2018 = [dictionary.doc2bow(text) for text in corpus_2018]

In [174]:
import pickle
pickle.dump(corpus_bow_2018, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [175]:
import gensim
NUM_TOPICS = 5
ldamodel_2018 = gensim.models.ldamodel.LdaModel(corpus_bow_2018, num_topics = NUM_TOPICS, id2word=dictionary_2018, passes=15)
ldamodel_2018.save('model5_2018.gensim')

In [176]:
topics_2018 = ldamodel_2018.print_topics(num_words=5)
for topic in topics_2018:
    print(topic)

(0, '0.047*"better" + 0.040*"great" + 0.020*"include" + 0.017*"accurate" + 0.016*"domain"')
(1, '0.021*"operate" + 0.014*"examine" + 0.011*"opposed" + 0.008*"produce" + 0.008*"\'contrastive"')
(2, '0.048*"performance" + 0.025*"network" + 0.021*"speech" + 0.016*"assessment" + 0.014*"tested"')
(3, '0.066*"network" + 0.057*"memory" + 0.034*"double-q" + 0.030*"symbolic" + 0.025*"mechanism"')
(4, '0.078*"alternative" + 0.032*"present" + 0.028*"based" + 0.025*"hundred" + 0.025*"construct"')
