## Goal here is to pre-process my abstracts text for topic modeling ##

In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import urllib
import time
import feedparser
%matplotlib inline

from gensim import corpora, models, similarities
from gensim.matutils import Sparse2Corpus
from gensim.parsing.preprocessing import STOPWORDS
from time import time

import pickle


In [6]:
# Gonna just import the whole batch for now, b/c it's small enough and streaming isn't necessary
abstracts = pd.read_csv('./new_hope_data/arxiv_csML.csv')['summary']

In [12]:
stopfree= [[word for word in doc.lower().split(' ') if word not in STOPWORDS ] for doc in abstracts[:5]]

stopfree

[['discuss',
  'algorithms',
  'estimating',
  'shannon',
  'entropy',
  'h',
  'finite',
  'symbol',
  'sequences',
  'long',
  'range',
  'correlations.',
  'particular,',
  'consider',
  'algorithms',
  'estimate',
  'h',
  'code',
  'lengths',
  'produced',
  'compression',
  'algorithm.',
  'describing',
  'convergence',
  'sequence',
  'length,',
  'assuming',
  'limits',
  'space',
  'time',
  'complexities',
  'compression',
  'algorithms.',
  'scaling',
  'law',
  'proposed',
  'extrapolation',
  'finite',
  'sample',
  'lengths.',
  'applied',
  'sequences',
  'dynamical',
  'systems',
  'non-trivial',
  'chaotic',
  'regimes,',
  '1-d',
  'cellular',
  'automaton,',
  'written',
  'english',
  'texts.'],
 ['designing',
  'photometric',
  'best',
  'fulfil',
  'set',
  'scientific',
  'goals',
  'complex',
  'task,',
  'demanding',
  'compromise',
  'conflicting',
  'requirements',
  'subject',
  'constraints.',
  'specific',
  'example',
  'determination',
  'stellar',
  'as

In [16]:
# from gensim.corpora.textcorpus.TextCorpus import get_texts
# import gensim.copora.textcorpus as tc
import gensim.corpora.textcorpus as tc

In [28]:
import spacy
# spacy.load('en')
from spacy.lang.en import English
parser = English()

In [30]:
test = abstracts[0]
parsed_data = parser(test)

In [43]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/omar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [45]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/omar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
def prep_text(text):
    myreg=r'\$.+\$|\[.+\]|\(.+\, *\d{2,4}\w*\)' #this removes LATEX formatting as well as (most) citations
    parsed_data = parser(re.sub(myreg, '', text))
    tokens = [str(token).lower() for token in parsed_data if len(token) > 3]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [129]:
monster_toke = [prep_text(abstract) for abstract in abstracts] # Oh god I hope my comp doesn't freak

In [130]:
dictionary = corpora.Dictionary(monster_toke)

In [131]:
dictionary.save("./new_hope_data/monster_toke.dict")

In [132]:
print(dictionary.token2id)



In [135]:
corpus = [dictionary.doc2bow(text) for text in monster_toke]
corpora.MmCorpus.serialize('./new_hope_data/monster_toke_corp.mm', corpus)

## Ok that's good for now. We have our dictionary and corpus. It's not perfect yet, but eh..meh. we'll see ##

In [2]:
# Load the data from it's saved form
dictionary = corpora.Dictionary.load('./new_hope_data/monster_toke.dict')
corpus = corpora.MmCorpus('./new_hope_data/monster_toke_corp.mm')

In [None]:
# Initialize a model, TFIDF
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

In [7]:
tfidf_corpus = tfidf[corpus]

In [11]:
# Initialize LSI based on TFIDF
lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=4) # 10 for now
corpus_lsi = lsi[tfidf_corpus]

In [12]:
lsi.print_topics(4)
# These topics are horrible!

[(0,
  '0.138*"model" + 0.129*"network" + 0.123*"algorithm" + 0.117*"data" + 0.117*"method" + 0.114*"learning" + 0.102*"problem" + 0.096*"function" + 0.092*"graph" + 0.091*"feature"'),
 (1,
  '-0.268*"network" + 0.213*"matrix" + -0.193*"deep" + -0.187*"neural" + 0.158*"convex" + -0.141*"training" + 0.138*"kernel" + -0.134*"image" + -0.133*"adversarial" + -0.125*"task"'),
 (2,
  '0.499*"graph" + 0.246*"clustering" + -0.240*"gradient" + -0.169*"policy" + 0.151*"kernel" + -0.141*"optimization" + 0.139*"cluster" + -0.138*"stochastic" + -0.129*"convex" + 0.122*"node"'),
 (3,
  '0.271*"inference" + -0.258*"graph" + 0.223*"variational" + 0.212*"bayesian" + -0.206*"matrix" + 0.193*"distribution" + 0.185*"variable" + 0.185*"posterior" + 0.181*"model" + 0.175*"latent"')]

In [15]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
corpus_lda = lda[corpus]
lda.print_topics(2)

[(0,
  '0.018*"model" + 0.015*"learning" + 0.013*"network" + 0.011*"data" + 0.009*"method" + 0.008*"propose" + 0.007*"neural" + 0.007*"approach" + 0.007*"deep" + 0.007*"base"'),
 (1,
  '0.015*"algorithm" + 0.014*"method" + 0.012*"problem" + 0.011*"data" + 0.009*"propose" + 0.007*"show" + 0.007*"model" + 0.007*"function" + 0.006*"result" + 0.006*"learning"')]

In [14]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp= hdp[corpus]
hdp.print_topics()

[(0,
  '0.015*model + 0.012*data + 0.011*method + 0.010*learning + 0.010*algorithm + 0.008*propose + 0.008*problem + 0.007*network + 0.006*show + 0.006*base'),
 (1,
  '0.015*model + 0.012*data + 0.012*learning + 0.010*method + 0.009*network + 0.008*algorithm + 0.008*propose + 0.007*problem + 0.006*show + 0.006*base'),
 (2,
  '0.009*model + 0.008*algorithm + 0.007*method + 0.007*data + 0.006*learning + 0.005*propose + 0.005*problem + 0.005*network + 0.005*base + 0.004*show'),
 (3,
  '0.009*model + 0.006*data + 0.005*method + 0.005*algorithm + 0.004*problem + 0.004*learning + 0.004*network + 0.004*propose + 0.003*show + 0.003*result'),
 (4,
  '0.004*model + 0.003*algorithm + 0.003*method + 0.003*data + 0.002*result + 0.002*base + 0.002*approach + 0.002*problem + 0.002*show + 0.002*propose'),
 (5,
  '0.003*model + 0.003*algorithm + 0.002*data + 0.002*learning + 0.001*result + 0.001*show + 0.001*problem + 0.001*base + 0.001*matrix + 0.001*approach'),
 (6,
  '0.002*method + 0.002*network + 

In [6]:
# Going to follow along with https://github.com/bhargavvader/personal/tree/master/notebooks/text_analysis_tutorial
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings

In [10]:
# Gonna just import the whole batch for now, b/c it's small enough and streaming isn't necessary
abstracts = pd.read_csv('./new_hope_data/arxiv_csML.csv')['summary']

In [15]:
import spacy
nlp = spacy.load('en') #en

In [17]:
preproc_abstracts = [[w.lemma_ for w in nlp(abstract) if not w.is_stop and not w.is_punct and not w.like_num] for abstract in abstracts]
# for abstract in abstracts:
#     curr_abst = nlp(abstract)
#     new_abst = [[w.lemma_ for w in nlp(abstract) if not w.is_stop and not w.is_punct and not w.like_num] for abstract in abstracts]

In [19]:
with open('./new_hope_data/preproc_abstracts', 'wb') as fp:
    pickle.dump(preproc_abstracts, fp)


In [3]:
with open ('./new_hope_data/preproc_abstracts', 'rb') as fp:
    corpus = pickle.load(fp)

In [8]:
bigrams = gensim.models.Phrases(corpus)

In [10]:
texts = [bigrams[line] for line in corpus]



In [12]:
dictionary = Dictionary[texts]
corpus = 

['-PRON-',
 'discuss',
 'algorithm',
 'estimate',
 'shannon_entropy',
 'h',
 'finite',
 'symbol_sequence',
 'long_range',
 'correlation',
 'in_particular',
 'consider',
 'algorithm',
 'estimate',
 'h',
 'code',
 'length',
 'produce',
 'compression',
 'algorithm',
 '-PRON-',
 'interest',
 'describe',
 'convergence',
 'sequence',
 'length',
 'assume',
 'limit',
 'space',
 'time',
 'complexity',
 'compression',
 'algorithm',
 'a',
 'scale',
 'law',
 'propose',
 'extrapolation',
 'finite_sample',
 'length',
 'this',
 'apply',
 'sequence',
 'dynamical_system',
 'non_trivial',
 'chaotic',
 'regime',
 '1-d',
 'cellular',
 'automaton',
 'write',
 'english',
 'text']

document to topic weights
#of documents in topic
df of top documents within a given topic
list of top tokens within topic
Click document -> highlight topic tokens 
Set high k for lda