In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [4]:
# Restrict the corpus to four newsgroups.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Parts of each post handed to the loader's `remove` argument.
remove = ('headers', 'footers', 'quotes')
# Load the train and test subsets with identical category/remove settings.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Import the stop-word set from its public location: the private
# `sklearn.feature_extraction.stop_words` module was deprecated and later
# removed. This also avoids binding the name `stop_words` to a module, a name
# that a later cell reuses for the NLTK stop-word list.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Peek at the first 20 stop words in alphabetical order.
sorted(ENGLISH_STOP_WORDS)[:20]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

In [6]:
from nltk import stem

In [7]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [8]:
word_list = ['feet', 'foot', 'foots', 'footing']

In [9]:
[wnl.lemmatize(word) for word in word_list]

['foot', 'foot', 'foot', 'footing']

In [10]:
[porter.stem(word) for word in word_list]

['feet', 'foot', 'foot', 'foot']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [12]:
import nltk

In [13]:
vectorizer = CountVectorizer(stop_words='english')

In [15]:
# Build the dense document-term count matrix. `.toarray()` yields a plain
# ndarray, whereas `.todense()` returns the legacy `np.matrix` type.
vectors = vectorizer.fit_transform(newsgroups_train.data).toarray()
vectors.shape

(2034, 26576)

In [16]:
print(len(newsgroups_train.data), vectors.shape)


2034 (2034, 26576)


In [17]:
# `get_feature_names()` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out()`; use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Force a unicode string dtype so the array looks the same with either API.
vocab = np.array(feature_names, dtype=str)

In [18]:
vocab.shape

(26576,)

In [19]:
vocab[7000:7020]

array(['cosmonauts', 'cosmos', 'cosponsored', 'cost', 'costa', 'costar',
       'costing', 'costly', 'costruction', 'costs', 'cosy', 'cote',
       'couched', 'couldn', 'council', 'councils', 'counsel',
       'counselees', 'counselor', 'count'], dtype='<U80')

In [20]:
%time U, s, Vh = linalg.svd(vectors, full_matrices=False) #SVD Step

CPU times: user 31.5 s, sys: 746 ms, total: 32.3 s
Wall time: 17 s


In [21]:
print(U.shape, s.shape, Vh.shape)

(2034, 2034) (2034,) (2034, 26576)


In [22]:
num_top_words = 8

def show_topics(a):
    """Summarise each row of `a` as a space-separated string of its top words.

    For every row, the indices of the `num_top_words` largest entries are
    looked up in the module-level `vocab` and joined, largest weight first.
    Returns one string per row.
    """
    summaries = []
    for weights in a:
        # argsort is ascending, so reverse it and keep the leading indices.
        best = np.argsort(weights)[::-1][:num_top_words]
        summaries.append(' '.join(vocab[idx] for idx in best))
    return summaries

In [23]:
show_topics(Vh[:10])

['critus ditto propagandist surname galacticentric kindergarten surreal imaginative',
 'jpeg gif file color quality image jfif format',
 'graphics edu pub mail 128 3d ray ftp',
 'jesus god matthew people atheists atheism does graphics',
 'image data processing analysis software available tools display',
 'god atheists atheism religious believe religion argument true',
 'space nasa lunar mars probe moon missions probes',
 'image probe surface lunar mars probes moon orbit',
 'argument fallacy conclusion example true ad argumentum premises',
 'space larson image theory universe physical nasa material']

In [None]:
##### Next Steps ########

In [27]:
import pandas as pd
import pickle

In [29]:
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [68]:
from gensim import matutils,models
import scipy.sparse

In [81]:
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences` with gensim's `simple_preprocess`.

    Every item is coerced to `str` first and `deacc=True` is passed to the
    tokenizer. Yields one tokenizer result per input item, in order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [84]:
import gensim
data_words = list(sent_to_words(newsgroups_train.data))

[['hi', 've', 'noticed', 'that', 'if', 'you', 'only', 'save', 'model', 'with', 'all', 'your', 'mapping', 'planes', 'positioned', 'carefully', 'to', 'ds', 'file', 'that', 'when', 'you', 'reload', 'it', 'after', 'restarting', 'ds', 'they', 'are', 'given', 'default', 'position', 'and', 'orientation', 'but', 'if', 'you', 'save', 'to', 'prj', 'file', 'their', 'positions', 'orientation', 'are', 'preserved', 'does', 'anyone', 'know', 'why', 'this', 'information', 'is', 'not', 'stored', 'in', 'the', 'ds', 'file', 'nothing', 'is', 'explicitly', 'said', 'in', 'the', 'manual', 'about', 'saving', 'texture', 'rules', 'in', 'the', 'prj', 'file', 'like', 'to', 'be', 'able', 'to', 'read', 'the', 'texture', 'rule', 'information', 'does', 'anyone', 'have', 'the', 'format', 'for', 'the', 'prj', 'file', 'is', 'the', 'cel', 'file', 'format', 'available', 'from', 'somewhere', 'rych']]


In [90]:
def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens found in `stop_words`.

    Each document is converted with `str()` and passed through
    `simple_preprocess`; tokens present in the module-level `stop_words`
    collection are filtered out. Returns a list of token lists, one per
    document.

    NOTE(review): callers in this notebook pass token lists, so `str(doc)`
    produces the list's repr before re-tokenizing; kept as-is to preserve
    the existing behaviour.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the stop-word list scans it for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

In [89]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized texts, keeping only tokens with an allowed POS tag.

    Each token list is joined with single spaces and run through the
    module-level `nlp` pipeline; a token's `lemma_` is kept when its `pos_`
    is in `allowed_postags`. Tag names: https://spacy.io/api/annotation
    """
    def keep_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(tokens) for tokens in texts]

In [100]:
# Fit a phrase (bigram) detector on the tokenized corpus, then wrap it in a
# Phraser, which is the object used to transform documents below.
# NOTE(review): min_count=5 / threshold=100 look hand-picked — confirm they
# suit this corpus.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(texts):
    """Apply the fitted `bigram_mod` phrase model to every document in `texts`.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

In [95]:
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [98]:
import nltk
# Download the NLTK stop-word corpus before importing from it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Start from the English list and append a few extra tokens to filter out.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anandabhishek/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [104]:
# Load the spaCy pipeline before anything uses it. The 'en' shortcut link was
# removed in spaCy 3, so refer to the model package by its full name. Only
# tagging is needed for lemmatization, so the parser and NER are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Preprocessing pipeline: drop stop words -> merge bigrams -> lemmatize.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [105]:
import gensim.corpora as corpora
id2word=corpora.Dictionary(data_lemmatized)

In [106]:
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

In [107]:
# LDA training is stochastic; fix the seed so the fitted topics are
# reproducible across kernel restarts.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=20, random_state=42)
lda.print_topics()

[(0,
  '0.023*"tell" + 0.023*"mary" + 0.016*"get" + 0.015*"actually" + 0.014*"story" + 0.012*"ridiculous" + 0.009*"thing" + 0.009*"people" + 0.008*"never" + 0.008*"know"'),
 (1,
  '0.016*"space" + 0.012*"also" + 0.010*"package" + 0.007*"use" + 0.007*"user" + 0.007*"program" + 0.007*"would" + 0.007*"launch" + 0.006*"wireframe" + 0.006*"model"'),
 (2,
  '0.014*"well" + 0.014*"perhaps" + 0.012*"get" + 0.012*"version" + 0.011*"say" + 0.011*"find" + 0.010*"texture" + 0.010*"change" + 0.009*"burn" + 0.009*"delete"'),
 (3,
  '0.013*"would" + 0.012*"think" + 0.012*"space" + 0.009*"ssrt" + 0.009*"say" + 0.009*"first" + 0.008*"build" + 0.007*"take" + 0.007*"rocket" + 0.007*"well"'),
 (4,
  '0.024*"would" + 0.016*"know" + 0.016*"thank" + 0.014*"software" + 0.012*"get" + 0.011*"look" + 0.010*"card" + 0.009*"need" + 0.009*"interested" + 0.009*"window"'),
 (5,
  '0.015*"distribution" + 0.014*"star" + 0.012*"black_hole" + 0.012*"spectrum" + 0.011*"burster" + 0.011*"ns" + 0.011*"orbit" + 0.010*"moon" 