# Latent Semantic Indexing

Z. W. Miller - Copyright 2018

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import sklearn
import matplotlib
import sys
# (display name, imported module) pairs whose versions are reported below
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
)

# Interpreter version first, then one line per library
print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
Scipy Version: 0.19.1
Sklearn Version: 0.19.0


In [3]:
import numpy as np

class latent_semantic_indexing:
    """
    Latent semantic indexing (LSI) built on a truncated singular
    value decomposition of a document/feature matrix.
    """

    def __init__(self, num_topics=5):
        """
        Latent semantic indexing uses matrix decomposition
        techniques to reduce the large feature space associated
        with text analysis into a smaller "topic" space. It does so
        by exploiting SVD's ability to find correlations in
        features and combine them into super-dimensions made
        of the correlated columns. In text analysis, that
        means if the original features are words, LSI will
        find words that tend to be in the same document together
        and group them as topics.
        ---
        KWargs:
        num_topics: how many of the leading singular directions
        to keep (int)
        """
        self.num_topics = num_topics

    def fit(self, X):
        """
        Using SVD as the base of the algorithm (numpy's
        implementation), we do a dimensionality reduction.
        Remember that V is an expression of the new
        dimensions in terms of the old columns. If we do count
        vectorizer, this is an expression of topics in terms of
        ngrams. We'll use this to extract our topics. We can also
        cast new documents into topic space using the V matrix.
        ---
        Input: X, data matrix (dataframe, array, list of lists)
        Sets: self.U, self.sigma, self.V, each truncated to the
        first num_topics components. At most min(n_rows, n_columns)
        components exist, so asking for more keeps only that many.
        """
        X = self.convert_to_array(X)
        # Only the leading num_topics components are kept, so the reduced
        # SVD is enough; the full one would build an n_rows x n_rows U
        # matrix just to throw most of it away.
        U, sigma, V = np.linalg.svd(X, full_matrices=False)
        self.V = V[:self.num_topics, :]
        self.sigma = sigma[:self.num_topics]
        self.U = U[:, :self.num_topics]

    def transform(self, X):
        """
        Since V is a conversion of columns to the lower
        dimensional space, we can just use matrix
        multiplication to cast any new data into that
        space. Requires fit to have been called first.
        ---
        Input: X, data matrix (dataframe, array, list of lists)
        Output: array with one row per input row and one column
        per kept topic
        """
        X = self.convert_to_array(X)
        return np.dot(X, self.V.T)

    def fit_transform(self, X):
        """
        Fit on X and then transform X and return it as vectors.
        """
        self.fit(X)
        return self.transform(X)

    def print_topics(self, X, id_to_word=None, num_words_per_topics=10):
        """
        For each topic created in the SVD decomposition, print the
        columns with the largest (signed) weights in that topic's
        row of V, in ascending order of weight (strongest last).
        Requires a column number to word dictionary, otherwise just
        prints the column number for the strong correlations.
        ---
        Input: X, not read by this method; kept so existing
        callers keep working
        KWargs:
        id_to_word: mapping from column number to word (dict or None)
        num_words_per_topics: how many columns to print per topic;
        values below 1 print no columns (int)
        """
        for idx, row in enumerate(self.V):
            # Clamp the slice start so 0 means "no words" and a value
            # larger than the row means "every column".
            first_kept = max(len(row) - num_words_per_topics, 0)
            sorted_word_ids = np.argsort(row)[first_kept:]
            print("--- Topic ", idx, " ---")
            if id_to_word is not None:
                labels = [id_to_word[word_id] for word_id in sorted_word_ids]
            else:
                labels = ["Column " + str(word_id) for word_id in sorted_word_ids]
            print(', '.join(labels))

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a Dataframe or series, converts to numpy
        array for calculation purposes.
        ---
        Input: X (array, dataframe, series, or anything np.asarray accepts)
        Output: X (array)
        """
        if isinstance(x, (pd.DataFrame, pd.Series)):
            # .values exists on both old and current pandas, unlike the
            # removed as_matrix().
            return x.values
        # Returns plain ndarrays unchanged and converts everything else.
        return np.asarray(x)

    def handle_1d_data(self, x):
        """
        Converts 1 dimensional data into a series of rows with 1 columns
        instead of 1 row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input and converts it to a numpy array
        and then checks if it needs to be reshaped for us
        to use it properly
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x
                        

In [4]:
import numpy as np
from collections import Counter
from string import punctuation

class count_vectorizer:
    """
    Bag-of-ngrams vectorizer: learns a vocabulary of ngrams from
    training documents and turns documents into count vectors.
    """

    # Deletes every character of string.punctuation in a single pass.
    _PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

    def __init__(self, max_features=None, ngrams = (1,1), tokenizer=None, remove_stopwords=False):
        """
        Count vectorizer reads the text provided, tokenizes it
        with the provided tokenizer (or the default), then generates
        ngrams keeping track of all ngrams as the vocabulary.
        Then it takes provided texts and converts them into vectors
        by counting the appearance of each ngram and tracking that
        for every document.
        ---
        KWargs:
        max_features: how many ngrams to allow in the vector, using the
        most common features first. If None, defaults to using all
        ngrams (int)
        ngrams: how many tokens to combine to form features. First element
        of tuple is starting point, second is ending point.
        tokenizer: what function to use to create tokens (must return
        list of tokens)
        remove_stopwords: whether the default tokenizer drops stopwords
        and integer tokens (bool)
        """
        self.max_features = max_features
        self.vocabulary = {}
        self.ngrams = ngrams
        if tokenizer is None:
            self.tokenizer = self.tokenize
        else:
            self.tokenizer = tokenizer
        self.remove_stopwords = remove_stopwords
        self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again',
                          'there', 'about', 'once', 'during', 'out', 'very', 'having',
                          'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its',
                          'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off',
                          'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the',
                          'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
                          'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
                          'himself', 'this', 'down', 'should', 'our', 'their', 'while',
                          'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
                          'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been',
                          'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that',
                          'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now',
                          'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only',
                          'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being',
                          'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further',
                          'was', 'here', 'than'}

    def token_generator(self, X):
        """
        Generator that returns joined tokens as a single
        string to act as a feature. It generates the tokens
        by iterating through the allowed ngrams and combining
        the appropriate number of tokens into a string.
        ---
        Input: X, list of tokens for one document
        Yields: every ngram (tokens joined by a space), all ngrams
        of the smallest size first, then the next size, and so on
        """
        num_tokens = len(X)
        for size in range(self.ngrams[0], self.ngrams[1] + 1):
            for start in range(num_tokens):
                # Later starts cannot fit a full ngram of this size either.
                if start + size > num_tokens:
                    break
                yield ' '.join(X[start:start + size])

    @staticmethod
    def _is_integer(token):
        """
        Returns True when the token parses as an integer.
        """
        try:
            int(token)
        except ValueError:
            return False
        return True

    def tokenize(self, X):
        """
        Simple tokenizer that removes punctuation,
        lowercases the text, and breaks on whitespace.
        When remove_stopwords is enabled it also drops
        stopwords and tokens that parse as integers;
        otherwise every token is kept.
        ---
        Input: X, one document (str)
        Output: list of tokens
        """
        tokens = X.translate(self._PUNCTUATION_TABLE).lower().split()
        if not self.remove_stopwords:
            return tokens
        return [token for token in tokens
                if not self.check_stopwords(token) and not self._is_integer(token)]

    def check_stopwords(self, token):
        """
        Checks if the token is in our list of common
        stopwords, and returns a boolean.
        """
        return token in self.stopwords

    def fit(self, X):
        """
        Go through all provided training documents and
        create the list of vocabulary for known documents
        by looking at all ngrams and tracking how often
        those ngrams appear. If max_features is defined,
        only keep the most common tokens. Afterward,
        generate a token_to_id mapper and an id_to_token
        mapper. Each call starts from an empty vocabulary,
        so refitting does not accumulate counts from
        earlier calls.
        ---
        Input: X, iterable of documents
        """
        counts = Counter()
        for document in X:
            counts.update(self.token_generator(self.tokenizer(document)))

        # Stored as a plain dict so a missing token is still a KeyError.
        if self.max_features is not None:
            self.vocabulary = dict(counts.most_common(self.max_features))
        else:
            self.vocabulary = dict(counts)

        # Column ids follow the alphabetical order of the kept ngrams.
        ordered_tokens = sorted(self.vocabulary)
        self.token_to_id = {ky: ix for ix, ky in enumerate(ordered_tokens)}
        self.id_to_token = dict(enumerate(ordered_tokens))

    def transform(self, X):
        """
        Go through all provided documents and use the known
        vocabulary to track how often each ngram appears in
        the document. Ngrams outside the vocabulary are ignored.
        ---
        Input: X, iterable of documents
        Output: float array with one row per document and one
        column per vocabulary entry (zero rows for empty input)
        """
        documents = list(X)
        # Allocate the whole matrix once rather than re-stacking per document.
        vectorized_docs = np.zeros((len(documents), len(self.vocabulary)))
        for doc_ix, document in enumerate(documents):
            for token in self.token_generator(self.tokenizer(document)):
                word_id = self.token_to_id.get(token)
                if word_id is not None:
                    vectorized_docs[doc_ix, word_id] += 1
        return vectorized_docs

    def fit_transform(self, X):
        """
        Fit on X and then transform X and return it as vectors.
        """
        self.fit(X)
        return self.transform(X)
                        

In [5]:
# Vectorizer limited to the 200 most common unigrams/bigrams, with the
# default tokenizer's stopword filtering switched on.
cv = count_vectorizer(max_features=200, ngrams=(1,2), remove_stopwords=True)

In [6]:
from sklearn import datasets

# Three newsgroups used as the corpus for the topic extraction below
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball']
# Training split only, asking sklearn to strip headers, footers and quotes
ng_train = datasets.fetch_20newsgroups(subset='train', 
                                       categories=categories, 
                                       remove=('headers', 
                                               'footers', 'quotes'))
# Documents handed to the vectorizer (presumably raw post texts; confirm
# against the sklearn dataset documentation)
data = ng_train.data

In [7]:
# Learn the vocabulary from the corpus and build the document/ngram count matrix
X = cv.fit_transform(data)

In [8]:
# Decompose the count matrix and keep the six leading components as topics
lsi = latent_semantic_indexing(num_topics=6)
lsi.fit(X)

In [9]:
# Print the top-weighted ngrams per topic, mapping column ids back to ngrams.
# NOTE: print_topics only reads lsi.V; its first argument is not used for
# the printout, so the transformed matrix is computed here only to fill it.
lsi.print_topics(lsi.transform(X), id_to_word=cv.id_to_token)

--- Topic  0  ---
season, players, hes, games, last year, pitching, baseball, league, id, won
--- Topic  1  ---
files, version, color, programs, format, images, image, file, gif, jpeg
--- Topic  2  ---
people, one, see, dont, file, jesus, atheists, god, gif, jpeg
--- Topic  3  ---
may, exist, atheist, many, religion, believe, religious, atheism, god, atheists
--- Topic  4  ---
three, god, set, display, software, one, using, data, jesus, image
--- Topic  5  ---
course, even, said, see, religion, atheism, religious, many, atheists, jesus
