In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
import string
import pandas as pd
import numpy as np

#download assets from nltk
#nltk.download('stopwords')
#nltk.download('punkt')

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    NOTE(review): TfidfVectorizer is not imported in the visible part of this
    file - confirm it is brought into scope elsewhere
    (sklearn.feature_extraction.text.TfidfVectorizer).

    Args
    - corpus: a list of documents

    Returns
    - tfidfVec: an m x n sparse matrix of the corpus. m = number of documents, n = number of different
      terms used in the documents (one row per document, one column per term)
    - vocab: list of the terms learned by the vectorizer, excluding english stop words. vocab[j] is the
      term behind column j of tfidfVec
    '''

    vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
    tfidfVec = vectorizer.fit_transform(corpus)

    #get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
    #so prefer its replacement and only fall back on old installations
    getNames = getattr(vectorizer, 'get_feature_names_out', None)
    if getNames is None:
        getNames = vectorizer.get_feature_names
    #get_feature_names_out() returns an array; keep handing callers a list
    vocab = list(getNames())

    return tfidfVec, vocab

def freqCount(corpus):
    '''
    Placeholder that has not been implemented yet.

    Args
    - corpus: currently ignored

    Returns
    - None
    '''

    return None

def svd(tfidfVec, nComponents = 10, nIter = 20):
    '''
    Gives the truncated singular value decomposition of an m x n matrix.
    A ~= U * diag(sigma) * V^t

    NOTE(review): TruncatedSVD is not imported in the visible part of this
    file - confirm it is brought into scope elsewhere
    (sklearn.decomposition.TruncatedSVD).

    Args
    - tfidfVec: an m x n matrix. m = number of documents or sentences, n = number of terms
    - nComponents: number of topics to extract (default 10). Capped at n, since a matrix with
      n terms cannot have more than n topics
    - nIter: number of iterations run by the randomized SVD solver (default 20)

    Returns
    - U: an m x r matrix of left singular vectors (document-topic table). r = number of topics
    - sigma: a 1-D array with the r singular values in decreasing order
    - V: an n x r matrix of right singular vectors (term-topic table), i.e. the transpose of V^t
    '''

    #asking for more topics than there are terms is rejected by TruncatedSVD
    nComponents = min(nComponents, np.shape(tfidfVec)[1])

    lsa = TruncatedSVD(n_components = nComponents, n_iter = nIter)
    #fit_transform() returns the projected data U * sigma, not U itself
    projected = lsa.fit_transform(tfidfVec)
    sigma = lsa.singular_values_

    #undo the sigma scaling so that callers which weight U by sigma themselves
    #do not apply it twice; topics with a zero singular value get a zero column
    u = np.divide(projected, sigma, out = np.zeros_like(projected), where = sigma > 0)
    vt = lsa.components_.T

    return u, sigma, vt

def getImportantSentences(u, sigma):
    '''
    Uses the LSA enhancement described by Josef Steinberger, et al.
    Take all topics whose singular value is at least half of the largest singular value

    Compute sk = sqrt(sum(u_ki^2 * sigma_i^2) from i = 1 to n)
    sk is the length of the vector of the kth sentence
    n is the number of topics that were kept

    Args
    - u: document-topic matrix from SVD, one row per sentence and one column per topic
      (numpy array or nested list)
    - sigma: singular values from SVD as a 1-D sequence sorted in descending order

    Returns
    - Vector of indices corresponding to the sentences in corpus sorted in descending order
      of importance. Sentences with equal scores keep their corpus order.

    Raises
    - IndexError if sigma is empty
    '''

    u = np.asarray(u)
    sigma = np.asarray(sigma)

    #sigma is sorted in descending order, so the topics to keep form a prefix of it.
    #binary searching the negated (ascending) values gives the length of that prefix
    target = sigma[0]/2
    sigmaBound = np.searchsorted(-sigma, -target, side = 'right')

    #broadcasting scales column i by sigma_i^2 without building a diagonal matrix
    weighted = np.square(u[:, :sigmaBound]) * np.square(sigma[:sigmaBound])
    scores = np.sqrt(np.sum(weighted, axis = 1))

    #a stable sort on the negated scores gives descending order with ties left in place
    return np.argsort(-scores, kind = 'stable')

def createWordToSentenceMap(corpus):
    '''
    Builds an index from each word to the sentences of the corpus that contain it.

    Every sentence has its punctuation stripped before it is tokenized. The apostrophe
    counts as punctuation, so a contraction collapses into one token ("don't" -> "dont").
    Words are lowercased and english stop words are left out of the index.

    Args
    - corpus of sentences used in this summary

    Returns
    - a dictionary mapping each word to the list of positions (in corpus) of the sentences
      containing it. A sentence is listed once per word, in increasing order.
    '''

    englishStops = set(stopwords.words('english'))
    stripPunctuation = str.maketrans('', '', string.punctuation)
    index = {}

    for sentenceId, sentence in enumerate(corpus):
        tokens = word_tokenize(sentence.translate(stripPunctuation))

        #a set keeps one entry per word so a sentence is never recorded twice for it
        for term in {token.lower() for token in tokens}:
            if term in englishStops:
                continue
            index.setdefault(term, []).append(sentenceId)

    return index

def extractSummary(u, sigma, k, corpus):
    '''
    Helper method to get the text summary.

    Ranks the sentences with getImportantSentences() and picks the first k of that ranking.

    Args
    - U, sigma from SVD
    - k: number of sentences to include in summary
    - corpus: the list of sentences

    Returns
    - list of the selected sentences from corpus, most important first
    '''

    ranking = getImportantSentences(u, sigma)

    summary = []
    for sentenceIdx in ranking[:k]:
        summary.append(corpus[sentenceIdx])

    return summary