In [1]:
from nltk.corpus import gutenberg
import nltk

In [2]:
# Lookup table mapping English contractions to their expanded forms.
# Consumed by expand_contractions(); the "I..." entries are listed in both
# capitalised and lower-case form so exact-case lookups succeed.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    # Fixed: the expansion previously read "he he will have".
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [3]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer

# English stopword list from the NLTK stopwords corpus; read by remove_stopwords().
stopword_list = nltk.corpus.stopwords.words('english')
# Single shared lemmatizer instance; used by lemmatize_text().
wnl = WordNetLemmatizer()

def tokenize_text(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and strip each token.

    Returns:
        A list of token strings with surrounding whitespace removed.
    """
    return [word.strip() for word in nltk.word_tokenize(text)]

def expand_contractions(text, contraction_mapping):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. The expansion is taken from
    ``contraction_mapping`` (exact key first, then the lower-cased key); when
    the matched contraction starts with an upper-case letter the first letter
    of the expansion is upper-cased as well. After expansion every remaining
    apostrophe is removed from the text.

    Args:
        text: The input string.
        contraction_mapping: Dict mapping a contraction to its expansion.

    Returns:
        The expanded string with all apostrophes removed.
    """
    # Empty keys would make the pattern match the empty string everywhere.
    # Longest keys go first because regex alternation picks the first
    # alternative that matches: otherwise "can't" would win over "can't've"
    # and leave a dangling "'ve" behind.
    alternatives = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)

    if alternatives:
        # Fallback table for mappings whose keys are not lower-case.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowered_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            # Preserve capitalisation only. Copying the matched first
            # character verbatim corrupts expansions whose first letter
            # differs from the contraction's ("ain't" -> "is not",
            # "'cause" -> "because").
            if match[0].isupper():
                expanded_contraction = (expanded_contraction[0].upper()
                                        + expanded_contraction[1:])
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # Drop any apostrophes that were not part of a known contraction.
    expanded_text = re.sub("'", "", text)
    return expanded_text
    
    
from pattern.en import tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` with ``pattern.en.tag`` and map the tags to WordNet POS.

    Tags starting with ``J``/``V``/``N``/``R`` map to the WordNet adjective,
    verb, noun and adverb constants respectively; any other tag maps to
    ``None``.

    Returns:
        A list of ``(lower-cased word, WordNet POS or None)`` tuples.
    """
    # Leading letter of the tagger's tag -> WordNet POS constant.
    wordnet_pos_by_initial = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }

    annotated = []
    for token, source_tag in tag(text):
        wordnet_pos = wordnet_pos_by_initial.get(source_tag[:1])
        annotated.append((token.lower(), wordnet_pos))
    return annotated
    
# lemmatize text based on POS tags    
def lemmatize_text(text):
    """Lemmatize every word of ``text`` that has a WordNet POS tag.

    Words are tagged (and lower-cased) by ``pos_tag_text``; words without a
    WordNet POS are passed through unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)
    

def remove_special_characters(text):
    """Strip ``string.punctuation`` characters from every token of ``text``.

    Tokens that become empty after stripping are dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped_tokens = (punctuation_pattern.sub('', token)
                       for token in tokenize_text(text))
    return ' '.join(token for token in stripped_tokens if token)
    
    
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in tokenize_text(text):
        if token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

    
def normalize_corpus(corpus, tokenize=False):
    """Run the full normalization pipeline over every document in ``corpus``.

    Each document goes through, in order: contraction expansion,
    lemmatization, special-character removal and stopword removal.

    Args:
        corpus: Iterable of document strings.
        tokenize: When true, each normalized document is returned as a list
            of tokens instead of a single string.

    Returns:
        A list with one normalized entry per input document.
    """
    def normalize_document(document):
        document = expand_contractions(document, CONTRACTION_MAP)
        document = lemmatize_text(document)
        document = remove_special_characters(document)
        document = remove_stopwords(document)
        return tokenize_text(document) if tokenize else document

    return [normalize_document(document) for document in corpus]

**Latent Semantic Indexing**

* LSI uses singular value decomposition (SVD). The main principle behind LSI is that similar terms tend to be used in the same context and hence co-occur.

In [4]:
# Eight short example documents used throughout the notebook.
toy_corpus = [
    "The fox jumps over the dog",
    "The fox is very clever and quick",
    "The dog is slow and lazy",
    "The cat is smarter than the fox and the dog",
    "Python is an excellent programming language",
    "Java and Ruby are other programming languages",
    "Python and Java are very popular programming languages",
    "Python programs are smaller than Java programs",
]

In [5]:
from gensim import corpora,models
import numpy as np

# Normalize the toy corpus; tokenize=True yields one token list per document.
norm_tokenized_corpus=normalize_corpus(toy_corpus,tokenize=True)
norm_tokenized_corpus

[['fox', 'jump', 'dog'],
 ['fox', 'clever', 'quick'],
 ['dog', 'slow', 'lazy'],
 ['cat', 'smarter', 'fox', 'dog'],
 ['python', 'excellent', 'programming', 'language'],
 ['java', 'ruby', 'programming', 'language'],
 ['python', 'java', 'popular', 'programming', 'language'],
 ['python', 'program', 'small', 'java', 'program']]

In [6]:
# Build the gensim dictionary over the normalized tokens and show its token -> id map.
dictionary=corpora.Dictionary(norm_tokenized_corpus)
dictionary.token2id

{'cat': 7,
 'clever': 3,
 'dog': 0,
 'excellent': 9,
 'fox': 1,
 'java': 13,
 'jump': 2,
 'language': 10,
 'lazy': 5,
 'popular': 15,
 'program': 16,
 'programming': 11,
 'python': 12,
 'quick': 4,
 'ruby': 14,
 'slow': 6,
 'small': 17,
 'smarter': 8}

In [7]:
# Convert each tokenized document to bag-of-words form: a list of (token id, count) pairs.
corpus=[dictionary.doc2bow(text) for text in norm_tokenized_corpus]
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1)],
 [(0, 1), (5, 1), (6, 1)],
 [(0, 1), (1, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(10, 1), (11, 1), (13, 1), (14, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (15, 1)],
 [(12, 1), (13, 1), (16, 2), (17, 1)]]

In [8]:
tfidf=models.TfidfModel(corpus)# fit a TF-IDF model on the bag-of-words corpus
corpus_tfidf=list(tfidf[corpus])# apply the model: one list of (token id, TF-IDF weight) pairs per document
corpus_tfidf

[[(0, 0.39239043318859274), (1, 0.39239043318859274), (2, 0.8319011334792957)],
 [(1, 0.31639356562839216), (3, 0.6707813025230176), (4, 0.6707813025230176)],
 [(0, 0.31639356562839216), (5, 0.6707813025230176), (6, 0.6707813025230176)],
 [(0, 0.30165504678093485),
  (1, 0.30165504678093485),
  (7, 0.6395343874660627),
  (8, 0.6395343874660627)],
 [(9, 0.7744161642390763),
  (10, 0.36527597081532565),
  (11, 0.36527597081532565),
  (12, 0.36527597081532565)],
 [(10, 0.36527597081532565),
  (11, 0.36527597081532565),
  (13, 0.36527597081532565),
  (14, 0.7744161642390763)],
 [(10, 0.34310292128626624),
  (11, 0.34310292128626624),
  (12, 0.34310292128626624),
  (13, 0.34310292128626624),
  (15, 0.7274074110285933)],
 [(12, 0.20213859787651756),
  (13, 0.20213859787651756),
  (16, 0.8571020823668701),
  (17, 0.42855104118343507)]]

In [9]:
# Number of latent topics to extract; reused by the later SVD and printing cells.
total_topics=2
# Fit a gensim LSI model on the TF-IDF corpus; id2word lets it report terms instead of ids.
lsi=models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics=total_topics)

In [10]:
# Show each topic as a weighted combination of its terms.
lsi.print_topics(total_topics)

[(0,
  '0.459*"language" + 0.459*"programming" + 0.344*"java" + 0.344*"python" + 0.336*"popular" + 0.318*"excellent" + 0.318*"ruby" + 0.148*"program" + 0.074*"small" + 0.000*"slow"'),
 (1,
  '-0.459*"dog" + -0.459*"fox" + -0.444*"jump" + -0.322*"cat" + -0.322*"smarter" + -0.208*"slow" + -0.208*"lazy" + -0.208*"quick" + -0.208*"clever" + 0.000*"programming"')]

In [11]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='frequency'):
    """Fit a scikit-learn vectorizer on ``documents`` and return the features.

    Args:
        documents: Iterable of document strings.
        feature_type: One of ``'binary'``, ``'frequency'`` or ``'tfidf'``
            (case-insensitive, surrounding whitespace ignored).

    Returns:
        A ``(vectorizer, feature_matrix)`` tuple: the fitted vectorizer and
        the document-term matrix cast to float.

    Raises:
        ValueError: If ``feature_type`` is not one of the supported values.
    """
    feature_type = feature_type.lower().strip()

    # Validate up front, before any vectorizer is built. ValueError is a
    # subclass of Exception, so callers catching Exception still work.
    if feature_type not in ('binary', 'frequency', 'tfidf'):
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1,
                                     ngram_range=(1, 1))
    else:
        # 'binary' and 'frequency' differ only in the binary flag.
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'),
                                     min_df=1,
                                     ngram_range=(1, 1))

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix


from scipy.sparse.linalg import svds
    
def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated SVD of ``matrix`` with ``scipy.sparse.linalg.svds``.

    Args:
        matrix: The matrix to decompose.
        singular_count: Number of singular values/vectors to compute
            (forwarded to ``svds`` as ``k``).

    Returns:
        The ``(u, s, vt)`` triple produced by ``svds``.
    """
    left_vectors, singular_values, right_vectors_t = svds(matrix, k=singular_count)
    return left_vectors, singular_values, right_vectors_t
    

In [14]:
# Normalize the corpus to plain strings (no tokenization) for the scikit-learn vectorizer.
norm_corpus=normalize_corpus(toy_corpus)
vectorizer,tfidf_matrix=build_feature_matrix(norm_corpus,feature_type="tfidf")
# Transpose the document-term matrix so rows are terms and columns are documents.
td_matrix=tfidf_matrix.transpose()
td_matrix.todense()

matrix([[0.        , 0.        , 0.        , 0.57297276, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.62956522, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.50559126, 0.        , 0.4552969 , 0.41436966, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.62390152,
         0.        , 0.        , 0.        ],
        [0.50559126, 0.4552969 , 0.        , 0.41436966, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.45120095, 0.41127481, 0.29411648],
        [0.69911012, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.45120095,
         0.45120095, 0.41127481, 0.        ],
        [0.        , 0.        , 0.62956522, 0.        , 0.        ,
         0.

In [16]:
# Element-wise multiply by the (td_matrix > 0) mask, which zeroes any non-positive entries.
# NOTE(review): the displayed matrix is unchanged by this step for the toy corpus — confirm it is still needed.
td_matrix=td_matrix.multiply(td_matrix>0)
td_matrix.todense()

matrix([[0.        , 0.        , 0.        , 0.57297276, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.62956522, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.50559126, 0.        , 0.4552969 , 0.41436966, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.62390152,
         0.        , 0.        , 0.        ],
        [0.50559126, 0.4552969 , 0.        , 0.41436966, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.45120095, 0.41127481, 0.29411648],
        [0.69911012, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.45120095,
         0.45120095, 0.41127481, 0.        ],
        [0.        , 0.        , 0.62956522, 0.        , 0.        ,
         0.

In [30]:
# Truncated SVD of the term-document matrix, keeping total_topics singular
# values. Uses the notebook's low_rank_svd helper instead of calling svds directly.
u,s,vt=low_rank_svd(td_matrix,singular_count=total_topics)
# Scale each topic's term vector by its singular value; s[:,None] is a column
# so it broadcasts across the rows of u.T, giving one row of term weights per topic.
weights=u.transpose()*s[:,None]

In [31]:
# Sanity-check the shapes involved in the broadcasted product that builds `weights`.
print(s[:,None].shape)
print(s.shape)
print(u.shape)
print(u.transpose().shape)

(2, 1)
(2,)
(18, 2)
(2, 18)


In [34]:
def get_topics_terms_weights(weights, feature_names):
    """Pair every term with its weight for each topic, strongest first.

    Args:
        weights: 2-D array with one row of term weights per topic.
        feature_names: Sequence of terms, indexed like the columns of
            ``weights``.

    Returns:
        A list with one 2-column array per topic; each row is
        ``(term, weight)``, ordered by descending absolute weight. Stacking
        terms with numbers yields a string-typed array, so the weights come
        back as strings.
    """
    terms = np.array(feature_names)
    # Ascending argsort of |weight| per topic, reversed to get descending order.
    descending_order = np.argsort(np.abs(weights))[:, ::-1]

    topics = []
    for topic_weights, order in zip(weights, descending_order):
        ranked_terms = terms[order]
        ranked_weights = topic_weights[order]
        topics.append(np.column_stack((ranked_terms, ranked_weights)))

    return topics

def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    """Print the leading terms of the first ``total_topics`` topics.

    Args:
        topics: Sequence of topics; each topic is an iterable of
            ``(term, weight)`` pairs whose weight is convertible with
            ``float``.
        total_topics: How many topics to print; capped at ``len(topics)``.
        weight_threshold: Terms whose absolute weight is below this value
            are omitted.
        display_weights: When true, print ``(term, rounded weight)`` pairs;
            otherwise print the terms only.
        num_terms: Maximum number of terms to print per topic; ``None``
            prints all terms that pass the threshold.

    Returns:
        None. Output is written with ``print``.
    """
    # Cap at the number of topics supplied so an oversized total_topics
    # does not raise IndexError.
    for index in range(min(total_topics, len(topics))):
        # Weights may arrive as strings (string-typed arrays), hence float().
        topic = [(term, float(wt))
                 for term, wt in topics[index]]
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]

        if display_weights:
            print('Topic #',str(index+1),' with weights')
            print(topic[:num_terms])
        else:
            print('Topic #',str(index+1),' without weights')
            tw = [term for term, wt in topic]
            # Previously the conditional wrapped the print call itself, so
            # nothing was printed when num_terms was left at None.
            print(tw[:num_terms] if num_terms else tw)
        # Blank separator line; a bare `print` is a no-op in Python 3.
        print()
        
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back only on versions that lack it.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# Rank the terms of every SVD topic by absolute weight and print those whose
# absolute weight is at least 0.15.
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 weight_threshold=0.15,
                 display_weights=True)


Topic # 1  with weights
[('dog', 0.72), ('fox', 0.72), ('jump', 0.43), ('smarter', 0.34), ('cat', 0.34), ('slow', 0.23), ('lazy', 0.23), ('quick', 0.23), ('clever', 0.23)]
Topic # 2  with weights
[('programming', 0.73), ('language', 0.73), ('java', 0.56), ('python', 0.56), ('popular', 0.34), ('ruby', 0.33), ('excellent', 0.33), ('program', 0.21)]


In [None]:
def train_lsi_model_gensim(corpus, total_topics=2):
    """Train a gensim LSI model on ``corpus``.

    The corpus is normalized and tokenized, mapped to bag-of-words vectors
    through a fresh gensim dictionary, re-weighted with TF-IDF, and then
    passed to ``models.LsiModel``.

    Args:
        corpus: Iterable of raw document strings.
        total_topics: Number of topics requested from the model.

    Returns:
        The trained ``LsiModel``.
    """
    token_lists = normalize_corpus(corpus, tokenize=True)
    vocabulary = corpora.Dictionary(token_lists)
    bow_vectors = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    tfidf_model = models.TfidfModel(bow_vectors)
    return models.LsiModel(tfidf_model[bow_vectors],
                           id2word=vocabulary,
                           num_topics=total_topics)
 




def train_lda_model_gensim(corpus, total_topics=2):
    """Train a gensim LDA model on ``corpus``.

    The corpus is normalized and tokenized, mapped to bag-of-words vectors
    through a fresh gensim dictionary, re-weighted with TF-IDF, and then
    passed to ``models.LdaModel`` with ``iterations=1000``.

    Args:
        corpus: Iterable of raw document strings.
        total_topics: Number of topics requested from the model.

    Returns:
        The trained ``LdaModel``.
    """
    # NOTE(review): the model is fit on TF-IDF weights rather than raw
    # counts — confirm this is intended for LDA.
    token_lists = normalize_corpus(corpus, tokenize=True)
    vocabulary = corpora.Dictionary(token_lists)
    bow_vectors = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    tfidf_model = models.TfidfModel(bow_vectors)
    return models.LdaModel(tfidf_model[bow_vectors],
                           id2word=vocabulary,
                           iterations=1000,
                           num_topics=total_topics)
