In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
df = pd.read_csv('papers.csv')

In [2]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
import re 
from nltk.corpus import stopwords  
from nltk.stem.wordnet import WordNetLemmatizer


stop_words = set(stopwords.words('english'))

#Creating a list of custom stopwords 
new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

stop_words = list(stop_words.union(new_words))

def pre_process(text):
    
    #lowercase 
    text = text.lower()
    
    #remove tags 
    text = re.sub('&lt;?.*&gt' , '&lt:&gt; ' , text)
    
    #remove special characters and digits 
    text = re.sub('(\\d|\\W)+' , ' ' ,text)
    
    #convert list to string 
    text = text.split()
    
    #remove stopwords 
    text = [word for word in text if word not in stop_words]
    
    #remove words less than three letters
    text = [word for word in text if len(word) >= 3]
    
    #lemmatize 
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    
    return ' '.join(text)

docs = df['paper_text'].apply(lambda x : pre_process(x))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

#docs = docs.tolist()
#Create a vocabulary of words,

cv = CountVectorizer(max_df = 0.95 ,#ignore words that appear in 95% of documents)
                     max_features = 10000, #the size of the vocabulary
                     ngram_range = (1,3) #vocabulary contains single words,bigrams , trigrams
                    )
word_count_vector = cv.fit_transform(docs)

# TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(smooth_idf = True , use_idf = True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

# Function to extract the key words

In [6]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col , coo_matrix.data)
    return sorted(tuples , key = lambda x : (x[1] , x[0]) , reverse = True)

def extract_topn_from_vectors(feature_names , sorted_items , topn = 10):
    ''' get the feature name and tf-idf score of top n items'''
    
    #use only top n items from the vector
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx,score in sorted_items:
        fname = feature_names[idx]
        
        #Keep track of feature names and it's corrosponding score
        score_vals.append(round(score,3))
        feature_vals.append(feature_names[idx])
        
    #Create a tuple of feature and score 
    #results = zip(feature, score)
    
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
        
    return results

#Get the feature names 
feature_names  = cv.get_feature_names()

def get_keywords(idx , docs):
    
    #generate tf-idf for the documents
    tf_idf_vectors = tfidf_transformer.transform(cv.transform([docs[idx]]))
    
    #Sort the tf-idf vector by descending order of score
    sorted_items = sort_coo(tf_idf_vectors.tocoo())
    
    #Extract only top n and n here is 10 
    keywords = extract_topn_from_vectors(feature_names,sorted_items , 10)
    
    return keywords

def print_results(idx, keywords , df):
    #Now print the results 
    print('\n======Title=====')
    print(df['title'][idx])
    print('\n======Abstract=====')
    print(df['abstract'][idx])
    print('\n=====keywords=====')
    for k in keywords:
        print(k,keywords[k])
        
idx = 941
keywords = get_keywords(idx , docs)
print_results(idx, keywords,df)


Algorithms for Non-negative Matrix Factorization

Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

=====keywords=====
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175
matrix factorization 0.163
mat