In [1]:
import pandas as pd
# http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
#currently have 24 topics

In [788]:
data = pd.read_csv('../Data/RYANDATA_filt_even.csv')
data.columns = ['V0', 'topic', 'authors','title','journal','year','vol_issue','doi','abstract']
print(data.groupby('topic')['title'].nunique())

topic
BONE                              500
CARDIOVASCULAR/CARDIOPULMONARY    500
CELLULAR/SUBCELLULAR              500
COMPARATIVE                       500
DENTAL/ORAL/FACIAL                500
ERGONOMICS                        500
EVOLUTION/ANTHROPOLOGY            500
GAIT/LOCOMOTION                   500
JOINT/CARTILAGE                   500
METHODS                           500
MODELING                          500
MUSCLE                            500
NEURAL                            500
ORTHOPAEDICS/SPINE                500
ORTHOPAEDICS/SURGERY              500
PROSTHETICS/ORTHOTICS             500
REHABILITATION                    500
ROBOTICS                          500
SPORT/EXERCISE                    500
TENDON/LIGAMENT                   500
TISSUE/BIOMATERIAL                500
TRAUMA/IMPACTTESTING              500
VETERINARY/AGRICULTURAL           500
VISUAL/VESTIBULAR/EYE             500
Name: title, dtype: int64


In [789]:
#split data keeping distribution
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits= 1, 
                             test_size = 0.2, 
                             random_state = 0)

for train_idx, test_idx in sss.split(data['title'],data['topic']):
    X_train, X_test = data['title'][train_idx], data['title'][test_idx]
    y_train, y_test = data['topic'][train_idx], data['topic'][test_idx]


y_train.value_counts() #same distribution as original data

MUSCLE                            400
TENDON/LIGAMENT                   400
NEURAL                            400
ROBOTICS                          400
CELLULAR/SUBCELLULAR              400
ORTHOPAEDICS/SURGERY              400
TISSUE/BIOMATERIAL                400
COMPARATIVE                       400
BONE                              400
REHABILITATION                    400
VISUAL/VESTIBULAR/EYE             400
EVOLUTION/ANTHROPOLOGY            400
JOINT/CARTILAGE                   400
GAIT/LOCOMOTION                   400
SPORT/EXERCISE                    400
TRAUMA/IMPACTTESTING              400
CARDIOVASCULAR/CARDIOPULMONARY    400
ORTHOPAEDICS/SPINE                400
DENTAL/ORAL/FACIAL                400
METHODS                           400
PROSTHETICS/ORTHOTICS             400
VETERINARY/AGRICULTURAL           400
ERGONOMICS                        400
MODELING                          400
Name: topic, dtype: int64

# Preprocess data 

In [790]:
#https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.stem.porter import *
import numpy as np
np.random.seed(0)

# import nltk
# nltk.download('wordnet')

#tokenize, lemmatized, stemmed
stemmer = SnowballStemmer(language='english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
#             result.append(lemmatize_stemming(token))
            result.append(token)
    result = ' '.join(result)        
    return result

doc_sample = X_train[0]
print('Original document: ')
words = []
for doc in [doc_sample]:
    print(doc)
print('\nTokenized and lemmatized document: ')
print(preprocess(doc_sample))

X_train_proc = X_train.map(preprocess)

Original document: 
Dried plum's unique capacity to reverse bone loss and alter bone metabolism in postmenopausal osteoporosis model

Tokenized and lemmatized document: 
dried plum unique capacity reverse bone loss alter bone metabolism postmenopausal osteoporosis model


# Vectorize data for NMF and LDA

In [791]:
#https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.externals import joblib
from sklearn.feature_extraction import text
#additional stop words
stop_words = text.ENGLISH_STOP_WORDS.union(['biomechan','locomot'])
#Non-negative Matrix Factorization (NMF) likes TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=3, #min occurances needed
                             max_df=0.5, #max occuraces allowed (%)
                             ngram_range=(1,3), #size range of grams (1-2 words)
                             strip_accents='unicode',
                             lowercase =True,
                             analyzer='word', 
#                              token_pattern=r'\w+', #accidentally lets in numbers as strings
                             stop_words = stop_words,
                             smooth_idf = True,
                             token_pattern= '[a-zA-Z-0-9]{3,}')
tfidf = tfidf_vectorizer.fit_transform(X_train_proc)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

#Latent Dirichlet Allocation (LDA) likes CountVectorizer
tf_vectorizer = CountVectorizer(min_df=3,
                                max_df=0.5,
                                ngram_range=(1,3),
                                strip_accents='unicode',
                                lowercase=True,
                                analyzer='word',
                                stop_words=stop_words,
#                                 token_pattern= u'(?ui)\\b\\w*[a-z]+\\w*\\b',
                                token_pattern= '[a-zA-Z-0-9]{3,}'
                               )
tf = tf_vectorizer.fit_transform(X_train_proc)
tf_feature_names = tf_vectorizer.get_feature_names()

In [807]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

num_topics = 25
#NMF Model
nmf = NMF(n_components = num_topics,
          random_state = 0,
          alpha = 0.1,
          l1_ratio = 0.5,
          init = 'nndsvd')
nmf.fit(tfidf)

#LDA Model
lda = LatentDirichletAllocation(n_components=num_topics,
                                max_iter=10,
                                learning_method='online',
                                learning_offset=50,
#                                 learning_decay=0.9,
                                random_state=0, 
                                verbose = True)
%time lda.fit(tf)
print('\nModel Fit')

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
CPU times: user 1min 35s, sys: 3.58 s, total: 1min 38s
Wall time: 31.6 s

Model Fit


In [289]:
# doc_lengths = []
# for i,t in enumerate(tf):
#       doc_lengths.append(tf[i].nnz)

# term_freq = tf.toarray().sum(axis = 0)

# model_data = {'doc_topic_dists' : lda.transform(tf),
#         'topic_term_dists' : lda.components_,
#         'vocab' : tf_feature_names,
#         'doc_lengths' : doc_lengths,
#         'term_frequency' : term_freq}


In [796]:
#Visualize topics/terms using pyLDAvis
import pyLDAvis
%time vis_data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer) #default multidimensional scaling (mds) is PCoA. can try 'mmds'

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


CPU times: user 14.6 s, sys: 328 ms, total: 15 s
Wall time: 3min 26s


In [736]:
#save compiled visualization as html AND underlying data as .pkl
pyLDAvis.save_html(vis_data,'../Models/LDA/lda_vis30_even.html')
from sklearn.externals import joblib
joblib.dump(vis_data, '../Models/LDA/lda_vis_data30_even.pkl')
print('vis data saved')

vis data saved


In [797]:
# loaded_vis_data = joblib.load('../Models/lda_vis_data_not_stem.pkl')
pyLDAvis.display(vis_data)

In [719]:
# pyLDAvis.display(vis_data_mmds) # not as helpful as PCA

In [720]:
from mglearn.tools import print_topics
sorting = np.argsort(lda.components_)[:,::-1]
# print_topics(topics = range(num_topics),
#              feature_names= np.array(tf_feature_names),
#              sorting = sorting,
#              topics_per_chunk=3,
#              n_words= 10
#             )

In [847]:
#PROPOSED NEW PRINT_TOPICS FOR MGLEARN

def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
                 n_words=20):
    for i in range(0, len(topics), topics_per_chunk):
        # for each chunk:
        these_topics = topics[i: i + topics_per_chunk]
        # maybe we have less than topics_per_chunk left
        len_this_chunk = len(these_topics)
        # get max length of feature names
        row = []
        feat_len = []
        
        #generate list of sorted features and their lengths
        for i in range(n_words):
            row.append(feature_names[sorting[these_topics, i]])
        topic_words = np.array(row).T
        #get max feature length for each topic
        max_feat_len = []
        for t in topic_words:
            max_feat_len.append(len(max(t, key = len)))
        #generate space between strings equal to 1+len(longest string in topic)
        result = [None]*len(these_topics)*2
        result[::2] = these_topics
        nums = np.array([(x - 5) for x in max_feat_len])
        nums[nums < 0] = 0 #prevents spaces of negative length
        result[1::2] = [str(x) for x in nums]
        print(("topic {:<{}} " * len_this_chunk).format(*result))
        
        #generate space between strings equal to 1+len(longest string in topic)
        result = [None]*len(these_topics)*2
        result[::2] = ['']*len(these_topics)
        nums = np.array([(x - 8) for x in max_feat_len])
        nums[nums < 0] = 0 #prevents spaces of negative length
        result[1::2] = [str(x) for x in nums]
        print(("-------- {:<{}} " * len_this_chunk).format(*result))
        
        # print top n_words frequent words
        for i in range(n_words):
            #generate space between strings 
            result = [None]*len(these_topics)*2
            result[::2] = feature_names[sorting[these_topics, i]]
            result[1::2] = [str(x+2) for x in max_feat_len]
            try:
                print(("{:<{}}" * len_this_chunk).format(*result))
            except:
                pass
        print("\n")

sorting = np.argsort(lda.components_)[:,::-1]        
print_topics(topics = range(num_topics),
             feature_names= np.array(tf_feature_names),
             sorting = sorting,
             topics_per_chunk=6,
             n_words= 10)



topic 0                    topic 1          topic 2             topic 3            topic 4                topic 5     
--------                   --------         --------            --------           --------               --------    
inhibits                   performance      measurement         joint              mechanical             muscle      
guide                      function         inertial            review             properties             force       
carcinoma                  muscle           heart               imaging            mechanical properties  upper       
myosin                     skeletal         local               clinical           behavior               cell        
teres                      swimming         prevention          systematic         preliminary            activity    
ligamentum                 skeletal muscle  behavioral          analysis           young                  effect      
biophysics                 model            expe

In [882]:
test_titles = data['title'].sample(10)


In [919]:
for n,t in enumerate(test_title):
    print(test_title.index[n])

3998
9522
11177
7813
6707
2979
10724
7038
10835
4122


In [925]:
#get paper titles
test_title_vec = tf_vectorizer.transform(test_titles)
pred_topic = lda.transform(test_title_vec)

 
sorting = np.argsort(lda.components_)[:,::-1]        
topic_most_pr = []
topic_probability = []
topic_features = []
real_topic = []
for n,t in enumerate(test_title):
    topic_most_pr.append(pred_topic[n].argmax())
    topic_probability.append(np.round(-np.sort(-pred_topic[n])[0:3],3)) #just change to [0:3] for top 3 probabilities
    row = []
    for i in range(3):
        row.append(np.array(tf_feature_names)[sorting[topic_most_pr[n], i]]) 
    topic_features.append(row)
    real_topic.append(data['topic'].loc[test_title.index[n]])
    
    
pred_topics = pd.DataFrame({'title': test_title, 
                            'pred_topic' : topic_most_pr, 
                            'top 3 prob' : topic_probability, 
                            'topic features' : topic_features,
                            'biomch-l topic' : real_topic
                           })    
pd.set_option('display.max_colwidth', 80)
# pd.reset_option('display.max_colwidth')
pred_topics


Unnamed: 0,title,pred_topic,top 3 prob,topic features,biomch-l topic
3998,Amputee locomotion: frequency content of prosthetic vs,16,"[0.808, 0.052, 0.05]","[knee, biomechanics, ligament]",GAIT/LOCOMOTION
9522,Mri study of the ligamentization of acl grafts in children with open growth ...,5,"[0.179, 0.147, 0.146]","[muscle, force, upper]",TENDON/LIGAMENT
11177,Outcome of ventral fusion of two or three cervical vertebrae with a locking ...,3,"[0.421, 0.219, 0.127]","[joint, review, imaging]",VETERINARY/AGRICULTURAL
7813,Radial head prosthesis after radial head and neck fractures - current litera...,16,"[0.252, 0.249, 0.187]","[knee, biomechanics, ligament]",PROSTHETICS/ORTHOTICS
6707,The accuracy of the oculus rift virtual reality head-mounted display during ...,24,"[0.449, 0.34, 0.076]","[biomechanical, properties, biomechanical properties]",ORTHOPAEDICS/SPINE
2979,Effect of seat belt pretensioners on human abdomen and thorax: Biomechanical...,10,"[0.381, 0.202, 0.117]","[gait, control, walking]",ERGONOMICS
10724,The relation between mechanical impact parameters and most frequent bicycle ...,10,"[0.695, 0.08, 0.08]","[gait, control, walking]",TRAUMA/IMPACTTESTING
7038,A biomechanical comparison of epiphyseal versus metaphyseal fixed bone-conse...,16,"[0.893, 0.004, 0.004]","[knee, biomechanics, ligament]",ORTHOPAEDICS/SURGERY
10835,Microstructural consequences of blast lung injury characterized with digital...,24,"[0.221, 0.197, 0.161]","[biomechanical, properties, biomechanical properties]",TRAUMA/IMPACTTESTING
4122,The knee adduction moment measured with an instrumented force shoe in patien...,24,"[0.414, 0.355, 0.12]","[biomechanical, properties, biomechanical properties]",JOINT/CARTILAGE


# Grid search parameters for lda

[https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/](https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/)

# GuidedLDA 
[https://medium.freecodecamp.org/how-we-changed-unsupervised-lda-to-semi-supervised-guidedlda-e36a95f3a164](https://medium.freecodecamp.org/how-we-changed-unsupervised-lda-to-semi-supervised-guidedlda-e36a95f3a164)