In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re, string
import spacy
import gensim
from gensim import corpora
from gensim.models import TfidfModel
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel
import tqdm

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
predf = pd.read_csv('./train.csv')
predf.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:
# (rows, columns) of the raw frame.
predf.shape

(20972, 9)

In [4]:
# Per-category totals: every column from position 3 onward is summed and the
# results are shown under shortened category labels.
compare = [predf[name].sum() for name in predf.columns[3:]]
comparedf = pd.DataFrame(
    {'Counts': compare},
    index=['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Biology', 'Finance'],
)
comparedf

Unnamed: 0,Counts
Computer Science,8594
Physics,6013
Mathematics,5618
Statistics,5206
Biology,587
Finance,249


In [5]:
# Abstract text and the six category indicator columns.
# Neither X nor y is read again in the cells shown in this notebook section.
X=predf['ABSTRACT']
y=predf[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]

In [7]:
# Work on a single-column frame that holds only the abstracts.
maindf = pd.DataFrame(predf['ABSTRACT'])
# Widen the column display so more of each abstract shows in rendered tables.
pd.set_option('max_colwidth', 175)
maindf.head()

Unnamed: 0,ABSTRACT
0,"Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at tw..."
1,"Rotation invariance and translation invariance have great values in image\nrecognition tasks. In this paper, we bring a new architecture in convolutional\nneural network..."
2,"We introduce and develop the notion of spherical polyharmonics, which are a\nnatural generalisation of spherical harmonics. In particular we study the\ntheory of zonal p..."
3,The stochastic Landau--Lifshitz--Gilbert (LLG) equation coupled with the\nMaxwell equations (the so called stochastic MLLG system) describes the creation\nof domain wall...
4,Fourier-transform infra-red (FTIR) spectra of samples from 7 plant species\nwere used to explore the influence of preprocessing and feature extraction on\nefficiency of ...


In [8]:
# Text cleaning: lowercase, drop bracketed spans, punctuation, digit-bearing tokens and newlines
def dataCleaning(text):
    """Normalise one abstract before stop-word removal.

    Steps, in order: lowercase the text; delete ``[...]`` spans that sit on a
    single line; replace every ASCII punctuation character with a space;
    delete any token that contains a digit; replace newlines with spaces.

    Args:
        text: Raw abstract as a ``str``.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling, which warns on unrecognised escapes in plain literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('\n', ' ', text)
    return text

# Clean every abstract and keep the result as a one-column frame.
cleandf = maindf['ABSTRACT'].apply(dataCleaning).to_frame()
cleandf.head()

Unnamed: 0,ABSTRACT
0,predictive models allow subject specific inference when analyzing disease related alterations in neuroimaging data given a subject s data inference can be made at two ...
1,rotation invariance and translation invariance have great values in image recognition tasks in this paper we bring a new architecture in convolutional neural network ...
2,we introduce and develop the notion of spherical polyharmonics which are a natural generalisation of spherical harmonics in particular we study the theory of zonal pol...
3,the stochastic landau lifshitz gilbert llg equation coupled with the maxwell equations the so called stochastic mllg system describes the creation of domain walls ...
4,fourier transform infra red ftir spectra of samples from plant species were used to explore the influence of preprocessing and feature extraction on efficiency of mac...


In [9]:
# Remove Stopwords
# Start from NLTK's English stop-word list.
stop_words = stopwords.words('english')

# Extra words to drop on top of the NLTK list ('term' appears twice; the
# duplicate has no effect on membership tests).
rem_words = ['new', 'g', 'result', 'application', 'many', 'type', 'paper', 'effect', 'term', 'positive', 'weak', 'model', 'models', 'method', 'time', 'approach', 'datum', 'data', 'value', 'number', 'non', 'term', 'large', 'case', 'study', 'high', 'system', 'space', 'p', 'n', 'low', 'show', 'form', 'work', 'first', 'simple']
stop_words += rem_words
#print(stop_words)

def remove_stopwords(text):
    """Drop every token of ``text`` that appears in the global ``stop_words``.

    Args:
        text: Space-separated string.

    Returns:
        The surviving tokens joined by single spaces. Splitting is on a
        literal ``' '``, so empty tokens produced by repeated spaces are kept.
    """
    # A set gives constant-time membership tests. Building it costs one pass
    # over the stop list per call, far less than scanning the list per token.
    blocked = frozenset(stop_words)
    return ' '.join(token for token in text.split(' ') if token not in blocked)

# Overwrite the ABSTRACT column in place with the stop-word-filtered text.
cleandf['ABSTRACT'] = cleandf['ABSTRACT'].apply(remove_stopwords)

cleandf.head()

Unnamed: 0,ABSTRACT
0,predictive allow subject specific inference analyzing disease related alterations neuroimaging given subject inference made two levels global e identifiying conditi...
1,rotation invariance translation invariance great values image recognition tasks bring architecture convolutional neural network cnn named cyclic convolutional layer ...
2,introduce develop notion spherical polyharmonics natural generalisation spherical harmonics particular theory zonal polyharmonics allows us analogously zonal harmoni...
3,stochastic landau lifshitz gilbert llg equation coupled maxwell equations called stochastic mllg describes creation domain walls vortices fundamental objects nove...
4,fourier transform infra red ftir spectra samples plant species used explore influence preprocessing feature extraction efficiency machine learning algorithms wavelet...


In [10]:

# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [11]:
# Lemmatization and Part of Speech Tagging
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatise documents and keep only tokens with an allowed POS tag.

    Uses the global spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of strings, one per document.
        allowed_postags: Container of ``token.pos_`` values to keep. The
            default is an immutable tuple; a list works as well.

    Returns:
        A list with one list of lemma strings per input document, in input
        order.
    """
    # nlp.pipe streams the documents through the pipeline in batches instead
    # of paying the per-call overhead of nlp(text) for every abstract.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

In [12]:
data_list = cleandf['ABSTRACT'].tolist()
# Spot-check one cleaned abstract.
print(data_list[100])

  humans learn continuous manner  old rarely utilized knowledge overwritten incoming information important  frequently used knowledge prevented erased  artificial learning systems  lifelong learning far focused mainly accumulating knowledge tasks overcoming catastrophic forgetting   argue  given limited capacity unlimited information learned  knowledge preserved erased selectively  inspired neuroplasticity  propose novel lifelong learning  coined memory aware synapses  mas   computes importance parameters neural network unsupervised online manner  given sample fed network  mas accumulates importance measure parameter network  based sensitive predicted output function change parameter  learning task  changes important parameters penalized  effectively preventing important knowledge related previous tasks overwritten   interesting connection local version hebb rule learning process brain  test sequence object recognition tasks challenging problem learning embedding predicting    subject 

In [13]:
%%time
# Lemmatise the whole corpus, then spot-check document 100.
token_data = lemmatization(data_list)
print(token_data[100])

['human', 'continuous', 'manner', 'old', 'knowledge', 'incoming', 'information', 'important', 'knowledge', 'artificial', 'learning', 'system', 'lifelong', 'learning', 'knowledge', 'task', 'catastrophic', 'limited', 'capacity', 'unlimited', 'information', 'knowledge', 'neuroplasticity', 'novel', 'lifelong', 'memory', 'aware', 'synapsis', 'importance', 'parameter', 'neural', 'network', 'unsupervised', 'online', 'manner', 'sample', 'network', 'importance', 'measure', 'parameter', 'sensitive', 'output', 'function', 'change', 'parameter', 'task', 'important', 'parameter', 'important', 'knowledge', 'previous', 'task', 'interesting', 'connection', 'local', 'version', 'hebb', 'rule', 'learning', 'process', 'brain', 'test', 'sequence', 'object', 'recognition', 'task', 'problem', 'subject', 'object', 'triplet', 'state', 'art', 'performance', 'ability', 'importance', 'parameter', 'unlabeled', 'network', 'test', 'condition']
Wall time: 1min 47s


In [15]:
# Build the bigram and trigram models.
# The trigram model is trained on the bigram-merged documents, and only its
# threshold is set explicitly (min_count is left at the library default).
bigram_phrases = gensim.models.Phrases(token_data, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram_phrases = gensim.models.Phrases(bigram_phrases[token_data], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the global ``bigram`` model to each document; return a list of results."""
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the global ``bigram`` model, then ``trigram``, to each document in ``texts``."""
    merged = []
    for doc in texts:
        merged.append(trigram[bigram[doc]])
    return merged

data_bigrams = make_bigrams(token_data)
# make_trigrams applies the bigram model itself, so it takes the raw tokens.
# Feeding it data_bigrams would run the bigram pass a second time on
# documents that are already merged.
data_bigrams_trigrams = make_trigrams(token_data)

In [16]:
# Use TF-IDF to drop low-scoring terms from each document's bag of words.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus=corpus, id2word=id2word)

# Terms scoring below this TF-IDF value are removed from the document.
low_value = 0.03

# Every term dropped from any document, in encounter order.
words = []
words_missing_in_tfidf = []

for i, bow in enumerate(corpus):
    # Score the document once and reuse the result.
    scored = tfidf[bow]
    scored_ids = {term_id for term_id, _ in scored}

    low_value_words = [term_id for term_id, score in scored if score < low_value]
    # Terms present in the bag of words but absent from the TF-IDF output
    # (presumably zero-weight terms; verify against the gensim docs).
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in scored_ids]

    # Both lists now describe the current document. Previously the missing
    # list was read before it was recomputed, so `words` received the
    # previous document's missing terms.
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    dropped_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in dropped_ids]

In [18]:
%%time
# LDA Model
# Train a 6-topic model for 20 passes with a fixed random_state.
# `lda` is only an alias for the LdaMulticore class.
lda = gensim.models.LdaMulticore
lda_model = lda(corpus=corpus, id2word=id2word, random_state=100, num_topics=6, passes=20)

Wall time: 36.2 s


In [19]:
# Measures of how good the model is.

# NOTE(review): gensim documents log_perplexity as returning a per-word
# likelihood bound (perplexity = 2 ** -bound), so the number printed under
# the "Perplexity" label is higher-is-better, not lower-is-better.
# Confirm against the gensim docs. It is evaluated on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score: Higher the better.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_bigrams_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.8008626019172045

Coherence Score:  0.5421555749863282


In [20]:
# Visualize Topic - Word Distribution

# Enable inline rendering, build the visualisation data for the trained
# model, and display it as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds')
vis

  default_term_info = default_term_info.sort_values(


In [21]:
# Create Document - Topic Matrix

lda_output = []

for doc in lda_model[corpus]:
    # Size each row from the model rather than a literal 6, so changing
    # num_topics cannot silently break this cell. Topics the model does not
    # report for a document keep weight 0.
    arr = np.zeros(lda_model.num_topics)
    for topic_id, weight in doc:
        arr[topic_id] = weight
    lda_output.append(arr)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.num_topics)]

# index names (one per row actually produced above)
docnames = ["Doc" + str(i) for i in range(len(lda_output))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document (column position of the largest weight)
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, otherwise black."""
    shade = 'black'
    if val > .1:
        shade = 'green'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, otherwise 400."""
    if val > .1:
        return 'font-weight: {weight}'.format(weight=700)
    return 'font-weight: {weight}'.format(weight=400)

# Apply Style
# Style the first 15 rows; both rules run on every cell, including the
# dominant_topic column.
# NOTE(review): newer pandas releases deprecate Styler.applymap in favour of
# Styler.map — confirm against the installed version.
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.36,0.23,0.37,0.04,0.0,0.0,2
Doc1,0.0,0.0,0.67,0.0,0.0,0.31,2
Doc2,0.0,0.0,0.0,0.0,0.0,0.97,5
Doc3,0.07,0.0,0.47,0.23,0.0,0.22,2
Doc4,0.07,0.3,0.63,0.0,0.0,0.0,2
Doc5,0.0,0.0,0.0,0.05,0.0,0.94,5
Doc6,0.88,0.0,0.0,0.0,0.0,0.1,0
Doc7,0.0,0.0,0.0,0.98,0.0,0.0,3
Doc8,0.15,0.0,0.0,0.84,0.0,0.0,3
Doc9,0.67,0.0,0.0,0.16,0.0,0.15,0


In [22]:
# Create Document - Topic Distribution

# Count the documents per dominant topic, then relabel the two columns.
df_topic_distribution = (
    df_document_topic['dominant_topic']
    .value_counts()
    .reset_index(name="Num Documents")
)
df_topic_distribution.columns = ['Topic No.', 'No of Documents']
df_topic_distribution

Unnamed: 0,Topic No.,No of Documents
0,2,5969
1,3,4076
2,5,3996
3,1,2722
4,4,2678
5,0,1531


In [21]:

# Number of Passes Range
# Candidate values 10, 20, ..., 100 (the stop value 110 is exclusive).
pass_range = np.arange(10,110,10)

In [22]:
%%time
# Tuning 01: Optimum Passes Test
# Column-oriented accumulator: one list per column of the CSV written below.
model_results_runs = {'Runs': [],'Perplexity': [],'Coherence': []}

def lda_run(p):
    """Train a 6-topic LdaMulticore model with ``p`` passes and score it.

    Reads the globals ``corpus``, ``id2word`` and ``data_bigrams_trigrams``.

    Args:
        p: Number of training passes.

    Returns:
        A ``(log_perplexity(corpus), c_v coherence)`` tuple.
    """
    model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=100,
        num_topics=6,
        alpha='asymmetric',
        eta=0.91,
        passes=p,
    )
    scorer = CoherenceModel(
        model=model,
        texts=data_bigrams_trigrams,
        dictionary=id2word,
        coherence='c_v',
    )
    return model.log_perplexity(corpus), scorer.get_coherence()
    
# Train and score one model per candidate pass count, then persist the table.
for n_passes in pass_range:
    perplexity, coherence = lda_run(n_passes)
    row = {'Runs': n_passes, 'Perplexity': perplexity, 'Coherence': coherence}
    for column, value in row.items():
        model_results_runs[column].append(value)

pd.DataFrame(model_results_runs).to_csv('lda_tuning_passes.csv', index=False)

Wall time: 8min 48s


In [23]:
%%time
# Tuning 02: Optimum Alpha and Beta Values Test
## Multicore Test
### Supporting function for Multicore LDA:
def compute_coherence_values_multicore(corpus, dictionary, k, a, b):
    """Train one LdaMulticore model and return its c_v coherence.

    The tokenised texts used for scoring come from the global
    ``data_bigrams_trigrams``.

    Args:
        corpus: Bag-of-words corpus passed to the model.
        dictionary: id-to-word mapping used for both training and scoring.
        k: Number of topics.
        a: Value passed as the model's ``alpha``.
        b: Value passed as the model's ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the dictionary that was passed in (the one the model was
    # trained with) instead of silently reading the global id2word.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_bigrams_trigrams,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

# Main Tuning(Multicore)
#%%time
import numpy as np
import tqdm
# Kept from the original cell; not read by any cell shown in this section.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 6
max_topics = 7
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run.
# The bar length is derived from the grid itself. The previous literal 60 was
# twice the 1 x 6 x 5 = 30 combinations actually run, so the bar stopped at 50%.
n_runs = len(topics_range) * len(alpha) * len(beta)
# The context manager closes the bar even if a run raises.
with tqdm.tqdm(total=n_runs) as pbar:
    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values_multicore(corpus=corpus, dictionary=id2word, k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)

# NOTE(review): the file name says 20 passes, but the helper trains with
# passes=10 — confirm which one is intended.
pd.DataFrame(model_results).to_csv('lda_tuning_multicore_20passes.csv', index=False)

 50%|█████     | 30/60 [13:34<13:34, 27.15s/it]

Wall time: 13min 34s





In [24]:
%%time
# Tuning 03: Optimum Alpha and Beta Values Test
## Singlecore Test
### Supporting function for Singlecore LDA:
def compute_coherence_values_singlecore(corpus, dictionary, k, a, b):
    """Train one single-core LdaModel and return its c_v coherence.

    The tokenised texts used for scoring come from the global
    ``data_bigrams_trigrams``.

    Args:
        corpus: Bag-of-words corpus passed to the model.
        dictionary: id-to-word mapping used for both training and scoring.
        k: Number of topics.
        a: Value passed as the model's ``alpha``.
        b: Value passed as the model's ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                passes=10,
                                                alpha=a,
                                                eta=b)

    # Score with the dictionary that was passed in (the one the model was
    # trained with) instead of silently reading the global id2word.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_bigrams_trigrams,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

# Main Function
#%%time
import numpy as np
import tqdm
# Kept from the original cell; not read by any cell shown in this section.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Not read by the loop below; kept so the cell leaves the same names defined.
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run.
# The bar length is derived from the grid (9 x 6 x 5 = 270 with the values
# above), so it stays correct when any of the ranges is edited.
n_runs = len(topics_range) * len(alpha) * len(beta)
# The context manager closes the bar even if a run raises.
with tqdm.tqdm(total=n_runs) as pbar:
    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values_singlecore(corpus=corpus, dictionary=id2word, k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_singlecore.csv', index=False)