## Necessary installation and download instructions ref ReadMe file

In [1]:
# Importing essential libraries
## Base
import numpy as np
import pandas as pd

## String pre-processing
import re, string
from nltk.corpus import stopwords
import spacy
import gensim

2023-07-12 01:02:35.953617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [2]:
# Import sampled data
uc_df = pd.read_csv('./Data/strat-sampled-dataset.csv', index_col= 'ID')
uc_df.head()

Unnamed: 0_level_0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
5,Mean Reverting Portfolios via Penalized OU-Lik...,We study an optimization-based approach to c...,0,0,0,0,0,1


In [3]:
# Check for dataset dimension, null values and column types
print(f'Dataset Dimensions: {uc_df.shape}')
print(f'Column Information:\n')
display(uc_df.info())

Dataset Dimensions: (1499, 8)
Column Information:

<class 'pandas.core.frame.DataFrame'>
Index: 1499 entries, 1 to 1499
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   TITLE                 1499 non-null   object
 1   ABSTRACT              1499 non-null   object
 2   Computer Science      1499 non-null   int64 
 3   Physics               1499 non-null   int64 
 4   Mathematics           1499 non-null   int64 
 5   Statistics            1499 non-null   int64 
 6   Quantitative Biology  1499 non-null   int64 
 7   Quantitative Finance  1499 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 105.4+ KB


None

In [4]:
# Checking count of each category in train set
sample_count = [uc_df[col].sum() for col in uc_df.columns[2::]]    
sc_df = pd.DataFrame(sample_count, columns=['Counts'], index=['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Biology', 'Finance'])
sc_df

Unnamed: 0,Counts
Computer Science,250
Physics,250
Mathematics,250
Statistics,250
Biology,250
Finance,249


In [5]:
# Extracting Abstracts for pre-processing
# From here on we will only model on Abstracts
pre_df = pd.DataFrame(uc_df['ABSTRACT'])
pd.set_option('max_colwidth', 150)
pre_df.head()

Unnamed: 0_level_0,ABSTRACT
ID,Unnamed: 1_level_1
1,"Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inf..."
2,"Rotation invariance and translation invariance have great values in image\nrecognition tasks. In this paper, we bring a new architecture in conv..."
3,"We introduce and develop the notion of spherical polyharmonics, which are a\nnatural generalisation of spherical harmonics. In particular we stu..."
4,The stochastic Landau--Lifshitz--Gilbert (LLG) equation coupled with the\nMaxwell equations (the so called stochastic MLLG system) describes the...
5,We study an optimization-based approach to con- struct a mean-reverting\nportfolio of assets. Our objectives are threefold: (1) design a portfol...


In [6]:
# Pre-Processing (String cleaning)

def dataCleaning(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\n', ' ', text)
    return text

df = pd.DataFrame(pre_df.ABSTRACT.apply(dataCleaning))
df.head()

Unnamed: 0_level_0,ABSTRACT
ID,Unnamed: 1_level_1
1,predictive models allow subject specific inference when analyzing disease related alterations in neuroimaging data given a subject s data infe...
2,rotation invariance and translation invariance have great values in image recognition tasks in this paper we bring a new architecture in convo...
3,we introduce and develop the notion of spherical polyharmonics which are a natural generalisation of spherical harmonics in particular we stud...
4,the stochastic landau lifshitz gilbert llg equation coupled with the maxwell equations the so called stochastic mllg system describes the ...
5,we study an optimization based approach to con struct a mean reverting portfolio of assets our objectives are threefold design a portfolio...


In [7]:
# Remove Stopwords
stop_words = stopwords.words('english')

rem_words = ['new', 'g', 'result', 'application', 'many', 'type', 'paper', 'effect', 'term', 'positive', 'weak', 'model', 'models', 'method', 'time', 'approach', 'datum', 'data', 'value', 'number', 'non', 'term', 'large', 'case', 'study', 'high', 'system', 'space', 'p', 'n', 'low', 'show', 'form', 'work', 'first', 'simple']
stop_words += rem_words
#print(stop_words)

def remove_stopwords(text):
    textArr = text.split(' ')
    remText = ' '.join(i for i in textArr if i not in stop_words)
    return remText

df['ABSTRACT'] = df['ABSTRACT'].apply(remove_stopwords)

df.head()

Unnamed: 0_level_0,ABSTRACT
ID,Unnamed: 1_level_1
1,predictive allow subject specific inference analyzing disease related alterations neuroimaging given subject inference made two levels global...
2,rotation invariance translation invariance great values image recognition tasks bring architecture convolutional neural network cnn named cy...
3,introduce develop notion spherical polyharmonics natural generalisation spherical harmonics particular theory zonal polyharmonics allows us ...
4,stochastic landau lifshitz gilbert llg equation coupled maxwell equations called stochastic mllg describes creation domain walls vortices ...
5,optimization based con struct mean reverting portfolio assets objectives threefold design portfolio well represented ornstein uhlenbeck pr...


In [8]:
# Lemmatization and Part of Speech Tagging Function
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    output = []
    for text in texts:
        doc = nlp(text)
        output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return output

data_list = df['ABSTRACT'].tolist()

token_data = lemmatization(data_list)

In [9]:
# Build the bigram and trigram models
bigram_phrases = gensim.models.Phrases(token_data, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram_phrases = gensim.models.Phrases(bigram_phrases[token_data], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return(list(bigram[doc] for doc in texts))

def make_trigrams(texts):
    return(list(trigram[bigram[doc]] for doc in texts))

data_bigrams = make_bigrams(token_data)
data_bigrams_trigrams = make_trigrams(data_bigrams)

In [10]:
# Using TF-IDF to remove low frequency words
id2word = gensim.corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = gensim.models.TfidfModel(corpus=corpus, id2word=id2word)

#high_value = 0.09
low_value = 0.03

words = []
words_missing_in_tfidf =[]

for i in range(0, len(corpus)):
    bow=corpus[i]
    low_value_words=[]
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids= [id for id, value in bow]
    #high_value_words = [id for id,value in tfidf[bow] if value > high_value]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    
    corpus[i]=new_bow

In [11]:
# LDA Model Build
lda = gensim.models.LdaMulticore
lda_model = lda(corpus=corpus, id2word=id2word, random_state=100, num_topics=6, passes=50)



In [12]:
# Measure's of how good the model is.
## Compute Perplexity: Lower the better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

## Compute Coherence Score: Higher the better.
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=data_bigrams_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.023405395906819

Coherence Score:  0.37113021371532345


In [13]:
# pyLDAvis prereqs libraries
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel

In [14]:
# Visualization of topic space using pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds')
vis



In [15]:
# Create Document - Topic Matrix
lda_output = []

for doc in lda_model[corpus]:
    arr = np.zeros(6)
    for topic in doc:
        arr[topic[0]] = topic[1]
    lda_output.append(arr)   

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.num_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(pre_df))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.0,0.0,0.99,0.0,0.0,0.0,2
Doc1,0.0,0.78,0.0,0.2,0.0,0.0,1
Doc2,0.0,0.0,0.0,0.0,0.98,0.0,4
Doc3,0.0,0.0,0.0,0.98,0.0,0.0,3
Doc4,0.98,0.0,0.0,0.0,0.0,0.0,0
Doc5,0.0,0.0,0.0,0.0,0.0,0.98,5
Doc6,0.0,0.0,0.0,0.0,0.98,0.0,4
Doc7,0.0,0.0,0.0,0.0,0.0,0.99,5
Doc8,0.44,0.0,0.55,0.0,0.0,0.0,2
Doc9,0.98,0.0,0.0,0.0,0.0,0.0,0


In [16]:
# Create Document - Topic Distribution
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic No.', 'No of Documents']
df_topic_distribution

Unnamed: 0,Topic No.,No of Documents
0,1,303
1,4,297
2,2,277
3,0,272
4,5,189
5,3,161
