In [1]:
# REQUIRED IMPORTS
from bs4 import BeautifulSoup
import requests, json, os
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import pprint
import gensim
import PyPDF2
import os, math
from tabulate import tabulate

In [2]:
#SPARK RELATED CODE
import findspark

findspark.init()

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.mllib.feature import Word2Vec

conf = SparkConf()

spark = SparkSession \
    .builder \
    .appName('Sparkler') \
    .getOrCreate()

spark.conf.set('spark.sql.caseSensitive', False)

In [3]:
# GENSIM RELATED IMPORTS
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS


In [4]:
# GLOBAL VARIABLES
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.52",
}

params = {
    "q": "",
    "hl": "en",
    "start": 0
}

directory_path = 'DocumentDb'

In [5]:
def word2VecImplementation(cleandataset):
    docTokensCollection = spark.sparkContext.parallelize(cleandataset).map(lambda line: line.split(" "))
    word2Vec = Word2Vec().setVectorSize(10).setMinCount(6).setSeed(42)
    model = word2Vec.fit(docTokensCollection)
    for i in params['q'].replace(',',' ').split(" "):
        try:
            res = model.findSynonyms(i, 3)
            print('Words similar to {} (with respect to other words) are as follows - '.format(i))
            print(tabulate(res, headers=["Word", "Similarity"], tablefmt='grid'))    
        except:
            pass

In [None]:
# download the file and save in DocumentDb folder
def downloadFile(link):
    

In [6]:
#Similar Articles
def getSimilarArticles(df, vectorizer, link): 
    print('------------- TF-IDF FOR CALCULATING THE SIMILAR DOCUMENTS FOR THE KEYWORDS -------------')
    q = [params['q']]
    q_vector = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}

    for i in range(df.shape[1]):
        sim[i] = np.dot(df.loc[:, i].values, q_vector) / np.linalg.norm(df.loc[:,i]) * np.linalg.norm(q_vector)
        
    sim_sorted = sorted(sim.items(), key = lambda x:x[1], reverse = True)
    print('Top 5 documents that are most relevant to the keywords entered are -')
    for k,v in sim_sorted[:5]:
        if v != 0.0 and math.isnan(v)==False:
            print("Similarities: {}".format(v))
            print('Link to the article:', link[k])
            if('https' in link[k]):
                downloadFile(link[k])

In [7]:
def vectorizerMethod(documents_clean):
    # Instantiate a TfIdfVectorizer Object and transform the data to vectors
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents_clean)
    X = X.T.toarray()
    df = pd.DataFrame(X, index=vectorizer.get_feature_names_out())    
    return(df, vectorizer)

In [8]:
def cleanData(documents):
    print('------------- DATA PREPROCESSING -------------')
    print('Cleaning data for removing any unicodes, mentions, punctuations, double spaces, stopwords.\nConverting the data to lower case.')
    documents_clean = []
    for d in documents:
        # Remove Unicode
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
        # Remove Mentions
        document_test = re.sub(r'@\w+', '', document_test)
        # Lowercase the document
        document_test = document_test.lower()
        # Remove punctuations
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        # Lowercase the numbers
        document_test = re.sub(r'[0-9]', '', document_test)
        # Remove the doubled space
        document_test = re.sub(r'\s{2,}', ' ', document_test)
        documents_clean.append(remove_stopwords(document_test))
    print('Cleaned the data.')
    return(documents_clean)

In [9]:
def collectDocumentFromLinks(link):
    print('------------- COLLECTING ABSTRACT DATA FOR EACH SPRINGER LINK -------------')
    #Retrieve paragrahs from each link, combine each paragrah as a string and save it to docs
    documents = []
    for i in link:
        r = requests.get(i, headers=headers)
        soup = BeautifulSoup(r.content,'html.parser')

        sen = []
        # for springer abstracts
        if soup.find('div', {'id':'Abs1-content'}):
            for i in soup.find('div', {'id':'Abs1-content'}).find_all('p'):
                sen.append(i.text)
        documents.append(' '.join(sen))
    print('Collected abstract data from {} springer links fetched earlier'.format(len(documents)))
    return(documents)

In [10]:
def collectSpringerLinks(soup):
    print('------------- COLLECTING RELEVANT SPRINGER LINKS FOR THE QUERY -------------')
    #Retrieve all popular new links
    link = []
    i=0
    for i in range(0, len(soup)):
        data = soup[i].find_all("div", {"class": "gs_ri"})
        for j in range(len(data)):    
            temp = data[j].find('a')
            if 'link.springer.com/article' in temp['href'] and 'books.google.com' not in temp['href']:
                link.append(temp['href'])
    if len(link)>0:
        print('Collected {} links'.format(len(link)))
        return(link)
    else: 
        print('No springer links found. Try with other keywords.')

In [11]:
def collectGoogleScholarPages(query):
    print('------------- VISITING GOOGLE SCHOLAR PAGES TO FETCH URLS -------------')
    params['q'] = query
    soup = []
    while True:
        url = 'https://scholar.google.com/scholar'
        req = requests.get(url, headers=headers, params=params)
        print(req.url)
        tempData = BeautifulSoup(req.content,'html.parser')
        soup.append(tempData)
        if tempData.find('span', {'class': 'gs_ico gs_ico_nav_next'}) and params['start']<100:
            params['start']+=10
        else:
            break
    print('Visited {} google scholar pages'.format(int(params['start'])/10))
    return soup

In [12]:
###Read PDFs(Papers) from Document Database
def readdocumentdb():
    DE = []
    for filename in os.listdir(directory_path):
        if filename[-4:] == '.pdf':
            filepath = directory_path+'/'+filename
            pdfFileObj = open(filepath, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            numofpages = pdfReader.numPages
            pageObj = pdfReader.getPage(0)
            text = pageObj.extractText()

            try:
                abstractindex = re.search(r'Abstract', text, re.IGNORECASE)
                introindex = re.search(r'(I. )?(1. )?I*ntroduction', text, re.IGNORECASE)
                abstractend = 0
                introstart = 0

                if abstractindex:
                    abstractend = abstractindex.end()
                if introindex:
                    introstart = introindex.start()

                if introstart == 0:
                    paperAbstract = text[abstractend:]
                else:
                    paperAbstract = text[abstractend:introstart]
            except:
                paperAbstract = text

            paperAbstract = re.sub(r"^\W+", "", paperAbstract.strip())
            pdfFileObj.close()
            A = [filepath, paperAbstract]
            DE.append(A)

    DocumentsExtract = pd.DataFrame(DE)
    DocumentsExtract.columns = ['DocLink', 'Abstract']
    return(DocumentsExtract)

In [13]:
# ##Reading google scholar
# soup = collectGoogleScholarPages(keywords)
# links = collectSpringerLinks(soup)
# docs = collectDocumentFromLinks(links)
# cdocs = cleanData(docs)
# features, vector = vectorizerMethod(cdocs)
# getSimilarArticles(features, vector, links)



# dbdocs = readdocumentdb(directory_path)
# dbdocsabstract = dbdocs["Abstract"].to_numpy().tolist()
# dbdocslinks = dbdocs["DocLink"].to_numpy().tolist()
# cleanabstractdata = cleanData(dbdocsabstract)
# featuredocs, vectordocs = vectorizerMethod(cleanabstractdata)
# getSimilarArticles(featuredocs, vectordocs, dbdocslinks)

# Entry point of the code

keywords = input("Enter keywords on which you want to search documents:")


##Reading google scholar
soup = collectGoogleScholarPages(keywords)
links = collectSpringerLinks(soup)
docs = collectDocumentFromLinks(links)

##read all the documents from db
dbdocs = readdocumentdb()
dbdocsabstract = dbdocs["Abstract"].to_numpy().tolist()
dbdocslinks = dbdocs["DocLink"].to_numpy().tolist()

combinedlinks = links + dbdocslinks
combineddocs = docs + dbdocsabstract

cleandataset = cleanData(combineddocs)
featureds, vectords = vectorizerMethod(cleandataset)
getSimilarArticles(featureds, vectords, combinedlinks)
word2VecImplementation(cleandataset)



Enter keywords on which you want to search documents:brain tumor cancer oncology checmotherapy
------------- VISITING GOOGLE SCHOLAR PAGES TO FETCH URLS -------------
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=0
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=10
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=20
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=30
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=40
https://scholar.google.com/scholar?q=brain+tumor+cancer+oncology+checmotherapy&hl=en&start=50
Visited 5.0 google scholar pages
------------- COLLECTING RELEVANT SPRINGER LINKS FOR THE QUERY -------------
Collected 3 links
------------- COLLECTING ABSTRACT DATA FOR EACH SPRINGER LINK -------------
Collected abstract data from 3 springer links fetched earlier


Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
  sim[i] = np.dot(df.loc[:, i].values, q_vector) / np.linalg.norm(df.loc[:,i]) * np.linalg.norm(q_vector)


------------- DATA PREPROCESSING -------------
Cleaning data for removing any unicodes, mentions, punctuations, double spaces, stopwords.
Converting the data to lower case.
Cleaned the data.
------------- TF-IDF FOR CALCULATING THE SIMILAR DOCUMENTS FOR THE KEYWORDS -------------
Top 5 documents that are most relevant to the keywords entered are -
Similarities: 0.34339633388628565
Link to the article: DocumentDb/BrainTumor-2.pdf
Similarities: 0.34339633388628565
Link to the article: DocumentDb/BrainTumor-6.pdf
Similarities: 0.2404778107666957
Link to the article: DocumentDb/BrainTumor-5.pdf
Similarities: 0.23286722811238117
Link to the article: DocumentDb/Oncology-1.pdf
Words similar to brain (with respect to other words) are as follows - 
+--------------+--------------+
| Word         |   Similarity |
| tumor        |     0.960456 |
+--------------+--------------+
| image        |     0.955047 |
+--------------+--------------+
| segmentation |     0.947331 |
+--------------+----------