In [14]:
import findspark
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
import json
import nltk 
import string
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim import corpora, models, similarities

[nltk_data] Downloading package punkt to /home/celwer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/celwer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Arxiv Research paper data with Pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('arxiv').getOrCreate()

23/10/06 15:30:28 WARN Utils: Your hostname, celwer-XPS-13-7390-2-in-1 resolves to a loopback address: 127.0.1.1; using 192.168.1.29 instead (on interface wlp0s20f3)
23/10/06 15:30:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/06 15:30:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.option("multiline","false") \
      .json("./arxiv-metadata-oai-snapshot.json")
df.printSchema()


                                                                                

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



In [5]:
# filtered_category = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'cs.NE', 'cs.RO']
filtered_category = ['cs.LG', 'cs.NE']
df_filtered=df.filter(df.categories.isin(filtered_category))


In [6]:
df_filtered.select('categories').count()

                                                                                

15917

In [7]:
json_data = df_filtered.toPandas()


                                                                                

### Clean abstracts with stemming, lemmatization, removing stop words

In [8]:

def clean_abstracts:
    all_tokens = []
    porter_stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    for index,row in json_data.iterrows():
        sample_string = row['abstract']
    #     print(sample_string)
        sample_string = re.sub(r'\d+', '', sample_string)
    #     print(sample_string)
        sample_string = "".join([char.lower() for char in sample_string if char not in string.punctuation])
    #     print(sample_string)
        tokens = word_tokenize(sample_string)
        stem_tokens = []
        for token in tokens:
            stem_token = porter_stemmer.stem(token)
            if stem_token in stopwords.words('english'):
                continue
            # print(stem_token)
            stem_tokens.append(stem_token)
        all_tokens.append(stem_tokens)
    return all_tokens




### Create tfidf model 

In [None]:
def tfidf_model(all_tokens):
    dictionary = corpora.Dictionary(all_tokens)
    dictionary.save('/tmp/hep-th.dict')

    raw_corpus = [dictionary.doc2bow(token) for token in all_tokens]
    corpora.MmCorpus.serialize('/tmp/hep-th.mm', raw_corpus)

    dictionary = corpora.Dictionary.load('/tmp/hep-th.dict')
    corpus = corpora.MmCorpus('/tmp/hep-th.mm')

    tfidf = models.TfidfModel(corpus)

    corpus_tfidf = tfidf[corpus]
    
    return(corpus_tfidf)



### provide top 10 recommendations

In [None]:
#given the tfidf model output, provide the top most similar documents to a specific document indicated by doc_ind
def most_similar(corpus_tfidf, num_recs, doc_ind):
    sim_tfidf = similarities.MatrixSimilarity(corpus_tfidf)
    sims = sim_tfidf[corpus_tfidf]
    sort_ind = np.argsort(sims[doc_ind])
    recs = sort_ind[:-1][:-(num_recs+1):-1]
    return(recs)

In [15]:
sims[0]

array([1.        , 0.00533727, 0.01717856, ..., 0.01486367, 0.02324457,
       0.01561839], dtype=float32)

In [10]:
sort_ind = np.argsort(sims[0])

In [11]:
sort_ind

array([ 4648,  5197,  9082, ..., 10149, 12018,     0])

In [12]:
#top 10 most similar
num_recs = 10
recs = sort_ind[:-1][:-(num_recs+1):-1]

In [16]:
json_data.iloc[recs, :]['abstract']

12018      Self-training (ST), or pseudo-labeling has s...
10149      Prediction of protein-ligand (PL) binding af...
12454      Partial Label (PL) learning refers to the ta...
12556      It is widely believed that given the same la...
3280       Gradient Boosted Decision Trees (GBDT) is a ...
3842       Convolutional Neural Networks (CNNs) provide...
6497       We introduce a method combining variational ...
14567      Bayesian Optimization (BO) is typically used...
6591       Partial-label learning (PLL) utilizes instan...
10585      Semi-supervised learning is a critical tool ...
Name: abstract, dtype: object

In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

In [29]:

def most_similar(df,doc_id,similarity_matrix,matrix):
    print (f'Document: {df.iloc[doc_id]["abstract"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {df.iloc[ix]["abstract"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')
    return(similar_ix)



In [26]:
# sample = df_filtered.limit(20).toPandas()


### Load Roberta Transformer model, create embeddings and similarity matrix

In [None]:
sbert_model = SentenceTransformer('all-distilroberta-v1')

document_embeddings = sbert_model.encode(json_data['abstract'])

pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)


### Top 10 recommendations

In [None]:
top_10 = most_similar(json_data,0,pairwise_similarities,'Cosine Similarity')