In [1]:

# We install postgres and its dev tools
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql postgresql-server-dev-all

#  Start postgres
!sudo service postgresql start

# Create user, password, and db!
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS structdb;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE structdb;'

!git clone --recursive https://github.com/lanterndata/lantern.git
%cd lantern/
!mkdir build
%cd build
!cmake ..
!sudo make install

%cd ../../

!pip install sentence-transformers==2.2.2 psycopg2-binary cohere openai langchain tiktoken faiss-gpu PyPDF2 pandas paperscraper

 * Starting PostgreSQL 14 database server
   ...done.
ALTER ROLE
DROP DATABASE
CREATE DATABASE
fatal: destination path 'lantern' already exists and is not an empty directory.
/content/lantern
mkdir: cannot create directory ‘build’: File exists
/content/lantern/build
  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- Build type: 
-- Found pg_config as /usr/bin/pg_config
-- Found postgres binary at /usr/lib/postgresql/14/bin/postgres
-- PostgreSQL version PostgreSQL 14.9 (Ubuntu 14.9-0ubuntu0.22.04.1) found
-- PostgreSQL package library directory: /usr/lib/postgresql/14/lib
-- PostgreSQL libraries: -lpgcommon -lpgport -lselinux -llz4 -lxslt -lxml2 -lpam -lssl -lcrypto -lgssapi_krb5 -lz -lreadline -lm
-- PostgreSQL extension directory: /usr/share/postgresql/14/extension
-- PostgreSQL linker options: -

In [6]:
class Fragment:
    id = ""
    header = ""
    content = ""
    vector = ""

    def __init__(self, id, header, content, vector):
        self.id = id
        self.header = header
        self.content = content
        self.vector = vector

In [7]:

class Publication:

    id = ""
    title = ""
    pmc = ""
    pubmed = ""
    doi = ""

    def __init__(self, id, title, pmc, pubmed, doi):
        self.id = id
        self.title = title
        self.pmc = pmc
        self.pubmed = pubmed
        self.doi = doi

In [8]:
import psycopg2

class Lantern:
    conn = ""

    def __init__(self, database="structdb"):
        self.conn = self.connect(database)
        self.createTables()


    def connect(self, database="structdb"):
        # We use the dbname, user, and password that we specified above
        conn = psycopg2.connect(
            dbname=database,
            user="postgres",
            password="postgres",
            host="localhost",
            port="5432" # default port for Postgres
        )

        cursor = conn.cursor()
        # Execute the query to load the Lantern extension in
        cursor.execute("CREATE EXTENSION IF NOT EXISTS lantern;")

        conn.commit()
        cursor.close()

        return conn


    def createTables(self):
        self.createFragmentTable()
        self.createPublicationTable()
        self.createUnreadTable()

    def createFragmentTable(self):
        conn = self.conn
        # Create the table
        cursor = conn.cursor()

        create_table_query = "CREATE TABLE IF NOT EXISTS fragments (id text, header text, content text, vector real[]);"

        cursor.execute(create_table_query)

        conn.commit()
        cursor.close()

    def createPublicationTable(self):
        conn = self.conn
        cursor = conn.cursor()

        create_table_query = "CREATE TABLE IF NOT EXISTS publications (id text PRIMARY KEY, title text, pmc text, pubmed text, doi text);"

        cursor.execute(create_table_query)

        conn.commit()
        cursor.close()

    def createUnreadTable(self):
        conn = self.conn
        cursor = conn.cursor()

        create_table_query = "CREATE TABLE IF NOT EXISTS unread (id text PRIMARY KEY);"
        cursor.execute(create_table_query)

        conn.commit()
        cursor.close()

    def insertEmbedding(self, fragment: Fragment):
        conn = self.conn
        cursor = conn.cursor()

        cursor.execute("INSERT INTO fragments (id, header, content, vector) VALUES (%s, %s, %s, %s);", (fragment.id, fragment.header, fragment.content, fragment.vector))
        cursor.execute("CREATE INDEX ON fragments USING hnsw (vector dist_cos_ops) WITH (dim=" + str(fragment.VECTOR_LENGTH) + ");")

        conn.commit()
        cursor.close()

    def insertEmbeddings(self, fragments: list):
        if (len(fragments) < 1):
            print("Empty List")
            return
        conn = self.conn
        cursor = conn.cursor()

        queries=[]
        for fragment in fragments:
            queries.append((fragment.id, fragment.header, fragment.content, fragment.vector))

        cursor.executemany("INSERT INTO fragments (id, header, content, vector) VALUES (%s, %s, %s, %s);", queries)
        cursor.execute("CREATE INDEX ON fragments USING hnsw (vector dist_cos_ops) WITH (dim=" + str(len(fragments[0].vector)) + ");")
        conn.commit()
        cursor.close()

    def insertPublication(self, p):
        conn = self.conn
        cursor = conn.cursor()

        cursor.execute("INSERT INTO publications (id, title, pmc, pubmed, doi) VALUES (%s, %s, %s, %s, %s);", (p.id, p.title, p.pmc, p.pubmed, p.doi))

        query='INSERT INTO unread (id) VALUES (\'{:s}\');'.format(p.id)
        cursor.execute(query)
        conn.commit()
        cursor.close()



    def getAllFragmentsOfPublication(self, id):
        conn = self.conn
        cursor = conn.cursor()

        query='SELECT * FROM fragments WHERE id=\'{:s}\';'.format(id)
        cursor.execute(query)
        fragments = cursor.fetchall()
        conn.commit()
        cursor.close()

        fragmentObjects = []
        for fragment in fragments:
            fragmentObjects.append(Fragment(id, fragment[1], fragment[2], fragment[3]))

        return fragmentObjects


    def getUnreadPublication(self):
        conn = self.conn
        cursor = conn.cursor()

        cursor.execute('SELECT * FROM publications AS p LEFT JOIN unread AS u ON u.id=p.id;')

        publications = cursor.fetchall()

        cursor.execute('DELETE FROM unread;')
        conn.commit()
        cursor.close()


        publicationObjects = []
        for p in publications:
            publicationObjects.append(Publication(p[0], p[1], p[2], p[3], p[4]))

        return publicationObjects

    def publicationExists(self, id):
        conn = self.conn
        cursor = conn.cursor()

        query='SELECT COUNT(*) FROM publications WHERE id=\'{:s}\''.format(id)
        cursor.execute(query)
        count = cursor.fetchone()
        conn.commit()
        cursor.close()

        return count[0] == 1



In [9]:
import os
import pandas as pd
import PyPDF2
from paperscraper.pdf import save_pdf
from paperscraper.get_dumps import biorxiv

import openai
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
import PyPDF2

# OpenAI Setup
OPEN_API_KEY = "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"
# openai.api_key = os.getenv(openai_api_key)
os.environ['OPENAI_API_KEY'] = OPEN_API_KEY

def scrapeBiorxiv(start, end, out_file):
    filepath=out_file
    biorxiv(begin_date=start, end_date=end, save_path=out_file)
    retreiveTextFromPdf(filepath)

def get_embeddings(fname):
    """
    """
    loader = TextLoader(fname)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    emb = OpenAIEmbeddings()
    input_texts = [d.page_content for d in docs]

    input_embeddings = emb.embed_documents(input_texts)
    text_embeddings = list(zip(input_texts, input_embeddings))
    return text_embeddings, emb

def retreiveTextFromPdf(inp_file):


    json = pd.read_json(path_or_buf=inp_file, lines=True)
    lantern = Lantern()

    for n, doi in enumerate(json['doi']):
        print(n, doi)


        ##NOTE: This is for example purpose only
        if n > 10:
            break

        if lantern.publicationExists(doi):
            continue

        paper_data = {'doi': doi}
        doi = doi.replace("/", "-")
        pdf_dir = './papers/'
        if not os.path.exists(pdf_dir):
            os.mkdir(pdf_dir)

        pdfsavefile='./papers/' + doi +'.pdf'
        save_pdf(paper_data, filepath=pdfsavefile)

        # creating a pdf reader object
        reader = PyPDF2.PdfReader(pdfsavefile)
        save_txt_path = 'scrapped_txts/'
        if not os.path.exists(save_txt_path):
            os.mkdir(save_txt_path)
        extract_text = ''
        for page in reader.pages:
            extract_text+=page.extract_text()

        txt_file = str('{}.txt'.format(doi))
        with open(save_txt_path+txt_file, 'w') as file:
            file.write(extract_text)


        txt_embs, emb = get_embeddings(save_txt_path+txt_file)

        fragments = []
        for txt, embs in txt_embs:
            fragment = Fragment(doi, 'methods', txt, embs)
            fragments.append(fragment)

        title = ""
        pmc = ""
        pubmed = ""

        publication = Publication(doi, title, pmc, pubmed, doi)

        lantern.insertEmbeddings(fragments)
        lantern.insertPublication(publication)

        os.remove(pdfsavefile)

start_date = "2023-10-30"
end_date = "2023-10-31"
out_file = "bio.jsonl"

scrapeBiorxiv(start_date, end_date, out_file)


280it [00:16, 17.14it/s]


0 10.1101/2020.10.23.351742
1 10.1101/2021.05.29.446196
2 10.1101/2021.08.20.457147
3 10.1101/2022.01.31.478514
4 10.1101/2022.02.13.480270
5 10.1101/2022.03.14.484202
6 10.1101/2022.04.10.487810




7 10.1101/2022.04.28.489897
8 10.1101/2022.05.09.491115




9 10.1101/2022.05.28.493856
10 10.1101/2022.06.08.495145
11 10.1101/2022.06.30.498314


In [10]:
!sudo -u postgres pg_dump structdb > structdb.sql