## Read all papers from database, joining authors and topics

In [1]:
import sqlite3

from pathlib import Path
pd = Path().resolve().parent.parent
database = str(pd) + "/data/generated/database/database.sqlite"
 
# create a database connection
conn = sqlite3.connect(database)

d = {}

with conn:
    cur = conn.cursor()
    cur.execute("SELECT id, year, title, paper_text FROM papers")
 
    rows = cur.fetchall()
    
    for id, year, title, paper_text in rows:
        d[id] = [year, title, paper_text]
        
    for key in d: 
        cur.execute("SELECT A.name FROM authors A JOIN paper_authors PA ON A.id=PA.author_id WHERE PA.paper_id=?", (key,))
        authorRows = cur.fetchall()
        authorsString = ""
        for author in authorRows:
            authorsString += str(author[0]) + ", "
        d[key].extend([authorsString])
        
        cur.execute("SELECT T.name FROM topics T JOIN paper_topic PT ON T.id=PT.topic_id WHERE PT.paper_id=?", (key,))
        topicRows = cur.fetchall()
        topic = str(topicRows[0][0])
        d[key].extend([topic])
        
papersArray = []
for key in d: papersArray.extend([d[key][2]])

## Cleaning and Preprocessing

### Removing stopword and lemmatizing

In [2]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string


stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer() 

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalizedFromNouns = " ".join(lemma.lemmatize(word,'n') for word in punc_free.split())
    normalizedFromVerbs = " ".join(lemma.lemmatize(word,'v') for word in normalizedFromNouns.split())
    return normalizedFromVerbs


papersArrayClean = [clean(paper).split() for paper in papersArray]
print(papersArrayClean[0])

['767', 'selforganization', 'associative', 'database', 'application', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', '560', 'japan', 'abstract', 'efficient', 'method', 'selforganizing', 'associative', 'database', 'propose', 'together', 'application', 'robot', 'eyesight', 'system', 'propose', 'database', 'associate', 'input', 'output', 'first', 'half', 'part', 'discussion', 'algorithm', 'selforganization', 'propose', 'aspect', 'hardware', 'produce', 'new', 'style', 'neural', 'network', 'latter', 'half', 'part', 'applicability', 'handwritten', 'letter', 'recognition', 'autonomous', 'mobile', 'robot', 'system', 'demonstrate', 'introduction', 'let', 'map', 'f', 'x', 'give', 'here', 'x', 'finite', 'infinite', 'set', 'another', 'finite', 'infinite', 'set', 'learn', 'machine', 'observe', 'set', 'pair', 'x', 'y', 'sample', 'randomly', 'x', 'x', 'y', 'x', 'x', 'mean', 'cartesian', 'product', 'x', 'y', 'and', 'compute', 'estimate', 'j', 'x', 'f', 'make', 's

### Removing punctuation and formulas

In [9]:
papersArrayCleanNoNumbers = []

# removing numbers 
for doc in papersArrayClean:
    numbers = list()
    for word in doc:
            if word.isnumeric(): numbers.append(word)
    numbers = list(numbers)
    for n in numbers:
        doc.remove(n)
    papersArrayCleanNoNumbers.append(doc)
        

papersArrayCleanNoLetter = []

# removing words with less than 4 characters 
for doc in papersArrayCleanNoNumbers:
    letters = list()
    for word in doc:
            if (len(word)<=3 ): letters.append(word)
    letters = list(letters)
    for n in letters:
        doc.remove(n)
    papersArrayCleanNoLetter.append(doc)

print(papersArrayCleanNoLetter[0])

['selforganization', 'associative', 'database', 'application', 'hisashi', 'suzuki', 'arimoto', 'osaka', 'university', 'osaka', 'japan', 'abstract', 'efficient', 'method', 'selforganizing', 'associative', 'database', 'propose', 'together', 'application', 'robot', 'eyesight', 'system', 'propose', 'database', 'associate', 'input', 'output', 'first', 'half', 'part', 'discussion', 'algorithm', 'selforganization', 'propose', 'aspect', 'hardware', 'produce', 'style', 'neural', 'network', 'latter', 'half', 'part', 'applicability', 'handwritten', 'letter', 'recognition', 'autonomous', 'mobile', 'robot', 'system', 'demonstrate', 'introduction', 'give', 'here', 'finite', 'infinite', 'another', 'finite', 'infinite', 'learn', 'machine', 'observe', 'pair', 'sample', 'randomly', 'mean', 'cartesian', 'product', 'compute', 'estimate', 'make', 'small', 'estimation', 'error', 'measure', 'usually', 'that', 'faster', 'decrease', 'estimation', 'error', 'increase', 'number', 'sample', 'better', 'learn', 'mac

## Create index

### Define index schema and create directory

In [10]:
import os.path
from whoosh.fields import *
from whoosh.index import create_in

from pathlib import Path
pd = Path().resolve().parent.parent
index_dir = str(pd) + "/data/generated/index"

schema = Schema(id=ID(unique=True, stored=True), year=NUMERIC, title=TEXT, paper_text=TEXT, authors=TEXT, topic=TEXT)

if not os.path.exists(index_dir):
    os.mkdir(index_dir)
ix = create_in(index_dir, schema) 

### Write cleaned documents in the index

In [None]:
writer = ix.writer()

for i, key in enumerate(d):
    content = " ".join(papersArrayCleanNoLetter[i])
    writer.add_document(id=str(key), year=d[key][0], title=d[key][1], paper_text=content, authors=d[key][3], topic=d[key][4])
    
writer.commit() 