In [153]:
import scholarly
import pandas as pd
from dataclasses import dataclass
import os
import glob
import re
from utils import Lang

In [100]:
dataFiles = glob.glob("data/*.csv")

In [101]:
dataFiles

['data/ML-DL-RL-CV.csv',
 'data/NAACL 2019.csv',
 'data/Dialog.csv',
 'data/ACL18.csv',
 "data/EMNLP '18.csv",
 'data/Blogs.csv',
 'data/KG+Reasoning.csv',
 'data/NeurIPS 18 _ ICLR 19.csv',
 'data/OIE_SRL.csv',
 'data/AAAI - 2019.csv',
 'data/NLP + Reasoning.csv']

In [103]:
tags = list(map(lambda x: x.replace("data/", "").replace(".csv", ""), glob.glob("data/*.csv")))

In [104]:
tags

['ML-DL-RL-CV',
 'NAACL 2019',
 'Dialog',
 'ACL18',
 "EMNLP '18",
 'Blogs',
 'KG+Reasoning',
 'NeurIPS 18 _ ICLR 19',
 'OIE_SRL',
 'AAAI - 2019',
 'NLP + Reasoning']

In [109]:
df_all = pd.DataFrame()

In [113]:
for fname, tag in zip(dataFiles, tags):
    df = pd.read_csv(dataFiles[0])
    df['tags'] = tag
    df_all = df_all.append(df)

In [119]:
df_all = df_all.loc[True^df_all['Assignment'].isnull()]

In [120]:
df_all = df_all.reset_index(drop=True)

In [122]:
df_all = df_all.drop(['Unnamed: 0', 'Due on', 'Start date'], axis=1)

In [125]:
df_all.sample(6)

Unnamed: 0,Subject,Assignment,Status,Time,Priority,tags
894,,Deep Variational Reinforcement Learning for PO...,,,,AAAI - 2019
631,,Understanding Straight-Through Estimator in Tr...,,,,KG+Reasoning
273,CVPR 2018,Multi-Task Learning Using Uncertainty to Weigh...,Done,,Important,Dialog
116,,Stackelberg GAN: Towards Provable Minimax Equi...,,,,NAACL 2019
670,,Evaluating Feature Importance Estimates - Inte...,,,Important,KG+Reasoning
782,,Reconciling modern machine-learning practice a...,,,High,NeurIPS 18 _ ICLR 19


### What to do?
- Get the abstract
- Get ID
- Check in db >> If not exists
- Add to db (ID, Abstract, title)
- Update Index / Phrase Index / Meaning Index 

#### Fields
- All in scholarly
- Time added
- Time published
- Citations
- Url

### Paper Class

In [81]:
@dataclass
class Paper:
    title: str
    abstract: str
    conference: str
        
    def __repr__(self):
        return f"[Paper] {self.title}"

In [154]:
lang = Lang('en')

### Sqlite3

In [1]:
import sqlite3

#### Connect

In [242]:
conn = sqlite3.connect(":memory:")

c = conn.cursor()

c.execute("""CREATE TABLE papers (
        id INTEGER PRIMARY KEY,
        title TEXT 
)""")

c.execute("""CREATE TABLE abstracts (
        paper_id INTEGER,
        abstract TEXT,
        FOREIGN KEY(paper_id) REFERENCES papers(id)
)""")

c.execute("""CREATE TABLE tags (
        id INTEGER PRIMARY KEY,
        tag TEXT
)""")

c.execute("""CREATE TABLE tag2paper (
        tag_id INTEGER,
        paper_id INTEGER,
        score REAL,
        UNIQUE (tag_id, paper_id) ON CONFLICT ABORT,
        FOREIGN KEY(tag_id) REFERENCES tags(id),
        FOREIGN KEY(paper_id) REFERENCES paper(id)        
)""")

conn.commit()


#### Add papers
- Add paper
- store abstract in abstracts table
- Online-update index table

In [243]:
# result = scholarly.search_pubs_query('Memory Architectures in recurrent neural network Langauge Models')

In [247]:
# for _ in range(3):
#     current = next(result)
#     p1 = Paper(current.bib['title'], current.bib['abstract'])
#     print(f"Adding: {p1}")
#     c.execute(f"INSERT INTO papers (title, abstract) VALUES(:title, :abstract)", vars(p1))

for id, row in df_all.iterrows():
    p1 = Paper(row['Assignment'].strip(), "", "")
    c.execute("INSERT INTO papers (title) VALUES(:title)", vars(p1))
    paperId = c.lastrowid
    ws = lang.normalizeSentence(p1.title)
    wtags = [(w,) for w in ws]
    
    c.executemany("INSERT INTO tags (tag) VALUES (?)", wtags)
    tagSelectStr = "'" + "\',\'".join(ws) + "'"
    c.execute(f"SELECT id FROM tags WHERE tag in ({tagSelectStr})")
    tagIds = c.fetchall()
    
    c.executemany("INSERT INTO tag2paper (tag_id, paper_id, score) VALUES (?, ?, ?)", list(map(lambda w: (w[0],paperId,1), tagIds)))
    
    # TODO: Stopwords must not be used for tags.
#     break
        

# conn.commit()

#### Query

In [253]:

c.execute(f"SELECT * from tags")
t = c.fetchmany(3)
print(t)


[(1, 'deep'), (2, 'generative'), (3, 'modeling')]


In [252]:

c.execute(f"SELECT * from tag2paper")
t = c.fetchmany(3)
print(t)


[(1, 1, 1.0), (2, 1, 1.0), (3, 1, 1.0)]


In [256]:

c.execute(f"SELECT * from papers")
t = c.fetchmany(3)
print(t)


[(1, 'Deep Generative Modeling with Applications in Semi-Supervised Learning (Zhilin Yang PHD Thesis)'), (2, 'Deep Generative Modeling with Applications in Semi-Supervised Learning (Zhilin Yang PHD Thesis)'), (3, "Best of Both Worlds: Transferring Knowledge from Discriminative Learning to a Generative Visual Dialog Model (NIPS '17)")]


In [80]:
conn.close()