# SQLite databases

This script migrates scraped data from CSV files into a single SQLite database (`reserca.db`).

# Load modules

In [None]:
import pandas as pd
import sqlite3

# Create tables

## Papers

In [None]:
# Schema of the `papers` table. `id` is the integer primary key and
# `url_stem` is the only other column carrying a constraint (UNIQUE).
PAPERS_DDL = """CREATE TABLE IF NOT EXISTS papers (
          id INTEGER PRIMARY KEY,
          url TEXT,
          url_stem TEXT UNIQUE,
          date TEXT,
          publisher TEXT,
          title TEXT,
          type TEXT,
          author TEXT,
          sourceid TEXT,
          sourceref TEXT,
          orcids TEXT,
          citation TEXT,
          issn TEXT,
          published_in TEXT,
          doi TEXT,
          isbn TEXT,
          uri TEXT,
          status_code INTEGER,
          status_description TEXT
         )"""

conn = sqlite3.connect("reserca.db")
c = conn.cursor()

# Uncomment to discard the existing table before recreating it:
# c.execute("DROP TABLE papers")
c.execute(PAPERS_DDL)

# Persist the schema change and release the database file.
conn.commit()
conn.close()

## Authors

In [None]:
conn = sqlite3.connect("reserca.db")
c = conn.cursor()

# Uncomment to discard the existing table before recreating it:
# c.execute("DROP TABLE authors")

# IF NOT EXISTS makes this cell safe to re-run: a plain CREATE TABLE raises
# sqlite3.OperationalError once the table is already present in reserca.db.
# `id` is a TEXT primary key here (the other tables use INTEGER ids).
c.execute("""CREATE TABLE IF NOT EXISTS authors (
            id TEXT PRIMARY KEY,
            label TEXT,
            department TEXT,
            institution TEXT,
            institution_2 TEXT,
            projects TEXT,
            groups TEXT,
            url TEXT,
            status_description TEXT,
            institution_group TEXT
        )""")

# Persist the schema change and release the database file.
conn.commit()
conn.close()

## Groups

In [None]:
conn = sqlite3.connect("reserca.db")
c = conn.cursor()

# Schema of the `groups` table. The column names must match the DataFrame
# columns appended later with `to_sql(..., if_exists='append')`; in
# particular the researcher id list is stored in `researcher_ids`.
# NOTE: because of IF NOT EXISTS, a database that already holds a `groups`
# table keeps its existing columns; drop the table first to pick up a
# changed schema.
c.execute("""CREATE TABLE IF NOT EXISTS groups (
          id INTEGER PRIMARY KEY,
          name TEXT,
          institution TEXT,
          group_url TEXT,
          sgr_code TEXT,
          principal_names TEXT,
          principal_ids TEXT,
          researcher_names TEXT,
          researcher_ids TEXT,
          url TEXT,
          url_stem TEXT
         )""")

# Persist the schema change and release the database file.
conn.commit()
conn.close()

## Projects

In [None]:
# Schema of the `projects` table; `id` is the integer primary key and every
# other column is stored as TEXT.
PROJECTS_DDL = """CREATE TABLE IF NOT EXISTS projects (
          id INTEGER PRIMARY KEY,
          title TEXT,
          official_code TEXT,
          url TEXT,
          start_date TEXT,
          end_date TEXT,
          institution TEXT,
          principal_names TEXT,
          principal_ids TEXT,
          researcher_names TEXT,
          researcher_ids TEXT,
          url_stem TEXT
         )"""

conn = sqlite3.connect("reserca.db")
c = conn.cursor()

# Uncomment to discard the existing table before recreating it:
# c.execute("DROP TABLE projects")
c.execute(PROJECTS_DDL)

# Persist the schema change and release the database file.
conn.commit()
conn.close()

# Papers

## Process csv

In [None]:
# CSV headers that differ from the column names of the `papers` table
renamed_cols = {
    'url_id': 'url_stem',
    'status code': 'status_code',
    'status description': 'status_description',
}

# Load, rename, then keep one row per url_stem (pandas keeps the first
# occurrence by default) in url_stem order. The final reset_index() turns
# the fresh 0..n-1 index into a column, which is renamed to `id`.
df = (
    pd.read_csv('../data/20220314/paper_clean_20220314.csv')
    .rename(columns=renamed_cols)
    .sort_values(by='url_stem')
    .drop_duplicates(subset='url_stem')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

## Upload to sqlite

In [None]:
# Append the prepared rows to the existing `papers` table. The DataFrame's
# own index is not written (index=False); the `id` column supplies the key.
conn = sqlite3.connect("reserca.db")
df.to_sql('papers', conn, if_exists='append', index=False)
conn.close()

## Read data

In [None]:
# Connection
conn = sqlite3.connect("reserca.db")
conn.row_factory = sqlite3.Row  # rows can be converted to dicts keyed by column name

try:
    # Query: exactly one statement is active at a time; swap the comment
    # marker to inspect something else.
    c = conn.cursor()
    # sql = "SELECT * FROM sqlite_master WHERE type='table'"
    # sql = "pragma table_info('papers')"
    # sql = "SELECT COUNT(*) FROM papers"
    sql = "SELECT * FROM papers"
    c.execute(sql)

    # Print the first row only. fetchone() returns None when the query
    # yields no rows, and dict(None) would raise TypeError.
    result = c.fetchone()
    print(dict(result) if result is not None else None)
finally:
    # Always release the database file, even if the query fails.
    conn.close()

# Authors

## Process csv

In [None]:
# The only CSV header that differs from the `authors` table columns
renamed_cols = {'status description': 'status_description'}

# Load the cleaned author data and apply the rename in one step
df = pd.read_csv('../data/20220314/author_clean_20220314.csv').rename(columns=renamed_cols)

## Upload to sqlite

In [None]:
# Preview the first row of the frame (shown as the cell output)
df.head(n=1)

In [None]:
# Append the prepared rows to the existing `authors` table; the DataFrame's
# own index is not written (index=False).
conn = sqlite3.connect("reserca.db")
df.to_sql('authors', conn, if_exists='append', index=False)
conn.close()

## Read data

In [None]:
# Open the database with rows that can be converted to dicts
conn = sqlite3.connect("reserca.db")
conn.row_factory = sqlite3.Row

# Statement to run; the commented lines are alternatives kept for inspection
# sql = "SELECT * FROM sqlite_master WHERE type='table'"
# sql = "SELECT name FROM sqlite_master WHERE type='table'"
# sql = "pragma table_info('papers')"
sql = "SELECT COUNT(*) FROM authors"
# sql = "SELECT * FROM authors"

# Connection.execute() creates a cursor, runs the statement on it and
# returns that cursor
c = conn.execute(sql)

# Show the first (here: only) result row
result = c.fetchone()
print(dict(result))

conn.commit()
conn.close()

# Groups

## Process csv

In [None]:
# CSV headers that differ from the column names used for the `groups` table
rename_cols = {
    'principal names': 'principal_names',
    'principal ids': 'principal_ids',
    'researcher names': 'researcher_names',
    'researcher ids': 'researcher_ids',
    'url_id': 'url_stem',
}

# Load, rename, then keep one row per url_stem (pandas keeps the first
# occurrence by default) in url_stem order. The final reset_index() turns
# the fresh 0..n-1 index into a column, which is renamed to `id`.
df = (
    pd.read_csv('../data/20220314/group_data_20220314.csv')
    .rename(columns=rename_cols)
    .sort_values(by='url_stem')
    .drop_duplicates(subset='url_stem')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

## Upload to sqlite

In [None]:
# Append the prepared rows to the existing `groups` table; the DataFrame's
# own index is not written (index=False).
conn = sqlite3.connect("reserca.db")
df.to_sql('groups', conn, if_exists='append', index=False)
conn.close()

## Read data

In [None]:
# Connection
conn = sqlite3.connect("reserca.db")
conn.row_factory = sqlite3.Row  # rows can be converted to dicts keyed by column name

try:
    # Query: exactly one statement is active at a time; swap the comment
    # marker to inspect something else.
    c = conn.cursor()
    # sql = "SELECT * FROM sqlite_master WHERE type='table'"
    # sql = "SELECT name FROM sqlite_master WHERE type='table'"
    # sql = "pragma table_info('groups')"
    # sql = "SELECT COUNT(*) FROM groups"
    sql = "SELECT * FROM groups"
    c.execute(sql)

    # Print the first row only. fetchone() returns None when the query
    # yields no rows, and dict(None) would raise TypeError.
    result = c.fetchone()
    print(dict(result) if result is not None else None)
finally:
    # Read-only cell: nothing to commit, just release the database file
    # (also when the query fails).
    conn.close()

# Projects

## Process csv

In [None]:
# CSV headers that differ from the column names of the `projects` table
rename_cols = {
    'official code': 'official_code',
    'start date': 'start_date',
    'end date': 'end_date',
    'principal names': 'principal_names',
    'principal ids': 'principal_ids',
    'researcher names': 'researcher_names',
    'researcher ids': 'researcher_ids',
    'url_id': 'url_stem',
}

# Load, rename, then keep one row per url_stem (pandas keeps the first
# occurrence by default) in url_stem order. The final reset_index() turns
# the fresh 0..n-1 index into a column, which is renamed to `id`.
df = (
    pd.read_csv('../data/20220314/project_data_20220314.csv')
    .rename(columns=rename_cols)
    .sort_values(by='url_stem')
    .drop_duplicates(subset='url_stem')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

## Upload to sqlite

In [None]:
# Append the prepared rows to the existing `projects` table; the DataFrame's
# own index is not written (index=False).
conn = sqlite3.connect("reserca.db")
df.to_sql('projects', conn, if_exists='append', index=False)
conn.close()

## Read data

In [None]:
# Connection
conn = sqlite3.connect("reserca.db")
conn.row_factory = sqlite3.Row  # rows can be converted to dicts keyed by column name

try:
    # Query: exactly one statement is active at a time; swap the comment
    # marker to inspect something else.
    c = conn.cursor()
    # sql = "SELECT * FROM sqlite_master WHERE type='table'"
    # sql = "SELECT name FROM sqlite_master WHERE type='table'"
    # sql = "pragma table_info('projects')"
    # sql = "SELECT COUNT(*) FROM projects"
    sql = "SELECT * FROM projects"
    c.execute(sql)

    # Print the first row only. fetchone() returns None when the query
    # yields no rows, and dict(None) would raise TypeError.
    result = c.fetchone()
    print(dict(result) if result is not None else None)
finally:
    # Read-only cell: nothing to commit, just release the database file
    # (also when the query fails).
    conn.close()

# Extra code