# Create a PostgreSQL Database and Import the Mark Davies Spanish Web/Dialects Corpus (Sample Files) into It

In [None]:
import os
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.orm import relationship
import pandas as pd
import numpy as np
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, ForeignKey, String, Integer, Float, Boolean, UniqueConstraint

In [None]:
# Connect to the database named "corpus".
# NOTE: the dialect is spelled "postgresql". The legacy "postgres" alias is
# deprecated in SQLAlchemy 1.3 and no longer loads from SQLAlchemy 1.4 on.
dbURL = 'postgresql+psycopg2://maxcarey:<password>@localhost:5432/corpus'

engine = create_engine(dbURL)

# Here is a tutorial that I loosely followed to get the logic for the next
# few lines: https://www.compose.com/articles/using-postgresql-through-sqlalchemy
# If the database named at the end of the database URL does not exist yet,
# it has to be created before anything can be written to it.
# You can see this here: https://stackoverflow.com/a/30971098/5420796
if not database_exists(engine.url):
    create_database(engine.url)
# Sanity check: print whether the database exists after the step above.
print(database_exists(engine.url))

# NOTE: despite its name, `session` is the Connection returned by
# engine.connect(), not an ORM Session. It is the `con` handed to
# DataFrame.to_sql further down in this file.
session = engine.connect()
# Declarative base class that all of the table models below inherit from.
Base = declarative_base()

In [None]:
class Corpus(Base):
    """ORM model for the ``corpus`` table.

    Each row ties one corpus entry (``corpus_id``) to the page it came from
    (``source_html_id``) and to its entry in the lexicon (``word_id``).
    """

    __tablename__ = 'corpus'

    # The ordered words in the corpus.
    # unique=True is redundant next to primary_key=True; it is kept so the
    # emitted DDL stays exactly as before.
    corpus_id = Column(String(), unique=True, primary_key=True)

    # The id of the html page; references both the source table and the
    # source_info table, which share the same key.
    source_html_id = Column(String(), ForeignKey('source.source_html_id'), ForeignKey('source_info.source_html_id'))

    # The id of the word in the lexicon table
    word_id = Column(String(), ForeignKey('lexicon.id'))

    # What we have here are many-to-one relationships in which this table
    # holds the foreign keys (the "parent" side in the SQLAlchemy docs).
    # See https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#many-to-one
    # The target must be a mapped class name: the docs' placeholder "Child"
    # is not defined anywhere in this file, so the relationship points at
    # the Source model instead.
    source = relationship("Source")
    lexicon = relationship("Lexicon")

In [None]:
class Lexicon(Base):
    """ORM model for the ``lexicon`` table: one row per lexicon entry."""

    __tablename__ = "lexicon"

    # Primary key of the lexicon entry (referenced by corpus.word_id).
    id = Column(String, primary_key=True)

    # The headword
    word = Column(String)

    # The lemma of the word
    lemma = Column(String)

    # The part of speech of the actual word
    pos = Column(String)

In [None]:
class Source(Base):
    """ORM model for the ``source`` table: one row per source page."""

    __tablename__ = "source"

    # The id of the html page (referenced by corpus.source_html_id).
    source_html_id = Column(String, primary_key=True)

    # The text belonging to that page.
    # NOTE(review): presumably the page's raw text/HTML - confirm against
    # raw_data/source.txt.
    text = Column(String)

In [None]:
class SourceInfo(Base):
    """ORM model for the ``source_info`` table: metadata about each source text."""

    __tablename__ = "source_info"

    # The id of the actual text (referenced by corpus.source_html_id).
    source_html_id = Column(String, primary_key=True)

    # Number of words (loaded from the "#words" column of the raw file).
    num_words = Column(Integer)

    # Genre of the text
    genre = Column(String)

    # Country of the text
    country = Column(String)

    # Website the text was taken from
    website = Column(String)

    # URL of the page
    url = Column(String)

    # Title of the page
    title = Column(String)

In [None]:
# Create the tables for every model declared on Base, using the engine above.
Base.metadata.create_all(bind=engine)
# Print whether the target database exists.
print(database_exists(engine.url))

In [None]:
# Load the four tab-separated sample files into DataFrames, in this order:
# corpus, source, lexicon, source_info.
corpus, source, lexicon, source_info = (
    pd.read_csv(raw_path, sep='\t')
    for raw_path in (
        'raw_data/corpus.txt',
        'raw_data/source.txt',
        'raw_data/lexicon.txt',
        'raw_data/source_info.txt',
    )
)

In [None]:
# Preview the first 100 rows of the source_info frame.
source_info.head(100)

In [None]:
# Rename the raw column headers so they are easier to understand and line up
# with the column names declared on the ORM models.
corpus = corpus.rename(
    {"textID": "source_html_id", "ID": "corpus_id", "wID": "word_id"},
    axis="columns",
)
source = source.rename({"textID": "source_html_id"}, axis="columns")
lexicon = lexicon.rename({"wID": "id", "PoS": "pos"}, axis="columns")
source_info = source_info.rename(
    {"textID": "source_html_id", "#words": "num_words"},
    axis="columns",
)

In [None]:
# Append the source rows to the existing `source` table (no index column).
# This runs before the corpus load because corpus.source_html_id references it.
source.to_sql('source', session, index=False, if_exists='append')

In [None]:
# Append the lexicon rows to the existing `lexicon` table (no index column).
# This runs before the corpus load because corpus.word_id references lexicon.id.
lexicon.to_sql('lexicon', session, index=False, if_exists='append')

In [None]:
# Append the source metadata rows to the existing `source_info` table
# (no index column). This runs before the corpus load because
# corpus.source_html_id references it.
source_info.to_sql('source_info', session, index=False, if_exists='append')

In [None]:
# Append the corpus rows last: their foreign keys point at the source,
# source_info and lexicon tables, which are loaded earlier in this file.
corpus.to_sql('corpus', session, index=False, if_exists='append')