# Create Postgres Database and Import Mark Davies Spanish Web/Dialects Corpus (Sample Files) into Postgres SQL Database

In [1]:
import os
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.orm import relationship
import pandas as pd
import numpy as np
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, ForeignKey, String, Integer, Float, Boolean, UniqueConstraint

In [2]:
# Connect to the database named "corpus".
# The URL can be overridden through the CORPUS_DB_URL environment variable so
# that real credentials never have to be written into the notebook; when the
# variable is unset, the original placeholder URL is used unchanged.
# The dialect must be spelled "postgresql": the legacy "postgres" alias was
# removed in SQLAlchemy 1.4, while "postgresql" works on every version.
dbURL = os.environ.get(
    'CORPUS_DB_URL',
    'postgresql+psycopg2://maxcarey:<password>@localhost:5432/corpus',
)

engine = create_engine(dbURL)

# Here is a tutorial that I loosely followed to get the logic for the next
# few lines: https://www.compose.com/articles/using-postgresql-through-sqlalchemy
# If the database named at the end of the database URL does not exist yet,
# it has to be created before anything can connect to it.
# See: https://stackoverflow.com/a/30971098/5420796
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

# NOTE: despite its name, `session` is a plain Connection obtained from
# engine.connect(), not an ORM Session. The name is kept because later cells
# pass it to DataFrame.to_sql(con=session).
session = engine.connect()

# Declarative base class that all table models below inherit from.
Base = declarative_base()

True


In [3]:
class Corpus(Base):
    """ORM model for the 'corpus' table.

    Each row ties one corpus entry to the source page it came from
    (via source_html_id) and to its lexicon entry (via word_id).
    """
    __tablename__ = 'corpus'

    # The ordered words in the corpus (primary key of this table)
    corpus_id = Column(String(), unique=True, primary_key=True)

    # The id of the html page; references both the 'source' table and the
    # 'source_info' table, which share the same key
    source_html_id = Column(String(), ForeignKey('source.source_html_id'), ForeignKey('source_info.source_html_id'))

    # The id of the word in the lexicon table
    word_id = Column(String(), ForeignKey('lexicon.id'))

    # Many-to-one relationships: many corpus rows point at one source page
    # and at one lexicon entry.
    # See https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#many-to-one
    # The target must be the name of a mapped class; the documentation's
    # example name "Child" does not exist in this notebook and would make
    # mapper configuration fail, so the Source model is referenced instead.
    source = relationship("Source")
    lexicon = relationship("Lexicon")

In [4]:
class Lexicon(Base):
    """ORM model for the 'lexicon' table (one row per word entry)."""
    __tablename__ = 'lexicon'

    # Primary key of the word entry; corpus.word_id references this column
    id = Column(String, primary_key=True)

    # The headword
    word = Column(String)

    # The lemma of the word
    lemma = Column(String)

    # The part of speech of the actual word
    pos = Column(String)

In [5]:
class Source(Base):
    """ORM model for the 'source' table."""
    __tablename__ = 'source'

    # Primary key: the id of the html page; corpus.source_html_id references it
    source_html_id = Column(String, primary_key=True)

    # String payload stored for the page -- presumably the text/HTML of the
    # page itself; TODO confirm against raw_data/source.txt
    text = Column(String)

In [6]:
class SourceInfo(Base):
    """ORM model for the 'source_info' table (metadata about each source page)."""
    __tablename__ = 'source_info'

    # Primary key: the id of the text; corpus.source_html_id references it
    source_html_id = Column(String, primary_key=True)

    # Number of words
    num_words = Column(Integer)

    # Genre
    genre = Column(String)

    # Country
    country = Column(String)

    # Website
    website = Column(String)

    # URL of the page
    url = Column(String)

    # Title of the page
    title = Column(String)

In [7]:
# Create every table declared on Base in the target database,
# then report whether the database is reachable/exists.
Base.metadata.create_all(bind=engine)
print(database_exists(engine.url))

True


In [8]:
# Read the four tab-separated sample files into DataFrames.
corpus = pd.read_csv('raw_data/corpus.txt', sep='\t')
source = pd.read_csv('raw_data/source.txt', sep='\t')
lexicon = pd.read_csv('raw_data/lexicon.txt', sep='\t')
source_info = pd.read_csv('raw_data/source_info.txt', sep='\t')

In [9]:
# Preview the first 100 rows of the source metadata.
source_info.head(100)

Unnamed: 0,textID,#words,genre,country,website,url,title
0,124,268.0,b,AR,03442.com.ar,http://03442.com.ar/2013/07/gran-convocatoria-...,Gran convocatoria para el concurso docente que...
1,1124,679.0,b,AR,adnrionegro.com.ar,http://adnrionegro.com.ar/2012/06/weretilneck-...,Weretilneck anunció que se reforzará la seguri...
2,2124,1100.0,b,AR,agmer.org.ar,http://agmer.org.ar/index/4494-peccin-no-hay-u...,Peccín: “No hay un solo paso atrás” - Agmer
3,4124,3445.0,b,AR,bibliotecaignoria.blogspot.com,http://bibliotecaignoria.blogspot.com/2012/06/...,Mario Vargas Llosa: La desaparición del erotis...
4,5124,403.0,b,AR,argentina-actual.com.ar,http://argentina-actual.com.ar/noticias/el-cof...,El CoFeCA solicita a la Corte que se expida so...
...,...,...,...,...,...,...,...
95,105124,412.0,b,AR,unmetroadelantado.com,http://www.unmetroadelantado.com/2013/04/emo-c...,UN METRO ADELANTADO: EMO-CIONANTE
96,106124,347.0,b,AR,vineadesencantar.com.ar,http://www.vineadesencantar.com.ar/2010/11/si-...,Si te he visto no me acuerdo - vine a desencan...
97,107124,1147.0,b,AR,wim-network.org,http://www.wim-network.org/2013/08/paises-de-l...,Países de la región adoptan el Consenso de Mon...
98,108124,566.0,b,AR,zankyou.terra.com.ar,http://zankyou.terra.com.ar/p/top-5-de-bodas-t...,Top 5 de bodas temáticas - Zankyou


In [10]:
# Rename the raw column headers so they match the column names declared
# on the table models (and are a little easier to understand).
corpus = corpus.rename(
    columns={
        "textID": "source_html_id",
        "ID": "corpus_id",
        "wID": "word_id",
    }
)
source = source.rename(
    columns={"textID": "source_html_id"}
)
lexicon = lexicon.rename(
    columns={
        "wID": "id",
        "PoS": "pos",
    }
)
source_info = source_info.rename(
    columns={
        "textID": "source_html_id",
        "#words": "num_words",
    }
)

In [11]:
# Append the source rows to the existing 'source' table (loaded before
# 'corpus' because corpus rows reference it).
source.to_sql('source', session, index=False, if_exists='append')

In [12]:
# Append the lexicon rows to the existing 'lexicon' table (loaded before
# 'corpus' because corpus rows reference it).
lexicon.to_sql('lexicon', session, index=False, if_exists='append')

In [13]:
# Append the source metadata rows to the existing 'source_info' table
# (loaded before 'corpus' because corpus rows reference it).
source_info.to_sql('source_info', session, index=False, if_exists='append')

In [14]:
# Append the corpus rows last: their foreign keys point at the rows that
# the previous cells inserted into 'source', 'source_info' and 'lexicon'.
corpus.to_sql('corpus', session, index=False, if_exists='append')