# Setting up connection

In [12]:
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer
from sqlalchemy.ext.declarative import declarative_base
from scrapy.selector import Selector
from sqlalchemy.sql import text as sa_text
import re

# Engine for the local MySQL database `oerintegrationdb` (PyMySQL driver,
# root user with an empty password, SQL echo disabled).
engine = create_engine(
    'mysql+pymysql://root:@localhost/oerintegrationdb',
    echo=False,
)
# The declarative base is bound to the engine so that the autoloaded
# tables declared below can be reflected through it.
Base = declarative_base(bind=engine)


class Triple(Base):
    """ORM mapping of the ``triple`` table; columns are autoloaded from the database."""

    __table_args__ = dict(autoload=True)
    __tablename__ = 'triple'


class CleanTriple(Base):
    """ORM mapping of the ``cleantriple`` table; columns are autoloaded from the database."""

    __table_args__ = dict(autoload=True)
    __tablename__ = 'cleantriple'

# Session factory bound to the engine, and the session object used by the
# rest of the notebook.
Session = sessionmaker(bind=engine)
session = Session()

# Table metadata collected by the declarative base.
metadata = Base.metadata

print('Ready DB')

Ready DB


In [13]:
# Query for the raw rows whose subject is the open bccampus source.
# NOTE(review): the subject literal carries a trailing space -- confirm
# that this matches the stored data.
triples = session.query(Triple).filter(Triple.subject == "open bccampus ")

# Empty the output table before it is repopulated.
truncate_stmt = sa_text("TRUNCATE TABLE cleantriple").execution_options(autocommit=True)
engine.execute(truncate_stmt)
print('Cleaned')

Cleaned


In [14]:
# Convert every raw page returned by `triples` into subject/predicate/object
# rows of `cleantriple`: one book per page, described by the fields scraped
# from the page HTML stored in `triple.object`.
source = "open bccampus"
# Site-relative hrefs scraped from a page are resolved against this root.
site_root = "https://open.bccampus.ca"

session.add(CleanTriple(subject=source, predicate="hasName", object=source))
for triple in triples:
    body = triple.object
    # Parse the HTML once and reuse the selector for every lookup below.
    page = Selector(text=body)

    # The book title is the subject of every other triple for this page.
    name = page.xpath("//h2[@itemprop='name']/text()").get()
    if name is None:
        # No title means no subject to attach anything to: skip the page
        # rather than emit triples with an empty subject.
        continue
    session.add(CleanTriple(subject=source, predicate="hasBook", object=name))
    session.add(CleanTriple(subject=name, predicate="hasName", object=name))

    about = page.xpath("//span[@itemprop='description']/text()").get()
    if about is not None and len(about) > 1:
        session.add(CleanTriple(subject=name, predicate="hasAbout", object=about))

    # The first two "text-muted" paragraph texts are used as the posted and
    # updated values. The guard checks `meta` itself so that a page with a
    # missing description or fewer than two entries no longer raises.
    meta = page.xpath("//p[@class='text-muted']/text()").getall()
    if len(meta) >= 2:
        posted, updated = meta[0], meta[1]
        session.add(CleanTriple(subject=name, predicate="postedIn", object=posted))
        session.add(CleanTriple(subject=name, predicate="updatedIn", object=updated))

    # The author text is a comma-separated list. An even number of parts is
    # read as (author, institute) pairs; otherwise only the first part is
    # kept as the author.
    authors = page.xpath("//span[@itemprop='author copyrightHolder']/text()").get()
    if authors is not None:
        authors = authors.split(",")
        if len(authors) % 2 == 0:
            for author, institute in zip(authors[0::2], authors[1::2]):
                session.add(CleanTriple(subject=name, predicate="hasAuthor", object=author))
                session.add(CleanTriple(subject=author, predicate="hasAffiliation", object=institute))
        else:
            session.add(CleanTriple(subject=name, predicate="hasAuthor", object=authors[0]))

    # The cover image is validated on its own value, not on the description.
    image = page.xpath("//img[@alt='textbook cover image']/@src").get()
    if image is not None and len(image) > 1:
        session.add(CleanTriple(subject=name, predicate="hasImage", object=image))

    # getall() always returns a list, so an empty result simply adds nothing.
    subject_areas = page.xpath("//div[@class='col-sm-8']/p[2]/a/text()").getall()
    for area in subject_areas:
        session.add(CleanTriple(subject=name, predicate="hasSubjectArea", object=area))

    original_source = page.xpath("//div[@class='col-sm-8']/p[3]/a/text()").get()
    original_source_link = page.xpath("//div[@class='col-sm-8']/p[3]/a/@href").get()
    if original_source is not None and original_source_link is not None:
        session.add(CleanTriple(subject=name, predicate="originalSource", object=original_source))
        session.add(CleanTriple(subject=original_source, predicate="hasLink", object=original_source_link))

    # Adoption, adaptation and help links are prefixed with the site root.
    for predicate, xpath in (
        ("hasAdoption", "//div[@class='col-sm-8']/p[4]/a/@href"),
        ("hasAdaptation", "//div[@class='col-sm-8']/p[5]/a/@href"),
        ("hasHelpLink", "//div[@class='col-sm-8']/p[6]/a/@href"),
    ):
        href = page.xpath(xpath).get()
        if href is not None:
            session.add(CleanTriple(subject=name, predicate=predicate, object=site_root + href))

    # The accessibility and license hrefs are stored exactly as scraped.
    accessibility_link = page.xpath("//div[@class='col-sm-8']/p[7]/a/@href").get()
    if accessibility_link is not None:
        session.add(CleanTriple(subject=name, predicate="hasAccessibility", object=accessibility_link))

    license_link = page.xpath("//p[@class='text-muted']/a[2]/@href").get()
    if license_link is not None:
        session.add(CleanTriple(subject=name, predicate="hasLicense", object=license_link))

    similar_books = page.xpath("//section[@class='bkgd-grey-light d-flex flex-row flex-wrap full-width py-3 mt-3']/article/p/a/text()").getall()
    for book in similar_books:
        session.add(CleanTriple(subject=name, predicate="hasSimilarBook", object=book))

    pdf_link = page.xpath("//a[@title='Digital PDF']/@href").get()
    if pdf_link is not None:
        session.add(CleanTriple(subject=name, predicate="pdfAvailableAt", object=pdf_link))

    site_link = page.xpath("//a[@title='Read this book online']/@href").get()
    if site_link is not None:
        session.add(CleanTriple(subject=name, predicate="siteAvailableAt", object=site_link))

# Commit first so that "Done" is only reported once the rows are persisted.
session.commit()
print("Done")


Done
