# ETL for CourseData.db
A step-by-step walkthrough

## Preliminaries: Extensions, Imports, and Database Connections

In [1]:
%load_ext sql
import pandas as pd
import sqlite3

In [2]:
# Point the %sql / %%sql magics at the SQLite database file CourseData.db.
%sql sqlite:///CourseData.db
# Open a separate plain sqlite3 connection to the same file; the CSV-loading
# cells below pass it to pandas DataFrame.to_sql.
conn = sqlite3.connect('CourseData.db')

In [3]:
%%sql
-- Remove staging tables left by an earlier run so the CSV import starts empty
DROP TABLE IF EXISTS IMPORT_CATALOG_COURSES;
DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;

 * sqlite:///CourseData.db
Done.
Done.
Done.


[]

## 1. Create Tables from ERD

In [4]:
%%sql

-- Program table
-- Drop any copy left by an earlier run so this cell can be re-executed
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    ProgramID INTEGER PRIMARY KEY,
    ProgramCode TEXT NOT NULL,
    ProgramName TEXT NOT NULL
);

-- Course Catalog table
-- Drop any copy left by an earlier run (this also removes its index)
-- so this cell can be re-executed
DROP TABLE IF EXISTS CATALOG_COURSES;
CREATE TABLE CATALOG_COURSES (
    CourseID INTEGER PRIMARY KEY,
    CatalogYear TEXT NOT NULL,
    CatalogID TEXT NOT NULL,
    ProgramID INTEGER,
    CourseTitle TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Prereqs TEXT,
    Coreqs TEXT,
    Fees TEXT,
    Attributes TEXT,
    Description TEXT
);
-- Lookup index on the natural key of catalog year plus catalog id
CREATE INDEX ix_catalog_courses_alt_key on CATALOG_COURSES(CatalogYear,CatalogID);

-- Instructors table
-- Drop any copy left by an earlier run (this also removes its index)
-- so this cell can be re-executed
DROP TABLE IF EXISTS INSTRUCTORS;
CREATE TABLE INSTRUCTORS (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);
-- Offerings are matched to instructors by name, so index the name column
CREATE INDEX ix_instructors_name on INSTRUCTORS(Name);

-- Course Offerings table
-- Drop any copy left by an earlier run (this also removes its index)
-- so this cell can be re-executed
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    CourseOfferingID INTEGER PRIMARY KEY,
    CourseID INTEGER,
    CatalogID TEXT NOT NULL,
    Term TEXT NOT NULL,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Credits REAL,
    Title TEXT NOT NULL,
    Timecodes TEXT,
    PrimaryInstructorID INTEGER,
    Capacity INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL
);
-- Lookup index on term plus catalog id plus section
CREATE INDEX ix_course_offerings_alt_key on COURSE_OFFERINGS(Term,CatalogID,Section);

-- Locations table
-- Drop any copy left by an earlier run so this cell can be re-executed
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);

-- Course Meetings table
-- Drop any copy left by an earlier run so this cell can be re-executed
DROP TABLE IF EXISTS COURSE_MEETINGS;
CREATE TABLE COURSE_MEETINGS (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    StartDateTime TEXT NOT NULL,
    EndDateTime TEXT NOT NULL
);

-- A conversion table for matching the term to the corresponding catalog year
-- Catalogs are not available for the first several years
-- Drop any copy left by an earlier run so this cell can be re-executed
-- without failing on CREATE or duplicating the seed rows
DROP TABLE IF EXISTS TERM_CATALOG_YEAR;
CREATE TABLE TERM_CATALOG_YEAR (
    CatalogYear TEXT NOT NULL,
    Term TEXT NOT NULL
);

INSERT INTO TERM_CATALOG_YEAR (Term, CatalogYear) VALUES 
('Fall2017','2017-2018'),('Winter2018','2017-2018'),('Spring2018','2017-2018'),('Summer2018','2017-2018'),('Fall2018','2018-2019'),
('Winter2019','2018-2019'),('Spring2019','2018-2019');

 * sqlite:///CourseData.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
7 rows affected.


[]

## 2. Extract data from CSV files
_The following code uses Python to automate the dirty work that you might do in `sqlite3`._

In [5]:
# Catalog Data
# Read each catalog CSV and append it to the IMPORT_CATALOG_COURSES staging
# table, tagging every row with the catalog year it came from.
catalog_years = ['2017_2018', '2018_2019']

for cat_year in catalog_years:
    # The file names spell the year with an underscore.
    filepath = 'SourceData/Catalogs/CourseCatalog'+cat_year+'.csv'
    data = pd.read_csv(filepath)
    # TERM_CATALOG_YEAR spells catalog years with a hyphen ('2017-2018');
    # store the same spelling so the two tables can be joined on CatalogYear.
    data['cat_year'] = cat_year.replace('_', '-')
    data.to_sql('IMPORT_CATALOG_COURSES',conn,if_exists='append',index=False)


In [6]:
# Course Offering and Course Meeting Data
# Every term has its own folder under SourceData/ holding two CSV files;
# each file is appended to its own staging table.
terms = [
    'Fall2014', 'Fall2015', 'Fall2016', 'Fall2017', 'Fall2018',
    'Spring2015', 'Spring2016', 'Spring2017', 'Spring2018', 'Spring2019',
    'SpringBreak2017',
    'Summer2015', 'Summer2016', 'Summer2017', 'Summer2018',
    'Winter2015', 'Winter2016', 'Winter2017', 'Winter2018',
]

# (CSV file name, staging table) pairs, processed in this order for each term.
term_files = (
    ('courses.csv', 'IMPORT_COURSE_OFFERINGS'),
    ('course_meetings.csv', 'IMPORT_COURSE_MEETINGS'),
)

for term in terms:
    for csv_name, table_name in term_files:
        frame = pd.read_csv('SourceData/' + term + '/' + csv_name)
        frame.to_sql(table_name, conn, if_exists='append', index=False)

## 3 & 4. Transform and Load Data Into ERD Tables

In [7]:
%%sql
-- Rebuild the instructor list from the staged course offerings
DELETE FROM INSTRUCTORS;

-- One row per distinct name, skipping the TBA placeholder
-- and any value that contains a slash
INSERT INTO INSTRUCTORS (Name)
SELECT DISTINCT offerings.primary_instructor
FROM import_course_offerings AS offerings
WHERE offerings.primary_instructor <> 'TBA'
  AND offerings.primary_instructor NOT LIKE '%/%';

 * sqlite:///CourseData.db
0 rows affected.
1095 rows affected.


[]

In [14]:
%%sql
-- Spot check of the instructor load, matching names that contain Nash
-- The pattern is a string literal, so it is written with single quotes
SELECT * FROM INSTRUCTORS WHERE Name LIKE '%Nash%';

 * sqlite:///CourseData.db
Done.


InstructorID,Name
268,Laura Nash


In [8]:
%%sql
-- Rebuild PROGRAMS with one row per distinct code and name pair
-- found in the staged catalog data, inserted in program code order
DELETE FROM PROGRAMS;

INSERT INTO PROGRAMS (ProgramCode, ProgramName)
SELECT DISTINCT
    catalog.program_code,
    catalog.program_name
FROM IMPORT_CATALOG_COURSES AS catalog
ORDER BY catalog.program_code;

 * sqlite:///CourseData.db
0 rows affected.
83 rows affected.


[]

In [9]:
%%sql
-- Rebuild CATALOG_COURSES from the staged catalog rows,
-- swapping each program code for its ProgramID
DELETE FROM CATALOG_COURSES;

INSERT INTO CATALOG_COURSES (
    CatalogYear, ProgramID, CatalogID, CourseTitle, Credits,
    Prereqs, Coreqs, Fees, Attributes, Description
)
SELECT DISTINCT
    src.cat_year,
    prog.ProgramID,
    src.catalog_id,
    src.course_title,
    src.credits,
    src.prereqs,
    src.coreqs,
    src.fees,
    src.attributes,
    src.description
FROM IMPORT_CATALOG_COURSES AS src
    JOIN PROGRAMS AS prog ON (src.program_code = prog.ProgramCode);

 * sqlite:///CourseData.db
0 rows affected.
4440 rows affected.


[]

In [10]:
%%sql
-- Rebuild COURSE_OFFERINGS from the staged offerings
DELETE FROM COURSE_OFFERINGS;

-- CourseID is the key of the matching catalog course, found through the
-- catalog year of the term plus the catalog id of the offering
-- It stays NULL for terms that have no catalog year mapping
-- The replace call accepts catalog years stored with an underscore
-- as well as the hyphenated form used by TERM_CATALOG_YEAR
-- NOTE(review) assumes at most one CATALOG_COURSES row per catalog year and
-- catalog id, and that both sources format catalog ids alike - please confirm
INSERT INTO COURSE_OFFERINGS (CourseID,Term,CRN,CatalogID,Section,Credits,Title,Timecodes,PrimaryInstructorID,Capacity, Actual, Remaining)
SELECT DISTINCT CATALOG_COURSES.CourseID,term,crn,catalog_id,section,import_course_offerings.credits,title,timecodes,InstructorID,cap,act,rem 
FROM import_course_offerings 
    LEFT JOIN INSTRUCTORS ON (import_course_offerings.primary_instructor=INSTRUCTORS.Name)
    LEFT JOIN TERM_CATALOG_YEAR USING (Term)
    LEFT JOIN CATALOG_COURSES ON (
        replace(CATALOG_COURSES.CatalogYear,'_','-') = TERM_CATALOG_YEAR.CatalogYear
        AND CATALOG_COURSES.CatalogID = import_course_offerings.catalog_id
    );

 * sqlite:///CourseData.db
0 rows affected.
15937 rows affected.


[]

In [11]:
%%sql
-- Spot check: offerings whose primary instructor name contains Huntley,
-- with the year taken from the last four characters of the term
SELECT
    offering.CatalogID,
    offering.Term,
    offering.CRN,
    offering.Section,
    offering.Capacity,
    offering.Actual,
    offering.Remaining,
    substr(offering.Term, -4) AS Year
FROM COURSE_OFFERINGS AS offering
    LEFT JOIN INSTRUCTORS AS instructor
        ON (offering.PrimaryInstructorID = instructor.InstructorID)
WHERE instructor.Name LIKE '%Huntley%'
ORDER BY Year, offering.Term DESC, offering.CatalogID, offering.Section;

 * sqlite:///CourseData.db
Done.


CatalogID,Term,CRN,Section,Capacity,Actual,Remaining,Year
IS 0100,Fall2014,70369,E,25,26,-1,2014
IS 0135,Fall2014,73060,A,25,26,-1,2014
IS 0320,Fall2014,73061,A,25,16,9,2014
IS 0100,Fall2015,75231,E,29,28,1,2015
IS 0100,Fall2015,75246,F,29,28,1,2015
IS 0135,Fall2015,76388,A,25,21,4,2015
IS 0320,Fall2015,76389,A,25,13,12,2015
IS 0585,Spring2016,38780,01,20,15,5,2016
OM 0101,Spring2016,37253,B,29,28,1,2016
OM 0101,Spring2016,37254,C,29,29,0,2016


In [12]:
%%sql
-- Clear earlier rows first, as the other load cells do,
-- so that re-running this cell does not duplicate locations
DELETE FROM LOCATIONS;

-- One row per distinct location code found in the staged meetings
INSERT INTO LOCATIONS (LocationCode)
SELECT DISTINCT Location 
FROM import_course_meetings
ORDER BY Location;

 * sqlite:///CourseData.db
207 rows affected.


[]

In [13]:
%%sql
-- Clear earlier rows first, as the other load cells do,
-- so that re-running this cell does not duplicate meetings
DELETE FROM COURSE_MEETINGS;

-- Match each staged meeting to its offering by term plus CRN
-- and to its location by location code
INSERT INTO COURSE_MEETINGS (CourseOfferingID,LocationID,StartDateTime,EndDateTime)
SELECT DISTINCT COURSE_OFFERINGS.CourseOfferingID, LocationID,`Start`,`End`
FROM import_course_meetings 
    JOIN COURSE_OFFERINGS USING (Term,CRN)
    JOIN LOCATIONS ON (import_course_meetings.Location = LOCATIONS.LocationCode);

 * sqlite:///CourseData.db
311141 rows affected.


[]

In [None]:
%%sql
-- Preview up to 100 of the loaded course meetings
SELECT *
FROM COURSE_MEETINGS
LIMIT 100;