## Import modules
- To begin, we import pandas and Python's built-in sqlite3 module (aliased as `sql`), which let us use Python to create our database.
- The sql magic extension will be used for all of our SQL-specific commands.

In [None]:
%load_ext sql
import pandas as pd
import sqlite3 as sql

## Create database
- Here we are creating a database titled 'CourseData.db'.
- The 'conn' variable acts as the bridge that allows us to interact with the database while using Python code.

In [None]:
# Bind the %sql / %%sql magics to the SQLite file CourseData.db.
%sql sqlite:///CourseData.db
# Separate sqlite3 connection to the same file, used by the pandas calls in later cells.
conn = sql.connect('CourseData.db')

![Team %%sql Project ERD](CourseDataERD.png)

## Create tables matching ERD infrastructure

In [None]:
%%sql

-- Create PROGRAMS table
-- Lookup table of programs (code and name). ProgramID is the surrogate key
-- that CATALOG_COURSES.ProgramID is filled from.
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    ProgramID INTEGER PRIMARY KEY,
    ProgramCode TEXT NOT NULL,
    ProgramName TEXT NOT NULL
);

-- Create CATALOG_COURSES table
-- Catalog entries keyed by the surrogate CourseID. ProgramID holds a
-- PROGRAMS.ProgramID value, but no foreign key constraint is declared.
DROP TABLE IF EXISTS CATALOG_COURSES;
CREATE TABLE CATALOG_COURSES (
    CourseID INTEGER PRIMARY KEY,
    CatalogYear TEXT NOT NULL,
    CatalogID TEXT NOT NULL,
    ProgramID INTEGER,
    CourseTitle TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Prereqs TEXT,
    Coreqs TEXT,
    Fees TEXT,
    Attributes TEXT,
    Description TEXT
);
-- Index on the column pair used to match offerings to catalog entries.
CREATE INDEX index_catalog_courses_alt_key on CATALOG_COURSES(CatalogYear,CatalogID);

-- Create COURSE_INSTRUCTORS table
-- One row per instructor name. InstructorID is the surrogate key stored in
-- COURSE_OFFERINGS.PrimaryInstructorID.
DROP TABLE IF EXISTS COURSE_INSTRUCTORS;
CREATE TABLE COURSE_INSTRUCTORS (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);
-- Index on Name, the column that imported offerings are matched on.
CREATE INDEX index_course_instructors_alt_key on COURSE_INSTRUCTORS(Name);

-- Create COURSE_OFFERINGS table
-- One row per imported offering. CourseID and PrimaryInstructorID are nullable
-- because they are filled through LEFT JOINs that may find no match.
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    CourseOfferingID INTEGER PRIMARY KEY,
    CourseID INTEGER,
    CatalogID TEXT NOT NULL,
    Term TEXT NOT NULL,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Credits REAL,
    Title TEXT NOT NULL,
    Timecodes TEXT,
    PrimaryInstructorID INTEGER,
    Capacity INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL
);
-- Index on the natural key columns of an offering.
CREATE INDEX index_course_offerings_alt_key on COURSE_OFFERINGS(Term,CatalogID,Section);

-- Create COURSE_LOCATIONS table
-- One row per location code. LocationID is the surrogate key stored in
-- COURSE_MEETINGS.LocationID.
DROP TABLE IF EXISTS COURSE_LOCATIONS;
CREATE TABLE COURSE_LOCATIONS (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);


-- Create COURSE_MEETINGS table
-- One row per meeting of an offering. CourseOfferingID and LocationID hold keys
-- of COURSE_OFFERINGS and COURSE_LOCATIONS, with no foreign key constraints
-- declared. Start and end values are stored as TEXT.
DROP TABLE IF EXISTS COURSE_MEETINGS;
CREATE TABLE COURSE_MEETINGS (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    StartDateTime TEXT NOT NULL,
    EndDateTime TEXT NOT NULL
);
-- Conversion table that maps each term to the catalog year it belongs to
-- Create TERM_TO_CATALOG_YEAR table
DROP TABLE IF EXISTS TERM_TO_CATALOG_YEAR;
CREATE TABLE TERM_TO_CATALOG_YEAR (
    CatalogYear TEXT NOT NULL,
    Term TEXT NOT NULL
);

-- Only terms from Fall2017 through Spring2019 are mapped. Offerings from any
-- other term get no catalog year and therefore no CourseID when they are loaded.
INSERT INTO TERM_TO_CATALOG_YEAR (Term, CatalogYear) VALUES 
('Fall2017','2017_2018'),('Winter2018','2017_2018'),('Spring2018','2017_2018'),('Summer2018','2017_2018'),('Fall2018','2018_2019'),
('Winter2019','2018_2019'),('Spring2019','2018_2019');

## Delete & Drop 'IMPORT' tables

In [None]:
%%sql
-- Remove any staging tables left over from a previous run so that the import
-- loop starts from nothing. DROP TABLE IF EXISTS does nothing when the table is
-- missing, whereas a bare DELETE raises an error on a fresh database, so no
-- DELETE is issued here. Dropping a table also discards its rows.
DROP TABLE IF EXISTS IMPORT_CATALOG_COURSES;
DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;

## Loop used to populate 'IMPORT' tables

In [None]:
# Catalog years that have a CourseCatalog<year>.csv file under SourceData/Catalogs.
catalog_years = ['2017_2018', '2018_2019']

for catalog_year in catalog_years:
    file_path = f'SourceData/Catalogs/CourseCatalog{catalog_year}.csv'
    # Tag every row with the catalog year it was read from before staging it.
    df_catalogs = pd.read_csv(file_path).assign(cat_year=catalog_year)
    df_catalogs.to_sql(name='IMPORT_CATALOG_COURSES', con=conn, if_exists='append', index=False)

# Terms that have a folder under SourceData holding courses.csv and course_meetings.csv.
semesters = [
    'Fall2014', 'Winter2015', 'Spring2015', 'Summer2015',
    'Fall2015', 'Winter2016', 'Spring2016', 'Summer2016',
    'Fall2016', 'Winter2017', 'Spring2017', 'SpringBreak2017', 'Summer2017',
    'Fall2017', 'Winter2018', 'Spring2018', 'Summer2018',
    'Fall2018', 'Spring2019',
]

for semester in semesters:
    # Offerings for this term are appended to the shared staging table.
    file_path = f'SourceData/{semester}/courses.csv'
    df_course_offerings = pd.read_csv(file_path)
    df_course_offerings.to_sql(name='IMPORT_COURSE_OFFERINGS', con=conn, if_exists='append', index=False)

    # Meetings for this term are appended to their own staging table.
    file_path = f'SourceData/{semester}/course_meetings.csv'
    df_course_meetings = pd.read_csv(file_path)
    df_course_meetings.to_sql(name='IMPORT_COURSE_MEETINGS', con=conn, if_exists='append', index=False)
    

## Check for duplicate data

In [None]:
%%sql
-- Compare the total row count with the number of fully distinct rows.
-- A Raw Count larger than the Distinct Count means exact duplicate rows exist.
SELECT
    COUNT(*) AS 'Raw Count',
    (SELECT COUNT(*) FROM (SELECT DISTINCT * FROM IMPORT_CATALOG_COURSES)) AS 'Distinct Count'
FROM IMPORT_CATALOG_COURSES;

In [None]:
%%sql
-- Compare the total row count with the number of fully distinct rows.
-- A Raw Count larger than the Distinct Count means exact duplicate rows exist.
SELECT
    COUNT(*) AS 'Raw Count',
    (SELECT COUNT(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS)) AS 'Distinct Count'
FROM IMPORT_COURSE_OFFERINGS;

In [None]:
%%sql
-- Compare the total row count with the number of fully distinct rows.
-- A Raw Count larger than the Distinct Count means exact duplicate rows exist.
SELECT
    COUNT(*) AS 'Raw Count',
    (SELECT COUNT(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS)) AS 'Distinct Count'
FROM IMPORT_COURSE_MEETINGS;

## Exact number of duplicate values
- The query below subtracts the count of distinct rows from the total row count to quantify the number of duplicate rows within the IMPORT_COURSE_MEETINGS table.

In [None]:
%%sql
-- Total rows minus fully distinct rows gives the number of surplus duplicate rows.
SELECT
    (SELECT COUNT(*) FROM IMPORT_COURSE_MEETINGS)
    - (SELECT COUNT(*) FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS))
    AS 'DuplicateValues';

## Detailed view of duplicate values

- The cell below identifies the duplicate values within IMPORT_COURSE_MEETINGS

In [None]:
%%sql

-- List each combination of term, crn, location, day and start that occurs more
-- than once in IMPORT_COURSE_MEETINGS. Rows are compared on these five columns
-- only, not on the full row as in the DISTINCT checks.
SELECT term,crn,location,day,start
FROM IMPORT_COURSE_MEETINGS
GROUP BY term,crn,location,day,start
HAVING COUNT(*)>1;

## Populate ERD tables with data from 'IMPORT' tables

In [None]:
%%sql
-- Reload COURSE_INSTRUCTORS from the staged offerings.
DELETE FROM COURSE_INSTRUCTORS;

-- One row per distinct primary instructor. The placeholder TBA and any name
-- containing a slash are skipped, so offerings with those values end up with no
-- PrimaryInstructorID.
-- ORDER BY inserts the names in a fixed order, as the PROGRAMS and
-- COURSE_LOCATIONS loads do, so that InstructorID values are the same on every
-- rebuild.
INSERT INTO COURSE_INSTRUCTORS (Name)
SELECT DISTINCT primary_instructor
FROM IMPORT_COURSE_OFFERINGS
WHERE primary_instructor <> 'TBA' AND primary_instructor NOT LIKE '%/%'
ORDER BY primary_instructor;

In [None]:
%%sql
-- Reload PROGRAMS from the staged catalog rows.
DELETE FROM PROGRAMS;

-- One row per distinct pair of program code and program name, inserted in
-- program_code order.
INSERT INTO PROGRAMS (ProgramCode,ProgramName)
SELECT DISTINCT program_code,program_name 
FROM IMPORT_CATALOG_COURSES
ORDER BY program_code;

In [None]:
%%sql
-- Reload CATALOG_COURSES from the staged catalog rows.
DELETE FROM CATALOG_COURSES;

-- PROGRAMS holds one row per distinct pair of program code and program name, so
-- the join matches on both columns. Matching on the code alone would repeat a
-- course once for every name recorded under the same code.
INSERT INTO CATALOG_COURSES (CatalogYear,ProgramID,CatalogID,CourseTitle,Credits,Prereqs,Coreqs,Fees,Attributes,Description)
SELECT DISTINCT cat_year, ProgramID,catalog_id,course_title,credits,prereqs,coreqs,fees,attributes,description
FROM IMPORT_CATALOG_COURSES
    JOIN PROGRAMS ON (program_code = ProgramCode AND program_name = ProgramName);

In [None]:
%%sql
-- Reload COURSE_OFFERINGS from the staged offerings.
DELETE FROM COURSE_OFFERINGS;

-- Every join is a LEFT JOIN, so each staged offering is kept even without a match.
-- PrimaryInstructorID is NULL when primary_instructor has no row in COURSE_INSTRUCTORS.
-- CourseID is NULL when the term has no entry in TERM_TO_CATALOG_YEAR or when
-- no catalog entry exists for that catalog year and catalog_id.
INSERT INTO COURSE_OFFERINGS (CourseID,Term,CRN,CatalogID,Section,Credits,Title,Timecodes,PrimaryInstructorID,Capacity, Actual, Remaining)
SELECT DISTINCT CourseID,term,crn,catalog_id,section,IMPORT_COURSE_OFFERINGS.credits,title,timecodes,InstructorID,cap,act,rem 
FROM IMPORT_COURSE_OFFERINGS 
    LEFT JOIN COURSE_INSTRUCTORS ON (primary_instructor=COURSE_INSTRUCTORS.Name)
    LEFT JOIN TERM_TO_CATALOG_YEAR USING (Term)
    LEFT JOIN CATALOG_COURSES ON (catalog_id = CatalogID AND CATALOG_COURSES.CatalogYear = TERM_TO_CATALOG_YEAR.CatalogYear)
;

In [None]:
%%sql
-- Reload COURSE_LOCATIONS from the staged meetings.
DELETE FROM COURSE_LOCATIONS;

-- One row per distinct location code, inserted in Location order.
-- The statement is terminated explicitly, like every other statement in this
-- notebook, so that a statement appended later cannot run into it.
INSERT INTO COURSE_LOCATIONS (LocationCode)
SELECT DISTINCT Location
FROM IMPORT_COURSE_MEETINGS
ORDER BY Location;

In [None]:
%%sql
-- Reload COURSE_MEETINGS from the staged meetings.
DELETE FROM COURSE_MEETINGS;

-- The inner join on Term and CRN keeps only meetings whose offering exists in
-- COURSE_OFFERINGS. The location code is translated to its LocationID through
-- COURSE_LOCATIONS.
INSERT INTO COURSE_MEETINGS (CourseOfferingID,LocationID,StartDateTime,EndDateTime)
SELECT DISTINCT COURSE_OFFERINGS.CourseOfferingID, LocationID,start,end
FROM IMPORT_COURSE_MEETINGS 
    JOIN COURSE_OFFERINGS USING (Term,CRN)
    LEFT JOIN COURSE_LOCATIONS ON (IMPORT_COURSE_MEETINGS.Location = COURSE_LOCATIONS.LocationCode);

## Clear storage space

In [None]:
%%sql

-- The staging tables are no longer needed once the normalized tables are loaded.
-- DROP TABLE IF EXISTS discards the rows together with the table and does
-- nothing when the table is already gone, so this cell can be run again safely.
-- A bare DELETE would raise an error on the second run, so none is issued.
DROP TABLE IF EXISTS IMPORT_CATALOG_COURSES;
DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;

-- Rebuild the database file so the space freed by the dropped tables is released.
vacuum;

![StarSchemaERD](StarSchemaERD.png)

## Create DataWarehouse.db

In [None]:
# Point the %sql / %%sql magics at the SQLite file CourseDataWarehouse.db.
%sql sqlite:///CourseDataWarehouse.db
# Separate sqlite3 connection to the warehouse file, used by the pandas calls in later cells.
conn_dw = sql.connect('CourseDataWarehouse.db')

In [None]:
%%sql

-- Create Fact Table
-- Holds ProgramID plus the keys of CATALOG_DIMENSION, COURSE_DIMENSION and
-- LOCATION_DIMENSION, together with three Total columns.
-- NOTE(review) the Total columns are declared TEXT. If they are meant to hold
-- counts, INTEGER would be the usual choice. Confirm before loading data.
DROP TABLE IF EXISTS FACT_TABLE;
CREATE TABLE FACT_TABLE (
    ProgramID INTEGER,
    CourseID INTEGER NOT NULL,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    TotalCatalogs TEXT,
    TotalInstructors TEXT,
    TotalClassrooms TEXT
);

-- Create CATALOG_DIMENSION table
-- Denormalized catalog entry. It carries the CATALOG_COURSES columns together
-- with ProgramCode and ProgramName from PROGRAMS, plus a Term column.
DROP TABLE IF EXISTS CATALOG_DIMENSION;
CREATE TABLE CATALOG_DIMENSION (
    CourseID INTEGER PRIMARY KEY,
    CatalogID TEXT NOT NULL,
    ProgramID INTEGER NOT NULL,
    ProgramCode TEXT NOT NULL,
    ProgramName TEXT NOT NULL,
    Credits TEXT NOT NULL,
    CourseTitle TEXT NOT NULL,
    CatalogYear TEXT NOT NULL,
    Prereqs TEXT,
    Coreqs TEXT,
    Fees TEXT,
    Attributes TEXT,
    Description TEXT,
    Term TEXT   
);

-- Create LOCATION_DIMENSION table
-- Same two columns as COURSE_LOCATIONS in CourseData.db.
DROP TABLE IF EXISTS LOCATION_DIMENSION;
CREATE TABLE LOCATION_DIMENSION (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);

-- Create TIME_DIMENSION table
-- One row per course meeting. CourseOfferingID is declared INTEGER so that it
-- has the same type as COURSE_DIMENSION.CourseOfferingID and
-- FACT_TABLE.CourseOfferingID, the columns it corresponds to.
DROP TABLE IF EXISTS TIME_DIMENSION;
CREATE TABLE TIME_DIMENSION (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    Timecodes TEXT,
    StartDateTime TEXT,
    EndDateTime TEXT
);

-- Create COURSE_DIMENSION table
-- One row per course offering. CRN, Capacity, Actual and Remaining are declared
-- INTEGER, matching COURSE_OFFERINGS in CourseData.db, so that they are stored,
-- sorted and aggregated as numbers instead of as text.
DROP TABLE IF EXISTS COURSE_DIMENSION;
CREATE TABLE COURSE_DIMENSION (
    CourseOfferingID INTEGER PRIMARY KEY,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Title TEXT NOT NULL,
    Capacity INTEGER NOT NULL,
    Actual INTEGER,
    Remaining INTEGER
);

-- Create INSTRUCTOR_DIMENSION table
-- Same two columns as COURSE_INSTRUCTORS in CourseData.db.
DROP TABLE IF EXISTS INSTRUCTOR_DIMENSION;
CREATE TABLE INSTRUCTOR_DIMENSION (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);


## Query data from CourseData.db to CourseDataWarehouse.db

In [None]:
# SQL text run against CourseData.db: every catalog course joined to its program
# row on ProgramID, so each result row carries the program code and name as well.
CourseDataDB_Query = """
SELECT DISTINCT *
FROM CATALOG_COURSES 
    JOIN PROGRAMS USING(ProgramID)
"""

In [None]:
%sql DELETE FROM IMPORT_COURSE_DATA_DB;
%sql DROP TABLE IF EXISTS IMPORT_COURSE_DATA_DB;
CourseDataDB = pd.read_sql(sql=CourseDataDB_Query, con=conn)
CourseDataDB.to_sql('IMPORT_COURSE_DATA_DB',conn_dw,if_exists='append',index=False)

In [None]:
%%sql
-- Show a single row to confirm that the staging table exists and has data.
SELECT *
FROM IMPORT_COURSE_DATA_DB
LIMIT 1;

In [None]:
# TODO: unfinished draft, left commented out because it cannot run as written.
# Known problems to fix before enabling it:
#   - the INSERT lists 14 columns but the SELECT supplies only 10, in a different order
#   - "CataologID" is misspelled (the CATALOG_DIMENSION column is CatalogID)
#   - the final JOIN ... USING( clause is incomplete
#   - it reads IMPORT_CATALOG_COURSES and PROGRAMS, which are not created in the
#     warehouse database; IMPORT_COURSE_DATA_DB looks like the intended source (confirm)
# %%sql
# DELETE FROM CATALOG_DIMENSION;
# INSERT INTO CATALOG_DIMENSION (CourseID,CataologID,ProgramID,ProgramCode,ProgramName,Credits,CourseTitle,CatalogYear,Fees,Attributes,
#                                Description,Prereqs,Coreqs,Term)
# SELECT DISTINCT CourseID, ProgramID,catalog_id,course_title,credits,prereqs,coreqs,fees,attributes,description
# FROM IMPORT_CATALOG_COURSES 
#     JOIN PROGRAMS USING(;