In [1]:
%load_ext sql
import pandas as pd
import sqlite3 as sql

# Bind the %sql / %%sql magics to the SQLite database file CourseData.db.
%sql sqlite:///CourseData.db
# Open a second, direct connection to the same file for use from Python code.
# Note: `sql` here is the sqlite3 module (imported above as `sql`), not the
# ipython-sql extension loaded by %load_ext.
conn = sql.connect('CourseData.db')

![Team %%sql Project ERD](CourseDataERD.png)

In [None]:
%%sql

-- Create PROGRAMS table
-- The table is dropped first so this cell can be re-run from scratch.
-- ProgramID is an auto-incrementing integer key, the code and name are both required.
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    ProgramID INTEGER PRIMARY KEY AUTOINCREMENT,
    ProgramCode TEXT NOT NULL,
    ProgramName TEXT NOT NULL
);

-- Create COURSE_CATALOG table
-- The table is dropped first so this cell can be re-run from scratch.
-- Credits is stored as TEXT in this table.
-- ProgramID is nullable and no FOREIGN KEY constraint is declared
-- (presumably it refers to PROGRAMS.ProgramID, TODO confirm).
DROP TABLE IF EXISTS COURSE_CATALOG;
CREATE TABLE COURSE_CATALOG (
    CourseID INTEGER PRIMARY KEY,
    CatalogYear TEXT NOT NULL,
    CatalogID TEXT NOT NULL,
    ProgramID INTEGER,
    CourseTitle TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Prereqs TEXT,
    Coreqs TEXT,
    Fees TEXT,
    Attributes TEXT,
    Description TEXT
);
-- Lookup index on (CatalogYear, CatalogID). It is not declared UNIQUE, so it
-- speeds up lookups on that pair but does not enforce uniqueness.
-- SQLite removes a table's indexes when the table is dropped, so re-running
-- this cell does not collide with an existing index of the same name.
CREATE INDEX index_course_catalog_alt_key on COURSE_CATALOG(CatalogYear,CatalogID);

-- Create COURSE_INSTRUCTORS table
-- The table is dropped first so this cell can be re-run from scratch.
DROP TABLE IF EXISTS COURSE_INSTRUCTORS;
CREATE TABLE COURSE_INSTRUCTORS (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);
-- Lookup index on Name. It is not declared UNIQUE, so two instructors
-- with the same name are allowed.
CREATE INDEX index_course_instructors_alt_key on COURSE_INSTRUCTORS(Name);

-- Create COURSE_OFFERINGS table
-- The table is dropped first so this cell can be re-run from scratch.
-- Credits is REAL here, whereas COURSE_CATALOG.Credits is TEXT.
-- CourseID and PrimaryInstructorID are nullable and no FOREIGN KEY constraints
-- are declared (presumably they refer to COURSE_CATALOG and COURSE_INSTRUCTORS,
-- TODO confirm).
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    CourseOfferingID INTEGER PRIMARY KEY,
    CourseID INTEGER,
    CatalogID TEXT NOT NULL,
    Term TEXT NOT NULL,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Credits REAL,
    Title TEXT NOT NULL,
    Timecodes TEXT,
    PrimaryInstructorID INTEGER,
    Capacity INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL
);
-- Lookup index on (Term, CatalogID, Section). It is not declared UNIQUE,
-- so it does not enforce uniqueness of that combination.
CREATE INDEX index_course_offerings_alt_key on COURSE_OFFERINGS(Term,CatalogID,Section);

-- Create COURSE_LOCATIONS table
-- The table is dropped first so this cell can be re-run from scratch.
-- LocationCode is required but carries no UNIQUE constraint.
DROP TABLE IF EXISTS COURSE_LOCATIONS;
CREATE TABLE COURSE_LOCATIONS (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);


-- Create COURSE_MEETINGS table
-- The table is dropped first so this cell can be re-run from scratch.
-- StartDateTime and EndDateTime are stored as TEXT.
-- CourseOfferingID and LocationID are required but no FOREIGN KEY constraints
-- are declared (presumably they refer to COURSE_OFFERINGS and COURSE_LOCATIONS,
-- TODO confirm).
DROP TABLE IF EXISTS COURSE_MEETINGS;
CREATE TABLE COURSE_MEETINGS (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    StartDateTime TEXT NOT NULL,
    EndDateTime TEXT NOT NULL
);
-- Create TERM_TO_CATALOG_YEAR table
-- Conversion table that maps each term name to its catalog year.
-- The table is dropped first so this cell can be re-run from scratch.
DROP TABLE IF EXISTS TERM_TO_CATALOG_YEAR;
CREATE TABLE TERM_TO_CATALOG_YEAR (
    CatalogYear TEXT NOT NULL,
    Term TEXT NOT NULL
);

-- Seed the mapping, one term per row.
INSERT INTO TERM_TO_CATALOG_YEAR (Term, CatalogYear)
VALUES
    ('Fall2017',   '2017_2018'),
    ('Winter2018', '2017_2018'),
    ('Spring2018', '2017_2018'),
    ('Summer2018', '2017_2018'),
    ('Fall2018',   '2018_2019'),
    ('Winter2019', '2018_2019'),
    ('Spring2019', '2018_2019');

In [3]:
# Path building blocks for the raw CSV exports.
# src_path ends with a slash and the file names start with one, so a full path
# can be formed by plain concatenation: src_path + <term folder> + <file name>.
src_path = 'SourceData/'
course_csv_filename, meeting_csv_filename = '/courses.csv', '/course_meetings.csv'

In [21]:
# List to loop over catalog csv files
df_catalogs_export_data_csv = pd.DataFrame()
catalog_years = ['2017_2018', '2018_2019']

# Loop to import catalog data into our database.db
%sql DROP TABLE IF EXISTS IMPORT_COURSE_CATALOG;
for catalog_year in catalog_years:
    file_path = 'SourceData/Catalogs/CourseCatalog' + catalog_year + '.csv'
    df_catalogs = pd.read_csv(file_path)
    df_catalogs['cat_year'] = catalog_year # Add column with 'catalog_year'
    df_catalogs_export_data_csv = df_catalogs.append(df_catalogs_export_data_csv) # append to a df that will get exported to csv for visual purposes
    df_catalogs.to_sql('IMPORT_COURSE_CATALOG',conn,if_exists='append',index=False) # append: Insert new values to the existing table.

 * sqlite:///CourseData.db
Done.


In [22]:
#list with semesters to loop path
semesters = ['Fall2014','Winter2015','Spring2015','Summer2015','Fall2015','Winter2016',
             'Spring2016','Summer2016','Fall2016','Winter2017','Spring2017','SpringBreak2017',
             'Summer2017','Fall2017','Winter2018','Spring2018','Summer2018','Fall2018',
             'Spring2019'
            ]

%sql DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
%sql DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;
# Loop to import course offerings and course meetings data into database.db
for semester in semesters:
    file_path = 'SourceData/' + semester + '/courses.csv'
    df_course_offerings = pd.read_csv(file_path)
    df_course_offerings.to_sql('IMPORT_COURSE_OFFERINGS',conn,if_exists='append',index=False)
    
    file_path = 'SourceData/' + semester + '/course_meetings.csv'
    df_course_meetings = pd.read_csv(file_path)
    df_course_meetings.to_sql('IMPORT_COURSE_MEETINGS',conn,if_exists='append',index=False)
    

 * sqlite:///CourseData.db
Done.
 * sqlite:///CourseData.db
Done.


In [23]:
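# Optional: uncomment the lines below to export the imported frames to CSV for visual inspection.
# Note: df_course_offerings and df_course_meetings hold only the last semester loaded by the loop above.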

#df_course_offerings.to_csv('Data_Dump_course.csv', index=None)
#df_course_meetings.to_csv('Data_Dump_meeting.csv', index=None)
#df_catalogs_export_data_csv.to_csv('Data_Dump_catalogs.csv', index=None)

In [24]:
%%sql
-- Duplicate check for IMPORT_COURSE_CATALOG:
-- compare the total row count with the number of fully distinct rows.
-- The two numbers differ only when exact duplicate rows were imported.
WITH all_rows AS (
    SELECT COUNT(*) AS n
    FROM IMPORT_COURSE_CATALOG
),
unique_rows AS (
    SELECT COUNT(*) AS n
    FROM (SELECT DISTINCT * FROM IMPORT_COURSE_CATALOG)
)
SELECT
    all_rows.n AS 'Raw Count',
    unique_rows.n AS 'Distinct Count'
FROM all_rows, unique_rows;

 * sqlite:///CourseData.db
Done.


Raw Count,Distinct Count
4440,4440


In [25]:
%%sql
-- Duplicate check for IMPORT_COURSE_OFFERINGS:
-- compare the total row count with the number of fully distinct rows.
-- The two numbers differ only when exact duplicate rows were imported.
WITH all_rows AS (
    SELECT COUNT(*) AS n
    FROM IMPORT_COURSE_OFFERINGS
),
unique_rows AS (
    SELECT COUNT(*) AS n
    FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS)
)
SELECT
    all_rows.n AS 'Raw Count',
    unique_rows.n AS 'Distinct Count'
FROM all_rows, unique_rows;

 * sqlite:///CourseData.db
Done.


Raw Count,Distinct Count
15937,15937


In [26]:
%%sql
-- Duplicate check for IMPORT_COURSE_MEETINGS:
-- compare the total row count with the number of fully distinct rows.
-- The two numbers differ only when exact duplicate rows were imported.
WITH all_rows AS (
    SELECT COUNT(*) AS n
    FROM IMPORT_COURSE_MEETINGS
),
unique_rows AS (
    SELECT COUNT(*) AS n
    FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS)
)
SELECT
    all_rows.n AS 'Raw Count',
    unique_rows.n AS 'Distinct Count'
FROM all_rows, unique_rows;

 * sqlite:///CourseData.db
Done.


Raw Count,Distinct Count
284907,284847


In [32]:
%%sql
-- List every term / crn / location / day / start combination that occurs
-- more than once in the imported course meetings.
SELECT
    term,
    crn,
    location,
    day,
    start
FROM IMPORT_COURSE_MEETINGS
GROUP BY
    term,
    crn,
    location,
    day,
    start
HAVING COUNT(*) > 1;

 * sqlite:///CourseData.db
Done.


term,crn,location,day,start
Fall2014,73073,MCA 102,M,2014-09-08T18:30:00
Fall2014,73073,MCA 102,M,2014-09-15T18:30:00
Fall2014,73073,MCA 102,M,2014-09-22T18:30:00
Fall2014,73073,MCA 102,M,2014-09-29T18:30:00
Fall2014,73073,MCA 102,M,2014-10-06T18:30:00
Fall2014,73073,MCA 102,M,2014-10-20T18:30:00
Fall2014,73073,MCA 102,M,2014-10-27T18:30:00
Fall2014,73073,MCA 102,M,2014-11-03T18:30:00
Fall2014,73073,MCA 102,M,2014-11-10T18:30:00
Fall2014,73073,MCA 102,M,2014-11-17T18:30:00
