# ETL Project

Foreign Keys
* searchword_id
* VideoCategory
* CountryCode

In [19]:
# import dependencies

import pandas as pd
from sqlalchemy import create_engine
from config import username, password

# Part 1: Load Data

In [20]:
# read CSV

# YouTube Videos
CA_videos = 'data/CAvideos.csv'
DE_videos = 'data/DEvideos.csv'
FR_videos = 'data/FRvideos.csv'
GB_videos = 'data/GBvideos.csv'
IN_videos = 'data/INvideos.csv'
JP_videos = 'data/JPvideos.csv'
KR_videos = 'data/KRvideos.csv'
MX_videos = 'data/MXvideos.csv'
RU_videos = 'data/RUvideos.csv'
US_videos = 'data/USvideos.csv'
CA_videos_df = pd.read_csv(CA_videos, encoding='utf-8', dtype = 'unicode')
DE_videos_df = pd.read_csv(DE_videos, encoding='utf-8', dtype = 'unicode')
FR_videos_df = pd.read_csv(FR_videos, encoding='utf-8', dtype = 'unicode')
GB_videos_df = pd.read_csv(GB_videos, encoding='utf-8', dtype = 'unicode')
IN_videos_df = pd.read_csv(IN_videos, encoding='utf-8', dtype = 'unicode')
JP_videos_df = pd.read_csv(JP_videos, encoding='latin1', dtype = 'unicode')
KR_videos_df = pd.read_csv(KR_videos, encoding='latin1', dtype = 'unicode')
MX_videos_df = pd.read_csv(MX_videos, encoding='latin1', dtype = 'unicode')
RU_videos_df = pd.read_csv(RU_videos, encoding='latin1', dtype = 'unicode')
US_videos_df = pd.read_csv(US_videos, encoding='utf-8', dtype = 'unicode')

# Ted Talks
TED_Talks_query = 'data/QueryResult.csv'
tedtalks = pd.read_csv(TED_Talks_query)
TED_Talks_More_load = 'data/TED_Talks_more.csv'
tedtalks_more = pd.read_csv(TED_Talks_More_load)

# edX
# source: https://www.kaggle.com/edx/course-study
edx_df = pd.read_csv('data/edx_data.csv')

# YouTube Metadata
# NOTE: JSON files are in a separate notebook, "YouTube Video Stats Metadata"
youtube_metadata = pd.read_csv('data/youtube_metadata.csv')

In [21]:
# read HTML

# List of MOOCs
url_mooc_table = 'https://en.wikipedia.org/wiki/List_of_MOOC_providers'
mooc_table = pd.read_html(url_mooc_table)
mooc_df = mooc_table[0]

# Attributes of MOOCs
url_mooc_attributes = 'https://en.wikipedia.org/wiki/Massive_open_online_course'
attribute_table = pd.read_html(url_mooc_attributes)
attr_df = attribute_table[3]

----------------

# Part 2: Transform

### Table 1: Search_Word

## YouTube

### Table X: YouTube Cateories

In [22]:
# all categories are the same across countries, so we don't need this entire df

youtube_metadata = youtube_metadata[['Country', 'CountryCode', 'CategoryID','VideoCategory']]

In [23]:
# we can see now that the categories are the same across countries, so we don't need this entire df

youtube_categories = youtube_metadata.drop_duplicates(subset=['CategoryID', 'VideoCategory'], 
                                                      keep='first')[['CategoryID','VideoCategory']]

In [24]:
# final table
youtube_categories

Unnamed: 0,CategoryID,VideoCategory
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports
5,18,Short Movies
6,19,Travel & Events
7,20,Gaming
8,21,Videoblogging
9,22,People & Blogs


### Table X: YouTube Countries

In [25]:
youtube_countries = youtube_metadata.drop_duplicates(subset=['Country', 'CountryCode'], 
                                                     keep='first')[['Country', 'CountryCode']]

In [26]:
# final table
youtube_countries

Unnamed: 0,Country,CountryCode
0,Canada,CA
31,Germany,DE
62,France,FR
93,Great Britain,GB
124,France,IN
155,Japan,JP
186,South Korea,KR
217,Japan,MX
248,Russia,RU
279,United States,US


## Ted Talks

## EdX

### Table X: MOOC_List

In [27]:
# remove the Wikipedia reference labels that are in brackets by splitting the 'Name' column

cleaned_names = mooc_df['Name'].str.split("[", n = 1, expand = True)
mooc_df['Name'] = cleaned_names[0]

In [28]:
# select only the Type, Founded, and Headquarters information for each MOOC

mooc_df = mooc_df[['Name', 'Type', 'Founded', 'Headquarters']]

In [29]:
# rename columns 

mooc_df = mooc_df.rename(columns={"Name": "MOOC"})

In [30]:
# final table
mooc_df.head()

Unnamed: 0,MOOC,Type,Founded,Headquarters
0,Alison,Commercial,2007,Ireland
1,Canvas Network,Commercial,2008,USA
2,Coursera,Commercial,2012,USA
3,edX,Non-profit,2012,USA
4,FutureLearn,Commercial,2012,UK


### Table X: MOOC_Attributes

In [31]:
# Rename columns to conform to database structure and to format properly

attr_df = attr_df.rename(columns={"Initiatives": "MOOC",
                                  "Free to access": "FreetoAccess",
                                  "Certification fee": "CertificationFee",
                                  "Institutional credits": "InstitutionalCredits"})

In [32]:
# final table

attr_df.head()

Unnamed: 0,MOOC,Nonprofit,FreetoAccess,CertificationFee,InstitutionalCredits
0,edX,Yes,Partial,Yes,Partial
1,Coursera,No,Partial,Yes,Partial
2,Udacity,No,Partial,Yes,Partial
3,Udemy,No,Partial,Yes,Partial
4,P2PU,Yes,Yes,No,No


### Table X: edX_Courses

In [33]:
# rename columns
edx_df = edx_df.rename(columns={"Course Number": "CourseNumber",
                        "Launch Date": "LaunchDate",
                        "Course Title": "CourseTitle", 
                        "Course Subject": "CourseSubject",
                        "Honor Code Certificates": "HonorCodeCert",
                        "Participants (Course Content Accessed)": "Participants",
                        "Audited (> 50% Course Content Accessed)": "Audited",
                        "% Audited": "Audited_Pct",
                        "% Certified": "Certified_Pct",
                        "% Certified of > 50% Course Content Accessed": "AccessedHalf_Certified_Pct",
                        "% Played Video": "PlayedVideo_Pct",
                        "% Posted in Forum": "Posted_Pct",
                        "% Certified": "Certified_Pct",
                        "% Grade Higher Than Zero": "GradeAboveZero_Pct",
                        "Total Course Hours (Thousands)": "CourseHouse_Thou",
                        "Median Hours for Certification": "MerdianCertHours",
                        "Median Age": "MedianAge", 
                        "% Male": "Male_Pct",
                        "% Female": "Female_Pct",
                        "% Bachelor's Degree or Higher": "BachelorsPlus_Pct"})

In [34]:
# add a column that indicates the name of the MOOC and the type of videos

edx_df["MOOC"] = 'edX'
edx_df["VideoCategory"] = 'Education'

In [35]:
# re-order columns

edx_df = edx_df[['MOOC', 'Institution', 'CourseNumber', 'LaunchDate', 'CourseTitle',
       'Instructors', 'CourseSubject', 'VideoCategory', 'Year', 'HonorCodeCert', 'Participants',
       'Audited', 'Certified', 'Audited_Pct', 'Certified_Pct',
       'AccessedHalf_Certified_Pct', 'PlayedVideo_Pct', 'Posted_Pct',
       'GradeAboveZero_Pct', 'CourseHouse_Thou', 'MerdianCertHours',
       'MedianAge', 'Male_Pct', 'Female_Pct', 'BachelorsPlus_Pct']]

In [36]:
# final df

edx_df.head()

Unnamed: 0,MOOC,Institution,CourseNumber,LaunchDate,CourseTitle,Instructors,CourseSubject,VideoCategory,Year,HonorCodeCert,...,AccessedHalf_Certified_Pct,PlayedVideo_Pct,Posted_Pct,GradeAboveZero_Pct,CourseHouse_Thou,MerdianCertHours,MedianAge,Male_Pct,Female_Pct,BachelorsPlus_Pct
0,edX,MITx,6.002x,09/05/2012,Circuits and Electronics,Khurram Afridi,"Science, Technology, Engineering, and Mathematics",Education,1,1,...,54.98,83.2,8.17,28.97,418.94,64.45,26.0,88.28,11.72,60.68
1,edX,MITx,6.00x,09/26/2012,Introduction to Computer Science and Programming,"Eric Grimson, John Guttag, Chris Terman",Computer Science,Education,1,1,...,64.05,89.14,14.38,39.5,884.04,78.53,28.0,83.5,16.5,63.04
2,edX,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,Michael Cima,"Science, Technology, Engineering, and Mathematics",Education,1,1,...,72.85,87.49,14.42,34.89,227.55,61.28,27.0,70.32,29.68,58.76
3,edX,HarvardX,CS50x,10/15/2012,Introduction to Computer Science,"David Malan, Nate Hardison, Rob Bowden, Tommy ...",Computer Science,Education,1,1,...,11.11,0.0,0.0,1.11,220.9,0.0,28.0,80.02,19.98,58.78
4,edX,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano","Government, Health, and Social Science",Education,1,1,...,47.12,77.45,15.98,32.52,804.41,76.1,32.0,56.78,43.22,88.33


----------------

# Part 3: Load

In [39]:
engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/online_ed')
con = engine.connect()

In [40]:
# create tables

mooc_df.to_sql(name='MOOC_List', con=engine, if_exists='replace', index=False)
mooc_df.to_sql(name='MOOC_Attributes', con=engine, if_exists='replace', index=False)
youtube_categories.to_sql(name='YouTube_Categories', con=engine, if_exists='replace', index=False)
youtube_countries.to_sql(name='YouTube_Countries', con=engine, if_exists='replace', index=False)
edx_df.to_sql(name='edX_Courses', con=engine, if_exists='replace', index=False)

# TO DO:

In [None]:
# add primary key

with engine.connect() as con:
    con.execute('ALTER TABLE `example_table` ADD PRIMARY KEY (`ID_column`);')