# MOOC Information from Wikipedia

In [1]:
# dependencies
from urllib.parse import quote

import pandas as pd

## List of MOOCs

### Scrape from Wikipedia

In [2]:
# Wikipedia page listing MOOC providers
url_mooc_table = "https://en.wikipedia.org/wiki/List_of_MOOC_providers"

In [3]:
# read_html returns a list with one DataFrame per HTML table it parses
mooc_table = pd.read_html(io=url_mooc_table)

In [4]:
# Work with the first table parsed from the providers page.
# Note: mooc_df refers to the same object as mooc_table[0], not a copy.
mooc_df = mooc_table[0]
mooc_df.head(n=5)

Unnamed: 0,Name,Description,Topics,Education level,Website,Cost,Languages courses are available in,Provider/s,Type,Content License,Founded,Headquarters
0,Alison,,"IT, Language, Science, Health, Humanities, Sof...",,[1],Freemium,,Alison,Commercial,,2007,Ireland
1,Canvas Network,,,"K12, Higher education",[2],,,"Instructure, Inc",Commercial,"All Rights Reserved, registration required.",2008,USA
2,Coursera,,,,[3],"Subscription, some free courses","English, Hindi, Spanish, French, Chinese, Arab...",,Commercial,Different course licenses,2012,USA
3,edX,,,,[4],,"English, Spanish, French, Mandarin, Italian, R...",,Non-profit,All rights reserved,2012,USA
4,FutureLearn[5][6],,,,[7],"Free access to courses, paid certification[8]",,,Commercial,All rights reserved.,2012,UK


In [5]:
# Strip Wikipedia reference labels such as "[5][6]" from 'Name' by keeping
# only the text that comes before the first "["

mooc_df['Name'] = mooc_df['Name'].str.split("[", n=1).str[0]

In [6]:
# Show the column labels of the first scraped table
mooc_table[0].columns

Index(['Name', 'Description', 'Topics', 'Education level', 'Website', 'Cost',
       'Languages courses are available in', 'Provider/s', 'Type',
       'Content License', 'Founded', 'Headquarters'],
      dtype='object')

In [7]:
# Keep the provider name together with its Type, Founded and Headquarters
# columns; every other column is dropped

mooc_df = mooc_df.loc[:, ['Name', 'Type', 'Founded', 'Headquarters']]

In [8]:
# Rename the 'Name' column to 'MOOC'

mooc_df = mooc_df.rename({"Name": "MOOC"}, axis="columns")

In [19]:
# Preview the first rows of the trimmed provider table
mooc_df.head(n=5)

Unnamed: 0,MOOC,Type,Founded,Headquarters
0,Alison,Commercial,2007,Ireland
1,Canvas Network,Commercial,2008,USA
2,Coursera,Commercial,2012,USA
3,edX,Non-profit,2012,USA
4,FutureLearn,Commercial,2012,UK


## MOOC Attributes

### Scrape from Wikipedia

In [10]:
# Wikipedia article on massive open online courses
url_mooc_attributes = "https://en.wikipedia.org/wiki/Massive_open_online_course"

In [11]:
# Parse every HTML table in the article into a list of DataFrames
attribute_table = pd.read_html(io=url_mooc_attributes)

In [12]:
# Select the attribute table by its 'Initiatives' header instead of by a
# fixed position in the list: the number and order of tables on a Wikipedia
# page can change, and a positional index would then silently pick up an
# unrelated table.
attr_df = next(
    (table for table in attribute_table if "Initiatives" in table.columns),
    None,
)
if attr_df is None:
    # Fail loudly rather than carrying the wrong table into the load step.
    raise ValueError(
        "No scraped table has an 'Initiatives' column; the page layout may have changed"
    )
attr_df.head()

Unnamed: 0,Initiatives,Nonprofit,Free to access,Certification fee,Institutional credits
0,edX,Yes,Partial,Yes,Partial
1,Coursera,No,Partial,Yes,Partial
2,Udacity,No,Partial,Yes,Partial
3,Udemy,No,Partial,Yes,Partial
4,P2PU,Yes,Yes,No,No


In [16]:
# Column renames made (per the original note) to conform to the database
# structure: 'Initiatives' becomes 'MOOC' and the multi-word headers lose
# their spaces

attr_df = attr_df.rename(
    {
        "Initiatives": "MOOC",
        "Free to access": "FreetoAccess",
        "Certification fee": "CertificationFee",
        "Institutional credits": "InstitutionalCredits",
    },
    axis="columns",
)

In [17]:
# Preview the first rows of the renamed attribute table
attr_df.head(n=5)

Unnamed: 0,MOOC,Nonprofit,FreetoAccess,CertificationFee,InstitutionalCredits
0,edX,Yes,Partial,Yes,Partial
1,Coursera,No,Partial,Yes,Partial
2,Udacity,No,Partial,Yes,Partial
3,Udemy,No,Partial,Yes,Partial
4,P2PU,Yes,Yes,No,No


# Load

In [20]:
# import dependencies

from sqlalchemy import create_engine
from config import username, password

In [21]:
# Build the PostgreSQL connection URL. The credentials are percent-encoded
# first: characters such as '@', ':', '/' or '+' in a username or password
# would otherwise be read as URL syntax and break or corrupt the parsed URL.
# NOTE(review): the target database is named 'employee_db' — confirm this is
# the intended database for the MOOC tables.
engine = create_engine(
    f'postgresql://{quote(str(username), safe="")}:{quote(str(password), safe="")}'
    '@localhost:5432/employee_db'
)
# NOTE(review): no cell visible here uses or closes `conn` (the load step
# passes `engine`); it is kept so any later cell relying on it still works.
conn = engine.connect()

In [23]:
# Write the attribute table to the database as "MOOC_Attributes".
# if_exists='replace' keeps the notebook re-runnable: with the pandas default
# ('fail') a second run raises ValueError because the table already exists.
# NOTE(review): the DataFrame index is also written as a column (index=True
# is the pandas default) — pass index=False if the schema has no such column.
attr_df.to_sql(name='MOOC_Attributes', con=engine, if_exists='replace')