## Best Class to take to learn Programming

In [1]:
#import libraries
import requests
from bs4 import BeautifulSoup
import pandas  as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
url = 'https://www.classcentral.com/subject/programming-and-software-development'

def get_pages(url, number_results):
    pages = []
    pages.append(url)
    total_results = 0
    page_no = 2 #50 results per page 
    while total_results < number_results:
        primary_url = url
        url_extra = '?page='
        mod_url = primary_url + url_extra + str(page_no)
        pages.append(mod_url)
        page_no += 1
        total_results+=50
    return pages

In [3]:
pages = get_pages(url, 1200)
pages

['https://www.classcentral.com/subject/programming-and-software-development',
 'https://www.classcentral.com/subject/programming-and-software-development?page=2',
 'https://www.classcentral.com/subject/programming-and-software-development?page=3',
 'https://www.classcentral.com/subject/programming-and-software-development?page=4',
 'https://www.classcentral.com/subject/programming-and-software-development?page=5',
 'https://www.classcentral.com/subject/programming-and-software-development?page=6',
 'https://www.classcentral.com/subject/programming-and-software-development?page=7',
 'https://www.classcentral.com/subject/programming-and-software-development?page=8',
 'https://www.classcentral.com/subject/programming-and-software-development?page=9',
 'https://www.classcentral.com/subject/programming-and-software-development?page=10',
 'https://www.classcentral.com/subject/programming-and-software-development?page=11',
 'https://www.classcentral.com/subject/programming-and-software-develo

In [4]:
#init data lists
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []

## Defining Functions

In [5]:
def get_reviews(string):
    index = string.find('Reviews')
    review = string[index - 10: index + 10]
    reviews.append(review.strip())

In [6]:
def append_institute(institute):
    if institute is not None:
        institutes.append(institute.text.strip())
    else:
        institutes.append(-1)

In [7]:
def append_provider(provider):
    if provider is not None:
        providers.append(provider.text.strip())
    else:
        providers.append(-1)

In [8]:
def append_date(date):
    if date is not None:
        dates.append(date.text.strip())
    else:
        dates.append('Self Paced')

In [9]:
def append_rating(rating):
    if rating is not None:
        ratings.append(rating.text.strip())
    else:
        ratings.append(-1)

In [10]:
def append_name(name):
    names.append(name)

## Web Scraping

In [11]:
def get_data(pages):
    for page in pages:
        r = requests.get(page)
        soup = BeautifulSoup(r.content, 'html.parser')
        rows = soup.select('tbody tr')

        for row in rows:
            #name
            name = row.select_one('span',{ 'class': 'text-1 line-tight'}).text.strip()
            append_name(name)

            #institute
            institute = row.find('a', {'class': 'color-charcoal small-down-text-2 text-3'})
            append_institute(institute)

            #provider
            provider = row.find('span', {'class': 'hidden medium-up-inline-block'})
            append_provider(provider)

            #date
            date = row.find('td', {'itemprop': 'startDate'})
            append_date(date)

            #reviews
            rev = row.find('span', {'class': 'large-down-hidden block line-tight text-4 color-gray'})
            string = str(rev)
            get_reviews(string)

            #rating
            rating = row.find('span', attrs = {'class': 'xlarge-up-hidden color-charcoal text-center'})
            append_rating(rating)
            
    df = pd.DataFrame({'name': names,
                         'institute': institutes,
                         'provider': providers,
                         'date': dates,
                         'review': reviews,
                         'rating': ratings})
    return df

In [12]:
programming_df = get_data(pages)

In [13]:
programming_df

Unnamed: 0,name,institute,provider,date,review,rating
0,AD,Kellogg School of Management,-1,Self Paced,,5.0
1,Programming for Everybody (Getting Started wit...,University of Michigan,Coursera,"3rd Aug, 2020",20572 Reviews,4.9
2,Python Data Structures,University of Michigan,Coursera,"17th Aug, 2020",7913 Reviews,4.9
3,An Introduction to Interactive Programming in ...,Rice University,Coursera,"17th Aug, 2020",3241 Reviews,4.9
4,Using Python to Access Web Data,University of Michigan,Coursera,"3rd Aug, 2020",2880 Reviews,4.7
...,...,...,...,...,...,...
1395,Learn Backbone.js,-1,Udacity,"18th Aug, 2015",,-1
1396,How to create <anything> in Android,-1,Udacity,"10th Sep, 2015",,-1
1397,DEV206.1x: Introduction to XAML and Applicatio...,Microsoft,edX,Self Paced,,-1
1398,程序设计与算法（大学先修课）,Peking University,Coursera,"8th Oct, 2015",,-1


In [14]:
#dump data from scrape into csv
#programming_df.to_csv(r'D:\Data Science\Best Programming Course\scraped.csv')

## Clean DataFrame

In [15]:
def clean(df):
    #df.drop('Unnamed: 0', axis=1, inplace=True) #only need if reading from csv.
    df = df[df['name'] != 'AD']
    df = df[df['rating'] != -1]
    df.loc[df['institute'] == '-1', 'institute'] = None
    df.drop_duplicates(inplace = True)
    return df

In [16]:
programming_df = clean(programming_df)
programming_df

Unnamed: 0,name,institute,provider,date,review,rating
1,Programming for Everybody (Getting Started wit...,University of Michigan,Coursera,"3rd Aug, 2020",20572 Reviews,4.9
2,Python Data Structures,University of Michigan,Coursera,"17th Aug, 2020",7913 Reviews,4.9
3,An Introduction to Interactive Programming in ...,Rice University,Coursera,"17th Aug, 2020",3241 Reviews,4.9
4,Using Python to Access Web Data,University of Michigan,Coursera,"3rd Aug, 2020",2880 Reviews,4.7
6,Using Databases with Python,University of Michigan,Coursera,"3rd Aug, 2020",2306 Reviews,4.7
...,...,...,...,...,...,...
1381,Programmation sur iPhone et iPad,Sorbonne Universités,France Université Numerique,"16th Apr, 2015",1 Reviews,4.0
1382,Developing Scalable Apps in Python,Google,Udacity,"30th Apr, 2015",1 Reviews,4.0
1383,Android Development for Beginners,Google,Udacity,"11th May, 2015",23 Reviews,4.3
1388,Interactive Computer Graphics with WebGL,University of New Mexico,Coursera,"29th Jun, 2015",2 Reviews,4.5


In [17]:
#We need to get rid of rows with less than 5 reviews as low reviews and high rating can be misleading.
def get_rating(df):
    highest_rated = df.sort_values('rating', ascending = False)
    reviews_int = highest_rated['review'].map(lambda x: x.rstrip('Reviews')).astype(int)
    highest_rated['review'] = pd.DataFrame(reviews_int)
    highest_rated = highest_rated[highest_rated['review'] >= 5]
    return highest_rated

In [18]:
programming_rate = get_rating(programming_df)
programming_rate

Unnamed: 0,name,institute,provider,date,review,rating
38,"HTML, CSS, and Javascript for Web Developers",Johns Hopkins University,Coursera,"24th Aug, 2020",17,5.0
2,Python Data Structures,University of Michigan,Coursera,"17th Aug, 2020",7913,4.9
830,Fundamentals of NetLogo,Santa Fe Institute,Complexity Explorer,Self Paced,7,4.9
1,Programming for Everybody (Getting Started wit...,University of Michigan,Coursera,"3rd Aug, 2020",20572,4.9
7,Python for Everybody,University of Michigan,Coursera,Self Paced,342,4.9
...,...,...,...,...,...,...
74,Cloud Computing Basics (Cloud 101),LearnQuest,Coursera,"31st Aug, 2020",9,2.2
641,Intro to Relational Databases,-1,Udacity,Self Paced,7,2.1
105,Introduction To Swift Programming,University of Toronto,Coursera,"17th Aug, 2020",7,1.3
961,Advanced C++,Microsoft,edX,Self Paced,7,1.3


## Data Retrieval 

Now that we have some functions set up to streamline the process we can webscrape data from different domains on the same website.

In [19]:
#Functions: get_pages(url, # results), run webscraper: get_data(pages), clean(df), get_rating(df)
dataframes = []
dataframes.append(programming_rate)

In [20]:
#python courses
pages = get_pages('https://www.classcentral.com/subject/python', 200)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
python_df = get_data(pages)
python_df = clean(python_df)
python_rate = get_rating(python_df)
dataframes.append(python_rate)

In [22]:
#machine learning courses
pages = get_pages('https://www.classcentral.com/subject/machine-learning', 250)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
ml_df = get_data(pages)
ml_df = clean(ml_df)
ml_rate = get_rating(ml_df)
dataframes.append(ml_rate)

In [23]:
#data science courses
pages = get_pages('https://www.classcentral.com/subject/data-science', 600)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
ds_df = get_data(pages)
ds_df = clean(ds_df)
ds_rate = get_rating(ds_df)
dataframes.append(ds_rate)

In [24]:
#computer science courses
pages = get_pages('https://www.classcentral.com/subject/cs', 1800)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
cs_df = get_data(pages)
cs_df = clean(cs_df)
cs_rate = get_rating(cs_df)
dataframes.append(cs_rate)

In [25]:
#stats and probability courses
pages = get_pages('https://www.classcentral.com/subject/statistics', 150)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
stats_df = get_data(pages)
stats_df = clean(stats_df)
stats_rate = get_rating(stats_df)
dataframes.append(stats_rate)

In [26]:
#artificial intelligence courses
pages = get_pages('https://www.classcentral.com/subject/ai', 300)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
ai_df = get_data(pages)
ai_df = clean(ai_df)
ai_rate = get_rating(ai_df)
dataframes.append(ai_rate)

In [27]:
#data analysis courses
pages = get_pages('https://www.classcentral.com/subject/data-analysis', 150)
names = []
institutes = []
providers = []
dates = []
reviews = []
ratings = []
da_df = get_data(pages)
da_df = clean(da_df)
da_rate = get_rating(da_df)
dataframes.append(da_rate)

Now that we have some data to work with we can create some visuals. 

In [28]:
for df in dataframes:
    

9