# Scraping DataCamp courses using Python

## Source: https://www.datacamp.com/courses/

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import csv

In [2]:
# Base address of the DataCamp course catalogue; topic paths such as
# 'tech:r' are appended to it directly, so the trailing slash matters.
root_url = "https://www.datacamp.com/courses/"

In [3]:
# Download the catalogue landing page. Reading the body inside a `with` block
# closes the HTTP connection as soon as the download finishes, and keeping the
# raw bytes (rather than a one-shot response stream) lets the parsing cell be
# re-run without fetching the page again.
with urlopen(root_url) as response:
    souce = response.read()

In [4]:
# Parse the downloaded landing page with Python's built-in HTML parser.
soup = BeautifulSoup(souce, features="html.parser")

In [5]:
# Sanity check: show the page <title> to confirm the right page was fetched.
print(soup.title.get_text())

Data Science Courses: R & Python Analysis Tutorials | DataCamp


The links to the technology topics under which DataCamp lists its courses are shown below:

In [6]:
# Print the href of every link found inside the rows whose class attribute
# is exactly 'row dc-u-pb-48'.
for topic_row in soup.find_all('div', class_='row dc-u-pb-48'):
    for anchor in topic_row.find_all('a'):
        print(anchor.get('href'))

/courses/tech:r
/courses/tech:python
/courses/tech:sql
/courses/tech:git
/courses/tech:shell
/courses/tech:spreadsheets


In [7]:
# Print the whitespace-separated words of each 'col-lg-4 col-sm-6' column.
for topic_column in soup.find_all('div', attrs={'class': 'col-lg-4 col-sm-6'}):
    words = topic_column.get_text().split()
    print(words)
    

['R']
['Python']
['SQL']
['Git']
['Shell']
['Spreadsheets']


Function to scrape all the courses of a given DataCamp topic:

In [8]:
def courses_by_topic(topic):
    """Scrape the course titles listed on one DataCamp topic page.

    The page address is built by appending ``topic`` to the notebook-level
    ``root_url``. A progress message is printed before the download and
    another once the titles have been collected.

    Parameters
    ----------
    topic : str
        Path fragment appended to ``root_url``, e.g. ``'tech:r'``.

    Returns
    -------
    list of str
        Text of every ``<h4 class="course-block__title">`` element located
        inside a ``<div class="courses__explore-list row">`` container, in
        the order ``find_all`` yields them. Empty if the page contains no
        matching markup.

    Raises
    ------
    urllib.error.URLError
        Propagated unchanged from ``urlopen`` when the page cannot be fetched.
    """
    url_to_be_scrapped = root_url + topic
    print('Scrapping ' + url_to_be_scrapped)

    # Use the response as a context manager so the HTTP connection is closed
    # once the page has been parsed, even if parsing raises.
    with urlopen(url_to_be_scrapped) as content_per_topic:
        soup = BeautifulSoup(content_per_topic, 'html.parser')

    listings = soup.find_all('div', attrs={'class': 'courses__explore-list row'})
    courses = [
        title.get_text()
        for listing in listings
        for title in listing.find_all('h4', attrs={'class': 'course-block__title'})
    ]

    print("Scrapped " + topic +".")
    return courses

In [9]:
# Scrape the R topic page and report how many course titles were found.
r_courses = courses_by_topic('tech:r')
print('Total number of R courses ', len(r_courses), sep='')

Scrapping https://www.datacamp.com/courses/tech:r
Scrapped tech:r.
Total number of R courses 142


In [10]:
# Scrape the Python topic page and report how many course titles were found.
python_courses = courses_by_topic('tech:python')
print('Total number of Python courses - ', len(python_courses), sep='')

Scrapping https://www.datacamp.com/courses/tech:python
Scrapped tech:python.
Total number of Python courses - 69


In [11]:
# Scrape the SQL topic page and report how many course titles were found.
sql_courses = courses_by_topic('tech:sql')
print('Total number of SQL courses - ', len(sql_courses), sep='')

Scrapping https://www.datacamp.com/courses/tech:sql
Scrapped tech:sql.
Total number of SQL courses - 6


In [12]:
# Scrape the Git topic page and report how many course titles were found.
git_courses = courses_by_topic('tech:git')
print('Total number of Git courses - ', len(git_courses), sep='')

Scrapping https://www.datacamp.com/courses/tech:git
Scrapped tech:git.
Total number of Git courses - 1


In [13]:
# Scrape the Shell topic page and report how many course titles were found.
shell_courses = courses_by_topic('tech:shell')
print('Total number of Shell courses - ', len(shell_courses), sep='')

Scrapping https://www.datacamp.com/courses/tech:shell
Scrapped tech:shell.
Total number of Shell courses - 3


In [14]:
# Scrape the Spreadsheets topic page and report how many course titles were
# found. The message names Spreadsheets because that is the topic scraped here.
spreadsheets_courses = courses_by_topic('tech:spreadsheets')
print('Total number of Spreadsheets courses - ' + str(len(spreadsheets_courses)))

Scrapping https://www.datacamp.com/courses/tech:spreadsheets
Scrapped tech:spreadsheets.
Total number of Shell courses - 4


### Collecting all the courses in a single data frame

In [15]:
# Wrap each topic's scraped titles in a two-column frame: the title itself
# plus a constant 'Topic Name' label identifying the topic it came from.
r_courses = pd.DataFrame(data=r_courses, columns=['Course Name']).assign(**{'Topic Name': 'R'})

python_courses = pd.DataFrame(data=python_courses, columns=['Course Name']).assign(**{'Topic Name': 'Python'})

sql_courses = pd.DataFrame(data=sql_courses, columns=['Course Name']).assign(**{'Topic Name': 'SQL'})

git_courses = pd.DataFrame(data=git_courses, columns=['Course Name']).assign(**{'Topic Name': 'Git'})

shell_courses = pd.DataFrame(data=shell_courses, columns=['Course Name']).assign(**{'Topic Name': 'Shell'})

spreadsheet_courses = pd.DataFrame(data=spreadsheets_courses, columns=['Course Name']).assign(**{'Topic Name': 'Spreadsheets'})

# Stack the per-topic frames. ignore_index=True renumbers the rows 0..N-1;
# without it every topic restarts at 0, leaving duplicate index labels in the
# combined frame (and in any CSV written with the index).
DataCamp = pd.concat(
    [r_courses, python_courses, sql_courses, git_courses, shell_courses, spreadsheet_courses],
    ignore_index=True,
)

In [16]:
# Total number of scraped courses across all topics (row count of the frame).
DataCamp.shape[0]

225

In [17]:
# Persist the combined course list as CSV, including the column header row.
# NOTE(review): the destination is a hard-coded absolute Windows path; it only
# exists on the original author's machine, so adjust it before re-running.
DataCamp.to_csv(
    path_or_buf='C://Users//DELL//Desktop//Data Science//Filename2.csv',
    header=True,
)