In [35]:
# FILE NAME: Collecting data online edu.ipynb
# PROGRAMMER: VG6
# DATE: 24.03.2020
# Purpose: To collect valuable information from the online edu resource

In [44]:
# importing modules
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import re

In [45]:
# initializing path to firefox driver
# (raw string: the backslashes of the Windows path are taken literally instead
#  of relying on Python leaving unknown escapes such as "\G" untouched)
firefox_driver = r'C:\Users\Gololobov\Documents\GitHub\Collecting_data_openedu\geckodriver.exe'

# initializing firefox options instance
options = webdriver.FirefoxOptions()

# initializing headless options
# Firefox only recognises the switch with a leading dash; a bare 'headless'
# is passed to the binary as a positional argument and the window still opens
options.add_argument('-headless')

# initializing browser with driver and options
# (this single browser instance is shared by all scraping functions below)
browser = webdriver.Firefox(executable_path=firefox_driver, options=options)


In [46]:
# compiled pattern of one html tag: "<", one or more characters
# other than ">", and the closing ">"
tags = re.compile(r'<[^>]+>')

# Replacing html tags with spaces function
# Arguments:
#   - text that may contain html tags
#      text : str
# Returns:
#   - the same text where every tag is replaced by a single space
#      (str)
def remove_html_tags(text : str) -> str:
    without_tags = re.sub(tags, ' ', text)
    return without_tags

In [47]:
# Getting course information function
# Opens the course page in the shared `browser`, cuts the first <section> of
# the page into parts at its <h2> headers and returns the parts as one row.
# Arguments:
#   - Link to the course and its name
#       link_to_the_course : str, course_name : str
# Returns:
#   - One-row DataFrame with the columns 'название', 'ссылка', one column per
#     lower-cased header text, the optional recommendations / reviews /
#     professions columns, and 'содержание курса' (empty string when the page
#     has no 'Содержание курса' header)
#       (pd.DataFrame)
# Raises:
#   - ValueError when the page has no <section> element
def get_course_info(link_to_the_course : str, course_name : str) -> pd.DataFrame:
    browser.get(link_to_the_course)

    # initializing beautiful soup class instance to parse current page
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # initializing course description
    course_description = soup.find('section')
    if course_description is None:
        raise ValueError('No <section> found on course page: ' + link_to_the_course)

    description_text = course_description.text

    # all headers of the page (not only the ones inside the section)
    headers_block = soup.find_all('h2')

    # course full description flag (checked before any header is dropped)
    desc_flag = any(header.text == 'Содержание курса' for header in headers_block)

    # names of the columns used when recommendations and reviews
    # have to be collected manually
    recomended_name = 'Рекомендуемые курсы'
    reviews_name = 'Рецензии и оценки'

    # recomended courses flag
    recomended_flag = recomended_name in description_text
    recomended_courses = None
    review_courses = None
    if not recomended_flag:
        print('No recomended')

        # the 2 last headers are dropped and the recommendations and reviews
        # are taken from their own blocks (stored as found, None if absent)
        headers_block = headers_block[:-2]
        recomended_courses = soup.find('div', {'id' : 'course-view-recommendations'})
        review_courses = soup.find('div', {'id' : 'course-view-feedback'})

    # getting td of course
    course_training_directions = 'Профессии, специальности и направления подготовки'

    # professions flag, professions text and full course description; the
    # defaults keep every name bound when the page has no 'Содержание курса'
    profession_flag = False
    course_professions = ''
    course_desc = ''

    if desc_flag:
        if course_training_directions in description_text:
            # NOTE(review): the 'j_idt…' ids look auto-generated by the site —
            # confirm they are stable; a missing block is skipped, not fatal
            professions_block = course_description.find('div', {'id' : 'j_idt128'})
            if professions_block is not None:
                profession_flag = True
                course_professions = professions_block.text

        course_desc = remove_html_tags(str(course_description.find('div', {'id' : 'j_idt123'})))

    # replacing every header with the special marker !!!
    # The header html is substituted as plain text: used as a regular
    # expression, characters such as "(" or "?" inside a header would stop it
    # from matching itself. Converting to str first also keeps .split usable
    # when there are no headers at all.
    description_html = str(course_description)
    for header in headers_block:
        description_html = description_html.replace(str(header), '!!!')

    # splitting description by !!! (the part before the first header is dropped)
    splitted_description = description_html.split('!!!')[1:]

    # cleaning text out of html tags and \n
    def _clean(fragment : str) -> str:
        return remove_html_tags(fragment).replace('\n', ' ')

    # names of headers in lower case, one per part of the description
    course_headers_names = [header.text.lower() for header in headers_block]

    # columns and values are extended together, so that every optional value
    # lands under its own column
    df_columns = ['название', 'ссылка'] + course_headers_names
    df_data = [course_name, link_to_the_course] + [_clean(x) for x in splitted_description]

    if not recomended_flag:
        df_columns += [recomended_name, reviews_name]
        df_data += [recomended_courses, review_courses]

    if profession_flag:
        df_columns.append(course_training_directions)
        df_data.append(_clean(course_professions))

    # pandas raises here when the number of description parts differs from
    # the number of headers
    course_data = pd.DataFrame([df_data], columns=df_columns)

    # the dedicated description block replaces the text cut out by headers
    course_data['содержание курса'] = course_desc

    return course_data


In [48]:
# Getting information about all courses of the first listing pages function
# Arguments:
#   - number of listing pages we want to get information from
#       number_of_pages : int = 58
# Returns:
#   - Data Frame with one row per course found on these pages
#     (empty Data Frame when no course was found)
#       (pd.DataFrame)
def get_courses(number_of_pages : int = 58) -> pd.DataFrame:
    # links and names of the courses come from the shared listing scraper
    # instead of a second copy of the same page-walking code
    links_to_courses, names_of_courses = get_courses_links(number_of_pages)

    # every course page is loaded exactly once
    descriptions_dfs = [
        get_course_info(course_link, course_name)
        for (course_link, course_name) in zip(links_to_courses, names_of_courses)
    ]

    # pd.concat refuses an empty list
    if not descriptions_dfs:
        return pd.DataFrame()

    # concatenating all courses info dataframes in one step; columns missing
    # in some course are filled with NaN for that row
    all_courses_info = pd.concat(descriptions_dfs, axis=0, ignore_index=True)

    return all_courses_info

In [49]:
# Saving the final courses table function
# Arguments:
#   - collected courses; 'ссылка' and 'название' columns are required, the
#     section columns are optional and count as empty text when absent
#       data : pd.DataFrame
#   - path of the csv file to write
#       file_name : str
# Returns:
#   - nothing, the table is written to file_name (index included)
def save_courses(data, file_name):
    # fillna returns a new frame, so the caller's frame is left untouched
    data = data.fillna('')

    # a section column exists only if at least one scraped page had the
    # matching header, so a missing one is treated as empty text
    optional_columns = [
        'знания',
        'умения',
        'входные требования',
        'содержание курса',
        'Профессии, специальности и направления подготовки',
    ]
    for column in optional_columns:
        if column not in data.columns:
            data[column] = ''

    resulted_d = pd.DataFrame({
        'URL': data['ссылка'],
        'Название курса': data['название'],
        'Входные требования': data['знания'] + ' ' + data['умения'] + ' ' + data['входные требования'],
        'Содержание курса': data['содержание курса'],
        'Направления подготовки': data['Профессии, специальности и направления подготовки'],
    })

    resulted_d.to_csv(file_name)
    

In [55]:
# Updating collected courses function
# Scrapes every listed course whose link is not in old_all_data yet, appends
# the new rows to the old ones and writes both output files.
# Arguments:
#   - previously collected courses, must have the 'ссылка' column
#       old_all_data : pd.DataFrame
#   - csv files for the merged raw table and for the final table
#       all_data_file : str = 'all_data.csv', final_data_file : str = 'final_data.csv'
# Returns:
#   - nothing, the links of new courses are printed and the files are written
def update_courses(old_all_data, all_data_file='all_data.csv', final_data_file='final_data.csv'):
    courses_links, courses_names = get_courses_links()

    # links we already have, collected once instead of scanning the column
    # again for every listed course
    known_links = set(old_all_data['ссылка'].values)

    new_courses_links = []
    new_courses_names = []

    for (link, name) in zip(courses_links, courses_names):
        if link not in known_links:
            print(link)
            new_courses_links.append(link)
            new_courses_names.append(name)

    # declarating descriptions dfs of new courses
    descriptions_dfs = [
        get_course_info(course_link, course_name)
        for (course_link, course_name) in zip(new_courses_links, new_courses_names)
    ]

    # old rows first, then the new ones, concatenated in a single step
    data = pd.concat([old_all_data] + descriptions_dfs, axis=0, ignore_index=True)

    data.to_csv(all_data_file)

    save_courses(data, final_data_file)

In [51]:
# Getting links and names of listed courses function
# Arguments:
#   - number of listing pages to walk through (pages 0 .. number_of_pages - 1)
#       number_of_pages : int = 60
# Returns:
#   - links to the courses and names of the courses, in the same order
#       (list of str, list of str)
def get_courses_links(number_of_pages=60):
    courses_link = 'https://online.edu.ru/public/courses'

    all_courses_links = []
    all_courses_names = []

    # collecting info about courses
    for page in range(number_of_pages):
        # getting listing page with the given number
        browser.get(courses_link + '.xhtml?page=' + str(page))

        # initializing beautiful soup class instance to parse current page
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        # going through courses blocks
        for course in soup.find_all('div', {'class': 'card course-card'}):
            # the second <object> of the card holds both the link and the name
            title_object = course.find_all('object')[1]

            # adding reference to the course
            all_courses_links.append('https://online.edu.ru' + title_object.find('a')['href'])

            # adding name of the course; the surrounding whitespace is removed,
            # otherwise every name is stored with a trailing line break
            all_courses_names.append(title_object.text.strip())

    return all_courses_links, all_courses_names

In [9]:
# Loading previously collected courses; update_courses() looks at the
# 'ссылка' column of this table to decide which listed courses are new
data = pd.read_csv('courses_data_online_edu.csv')



In [56]:
# Collecting the courses missing in `data` (the link of each new course is
# printed) and writing 'all_data.csv' and 'final_data.csv'
update_courses(data)

https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11044104
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11044106
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11044113
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11044118
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11044124
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047197
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047201
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047204
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047208
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047214
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11047217
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11050964
https://online.edu.ru/public/course.xhtml?faces-redirect=true&cid=11050967
https://online.edu.ru/pub

In [58]:
# Reading back the table written by save_courses(); the first csv column
# is used as the index
updated = pd.read_csv('final_data.csv', index_col=0)

In [59]:
# Showing the reloaded table as the cell output
updated

Unnamed: 0,URL,Название курса,Входные требования,Содержание курса,Направления подготовки
0,https://online.edu.ru/public/course.xhtml?face...,Теория игр\n,,1. Стратегические взаимодействия 2. Доминиру...,41.03.04 Политология ...
1,https://online.edu.ru/public/course.xhtml?face...,Методы обработки навигационной измерительной и...,,В курсе рассматриваются следующие темы: Эл...,24.03.02 Системы управления движением и навига...
2,https://online.edu.ru/public/course.xhtml?face...,Методы вычислительной математики\n,,Раздел 1. Элементарная теория погрешностей....,01.03.02 Прикладная математика и информатика ...
3,https://online.edu.ru/public/course.xhtml?face...,Функциональное программирование: базовый курс\n,,В курсе рассматриваются следующие темы: 1. В...,09.00.00 Информатика и вычислительная техника ...
4,https://online.edu.ru/public/course.xhtml?face...,Веб-программирование\n,,В курсе рассматриваются следующие темы: 1. ...,09.00.00 Информатика и вычислительная техника ...
...,...,...,...,...,...
709,https://online.edu.ru/public/course.xhtml?face...,История науки и техники в области приборострое...,Курс рассчитан на магистров 1-го года обу...,Раздел 1. Период первоначального накопления ...,12.04.01 Приборостроение ...
710,https://online.edu.ru/public/course.xhtml?face...,Цифровая схемотехника\n,Курс рассчитан на бакалавров 4-го года о...,РАЗДЕЛ 1. ЛОГИЧЕСКИЕ ОПЕРАЦИИ И ЛОГИЧЕСКИЕ Э...,
711,https://online.edu.ru/public/course.xhtml?face...,Нелинейные методы акустической диагностики\n,Курс рассчитан на студентов магистратуры...,Вступление Тема 1. Системы с сосредоточен...,12.04.01 Приборостроение ...
712,https://online.edu.ru/public/course.xhtml?face...,Экономика организации\n,,,
