# ASSIGNMENT 1 - CRAWLER

In [1]:
#import library
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
import ssl

# Open the URL.
url = 'https://vnexpress.net'
# create_default_context() turns on certificate and hostname verification.
# A bare ssl.SSLContext() verifies nothing and has been deprecated since
# Python 3.10.
gcontext = ssl.create_default_context()
# Close the HTTP response as soon as the body has been read.
with urlopen(url, context=gcontext) as response:
    html = response.read()

# soup: html parser
soup = BeautifulSoup(html, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html lang="vi" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   VnExpress - Báo tiếng Việt nhiều người xem nhất
  </title>
  <meta charset="utf-8"/>
  <meta content="IE=100" http-equiv="X-UA-Compatible">
   <meta content="1547540628876392" property="fb:app_id"/>
   <link href="https://vnexpress.net" rel="canonical"/>
   <meta content="1800" http-equiv="REFRESH"/>
   <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
   <link href="//s.eclick.vn" rel="dns-prefetch"/>
   <link href="//s.vnecdn.net" rel="dns-prefetch"/>
   <link href="//la.vnecdn.net" rel="dns-prefetch"/>
   <link href="https://usi-saas.vnexpress.net" rel="dns-prefetch"/>
   <link href="//core.polyad.net" rel="dns-prefetch"/>
   <link href="//www.google-analytics.com" rel="dns-prefetch"/>
   <link href="//www.googletagmanager.com" rel="dns-prefetch"/>
   <meta content="yes" name="apple-mobile-web-app-capable"/>
   <meta content="Vnexpress.net" name="apple-mobile-web-app-titl

# Tag statistic
* **Input**: soup
* **Output**: tags with number of occurrences in descending order

In [19]:
def tag_statistic(soup):
    """Print how often each tag name occurs in ``soup``.

    The output is a list of ``(tag_name, count)`` tuples ordered by count,
    highest first. Names that share a count appear in reverse alphabetical
    order. Nothing is returned.
    """
    counts = {}
    for element in soup.find_all():
        counts[element.name] = counts.get(element.name, 0) + 1
    # Sorting on (count, name) keeps the order of ties deterministic.
    ranking = list(counts.items())
    ranking.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    print(ranking)

# Print the tag-name counts for the page held in the module-level `soup`.
tag_statistic(soup)

[('a', 514), ('div', 165), ('li', 155), ('span', 140), ('h4', 98), ('img', 91), ('i', 62), ('section', 61), ('h6', 52), ('h3', 51), ('p', 47), ('article', 39), ('script', 37), ('meta', 36), ('ul', 31), ('h2', 26), ('hgroup', 25), ('ins', 22), ('link', 17), ('option', 12), ('strong', 6), ('td', 4), ('tbody', 3), ('table', 3), ('select', 3), ('header', 3), ('button', 3), ('br', 3), ('tr', 2), ('nav', 2), ('video', 1), ('title', 1), ('noscript', 1), ('input', 1), ('iframe', 1), ('html', 1), ('head', 1), ('h1', 1), ('form', 1), ('footer', 1), ('figure', 1), ('body', 1)]


# Class statistic
* **Input**: soup
* **Output**: classes with number of occurrences in descending order

In [24]:
def class_statistic(soup):
    """Print how often each CSS class occurs in ``soup``.

    Every tag returned by ``soup.find_all()`` is inspected; each entry of its
    ``class`` attribute counts once. The output is a list of
    ``(class_name, count)`` tuples ordered by count, highest first. Names
    that share a count appear in reverse alphabetical order. Nothing is
    returned.
    """
    classes = {}
    for tag in soup.find_all():
        # Tag.get() replaces the deprecated Tag.has_key() check: a tag with
        # no "class" attribute simply contributes nothing.
        for class_name in tag.get('class') or ():
            classes[class_name] = classes.get(class_name, 0) + 1
    print(sorted(classes.items(), key=lambda kv: (kv[1], kv[0]), reverse=True))

# Print the CSS-class counts for the page held in the module-level `soup`.
class_statistic(soup)

[('txt_num_comment', 95), ('icon_commend', 95), ('font_icon', 95), ('vne_lazy_image', 74), ('ic', 60), ('title_news', 57), ('thumb', 54), ('thumb_art', 53), ('thumb_5x3', 53), ('parent', 51), ('box_category', 46), ('clearfix', 40), ('list_news', 38), ('description', 35), ('width_common', 29), ('title_box_category', 25), ('first', 25), ('mnu_thoisu', 20), ('list_dn', 20), ('adsbycpx', 20), ('list_title', 18), ('right', 17), ('list_title_right', 16), ('left', 16), ('text_ads', 13), ('banner_ads', 13), ('location-stamp', 12), ('list_video', 12), ('hidden320', 8), ('icon_thumb_videophoto', 7), ('box_300', 6), ('box_100', 6), ('owl-carousel', 5), ('ic-caret-right', 5), ('ic-caret-left', 5), ('scrollbar-inner', 4), ('ic-photo', 4), ('ic-ad', 4), ('container', 4), ('title_right', 3), ('mnu_xe', 3), ('mnu_thethao', 3), ('mnu_thegioi', 3), ('mnu_tamsu', 3), ('mnu_suckhoe', 3), ('mnu_sohoa', 3), ('mnu_phapluat', 3), ('mnu_kinhdoanh', 3), ('mnu_khoahoc', 3), ('mnu_gocnhin', 3), ('mnu_giaoduc', 3)

# Get "title" and "link" of news from a web page
* **Description**: 
    * After examining the HTML source code, I found that each news item is stored in tag "article" with class "list_news"
    * Each news item contains a "title" and a "link"
        * title: tag "h1" to "h6" with class "title_news"
        * link: attribute "href" in tag "a"
* **Input**: soup
* **Output**: json object
    * For example: { '0': {'title': ..., 'link': ...}, '1': {...}, ... }

In [130]:
def get_news_from_main_page(soup):
    """Collect the title and link of every news teaser in ``soup``.

    A teaser is an ``<article>`` with class ``list_news``. Its title is the
    text of the first ``<a>`` inside a heading (``h1``-``h6``) with class
    ``title_news``, and its link is that anchor's ``href``.

    Args:
        soup: parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        dict mapping consecutive integers starting at 0 to
        ``{'title': str, 'link': str or None}``. Articles without a titled
        link are skipped.
    """
    heading_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    result = {}
    for article in soup.find_all('article', {'class': 'list_news'}):
        heading = article.find(heading_names, {'class': 'title_news'})
        anchor = heading.find('a') if heading is not None else None
        if anchor is None:
            # Without a heading or anchor there is nothing to record;
            # skipping avoids an AttributeError on None.
            continue
        # len(result) keeps the keys consecutive even when articles are skipped.
        result[len(result)] = {
            'title': anchor.get_text().strip(),
            'link': anchor.get('href'),
        }
    return result

# Get "content" and "summary" of news from url
* **Description**: 
    * After examining the HTML source code of a news page, I found that the main content is stored in
        * tag "p" in tag "section" with class "sidebar_1" for normal news
        * tag "p" in tag "section" with class "sidebar_2" for infographics news
    * Summary is stored in
        * tag "div" with class "lead_detail" for video news
        * tag "p" with class "description" in tag "section" with class "container infographics" for infographics news
        * tag "p" with class "description" for normal news
    * Iterate through all tag p to get content
    * **Note**: some urls are video => I must ignore them and return empty string
* **Input**: soup
* **Output**: string - content, string - summary

In [131]:
def special_case(soup):
    """Handle pages without a regular article body: video and infographics.

    Lookup order:
      1. Video: the ``<div>`` with class ``lead_detail``.
      2. Infographics: the ``<p>`` with class ``description`` inside the
         ``<section>`` with class ``container infographics``.
      3. Fallback: the first ``<p>`` with class ``description`` anywhere on
         the page.

    Args:
        soup: parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        ``(content, summary)`` tuple of stripped strings. Both hold the same
        text, or both are ``''`` when nothing matched.
    """
    # video
    video_lead = soup.find('div', {'class': 'lead_detail'})
    if video_lead is not None:
        text = video_lead.text.strip()
        return text, text

    # infographics: prefer the description inside the infographics section
    # (the section used to be looked up and then ignored).
    description_filter = {'class': 'description'}
    section = soup.find('section', {'class': 'container infographics'})
    description = None
    if section is not None:
        description = section.find('p', description_filter)
    if description is None:
        # Keep the previous page-wide search as a fallback.
        description = soup.find('p', description_filter)
    if description is None:
        return '', ''
    text = description.text.strip()
    return text, text

In [132]:
def get_content_from_link(url):
    """Download a news page and extract its content and summary.

    The content is the concatenated text of every ``<p>`` inside the
    ``<section>`` with class ``sidebar_1``, followed by every ``<p>`` inside
    the ``<section>`` with class ``sidebar_2`` when that section exists. The
    summary is the text of the first ``<p>`` with class ``description``.

    Pages without a ``sidebar_1`` section are delegated to ``special_case``.

    Args:
        url: address of the news page; fetched with the module-level
            ``gcontext`` SSL context.

    Returns:
        ``(content, summary)`` tuple of stripped strings.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url, context=gcontext) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # tag: section, class: sidebar_1
    main_section = soup.find('section', {'class': 'sidebar_1'})
    if not main_section:
        return special_case(soup)
    paragraphs = list(main_section.find_all('p'))

    # tag: section, class: sidebar_2
    # A missing sidebar_2 must not discard the text already collected from
    # sidebar_1, so this section is optional.
    extra_section = soup.find('section', {'class': 'sidebar_2'})
    if extra_section:
        paragraphs.extend(extra_section.find_all('p'))

    content = ''.join(p.text for p in paragraphs)

    tag_summary = soup.find('p', {'class': 'description'})
    summary = tag_summary.text if tag_summary else ''
    return content.strip(), summary.strip()

# Export result
* **Description**: 
    * Export result to file with utf8 format to keep accented letters
    * Make sure that we can import data from this file to reuse
* **Input**: json object, filename (default = "crawler.txt")

In [133]:
def export_file(json_object, filename='crawler.txt'):
    """Serialize ``json_object`` as JSON into ``filename``.

    The file is written as UTF-8 and non-ASCII characters are stored
    verbatim rather than as ``\\uXXXX`` escapes, so accented letters stay
    readable. An existing file is overwritten.
    """
    with open(filename, mode='w', encoding='utf8') as sink:
        json.dump(obj=json_object, fp=sink, ensure_ascii=False)

# Main function
* Get soup - main page from the given url
* Get all news (title and link) from soup
* Get content and summary by iterating through all news
* Export the result

In [134]:
def main():
    """Crawl every news item found on the main page and export the result.

    Reads the module-level ``soup``, prints each link as it is fetched, adds
    the 'content' and 'summary' of each item, prints the total number of
    items, and writes everything out with ``export_file``.
    """
    articles = get_news_from_main_page(soup)
    for entry in articles.values():
        link = entry['link']
        print(link)
        body, lead = get_content_from_link(link)
        entry['content'] = body
        entry['summary'] = lead
    print('total:', len(articles))
    export_file(articles)


if __name__ == '__main__':
    main()

https://vnexpress.net/thoi-su/ra-mat-doan-tau-dau-tien-tuyen-metro-nhon-ga-ha-noi-4003979.html?vn_source=Home&vn_campaign=ThuongVien&vn_medium=Item-2&vn_term=Desktop&vn_thumb=1
https://vnexpress.net/thoi-su/tau-cat-linh-ha-dong-chay-thu-de-nghiem-thu-4003770.html?vn_source=Home&vn_campaign=ThuongVien&vn_medium=Item-3&vn_term=Desktop&vn_thumb=1
https://vnexpress.net/the-gioi/chuyen-gia-anh-neu-ba-ly-do-nguoi-viet-nhap-cu-lau-4003610.html?vn_source=Home&vn_campaign=ThuongVien&vn_medium=Item-4&vn_term=Desktop&vn_thumb=1
https://vnexpress.net/phap-luat/dang-le-nguyen-vu-toi-muon-som-ket-thuc-vu-ly-hon-4003668.html?vn_source=Home&vn_campaign=ThuongVien&vn_medium=Item-5&vn_term=Desktop&vn_thumb=1
https://vnexpress.net/the-gioi/14-gia-dinh-viet-nam-gui-thong-tin-tim-nguoi-mat-tich-tai-anh-4003821.html?vn_source=Home&vn_campaign=ThuongVien&vn_medium=Item-6&vn_term=Desktop&vn_thumb=0
https://vnexpress.net/oto-xe-may/xe-volkswagen-trung-bay-o-viet-nam-co-ban-do-duong-luoi-bo-4003914.html?vn_sour