<a href="https://colab.research.google.com/github/alyrazik/News_content_collect_store/blob/Crawler-that-follows-links/CrawlingBBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  News Content Collect and Store
## by Aly Abdelrazek
 
 


##Setup and Importing libraries

In [1]:
!python -m pip install pymongo[srv]

Collecting dnspython<2.0.0,>=1.16.0; extra == "srv"
[?25l  Downloading https://files.pythonhosted.org/packages/ec/d3/3aa0e7213ef72b8585747aa0e271a9523e713813b9a20177ebe1e939deb0/dnspython-1.16.0-py2.py3-none-any.whl (188kB)
[K     |████████████████████████████████| 194kB 2.8MB/s 
[?25hInstalling collected packages: dnspython
Successfully installed dnspython-1.16.0


In [9]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
import pymongo
from pymongo import MongoClient
from datetime import datetime
import time
from time import sleep


In [3]:
pymongo.version

'3.11.0'

##Global variables

In [4]:
news_page_url = "https://bbc.com/news"
parent = "https://bbc.com"
SEARCH_LIMIT = 10   #limit the number of returned articles matching a keyword search


##Crawl


In [5]:
# Obtains news stories links from news_page_url (https://bbc.com/news)
def give_links(url, parent):
  ''' Take a url, and its parent address and returns a list of absolute addresses of web links orginitating from it.'''
  
  page_html = requests.get(url).text
  page_soup = BeautifulSoup(page_html, "html.parser")
  page_links = page_soup.findAll("a") #find all links

  #validate and process obtained links
  output1 = [link.get("href") for link in page_links if link.get("href") is not None]
  output2 = [parent+link if link.startswith("/news") else link for link in output1]
  output3 = [link for link in output2 if "http" in link]


  return output3
 



In [6]:

def is_news_article(link):
  ''' 
  Take a web link address and tests whether the link contains a news article in BBC. Returns 1 if True, 0 otherwise.  
  ''' 
  if "/news" in link: #news articles have /news in address. if not, it is considered not a news article.

    try:
      s = BeautifulSoup(requests.get(link).text , "html.parser")
      if s.find(class_= "css-16rg7hm-ContainerWithSidebarWrapper e1jl38b40") is not None: #this class is only available in news articles
        return 1
      else:
        return 0
    except requests.exceptions.ConnectionError:
        print("Connection to page refused")
    except:
        print("An error occured while trying to connect to link")
    else:
        print("Parsing successful")
  return 0


In [35]:
def follow_links(origin_link, n_other_links_to_follow = 1 ):

  news_urls = []
  other_urls = [origin_link]

  start_time = time.time()
  print("following links...")
  
  for i in range(n_other_links_to_follow):
    links = give_links(other_urls[i], parent)
    other_urls.pop(0) #no longer needed
    news_urls = news_urls + [link for link in links if (is_news_article(link))]
    other_urls = other_urls + [link for link in links if link not in news_urls]
    other_urls = list(dict.fromkeys(other_urls)) #remove duplicates from list.
  end_time = time.time()
  execution_time = end_time - start_time
  print("completed in {:2f} seconds".format(execution_time))
  news_urls = list(dict.fromkeys(news_urls)) #remove duplicates from the list
  return news_urls, other_urls

In [36]:
news_urls, other_urls = follow_links(news_page_url, 3)

following links...
completed in 151.750334 seconds


In [37]:
len(news_urls)

24

In [38]:
len(other_urls)

113

## Scrape

In [57]:
documents = []
document_index = 0

for url in news_urls:
    print("Fetching {}".format(url))
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    news_heading = (soup.find('h1')).text
    news_text = [p.text for p in soup.find('article').find_all('p')]
    news_time = (soup.find('time'))['datetime']

    documents.append({"Document_Index":str(document_index), "URL":url, "Heading":news_heading, "Article":news_text, "DateTime":news_time})
    document_index = document_index+1


Fetching https://bbc.com/news/election-us-2020-54756915
Fetching https://bbc.com/news/election-us-2020-53657174
Fetching https://bbc.com/news/world-us-canada-54665375
Fetching https://bbc.com/news/world-us-canada-54751759
Fetching https://bbc.com/news/world-europe-54759443
Fetching https://bbc.com/news/uk-54756950
Fetching https://bbc.com/news/uk-northern-ireland-54750668
Fetching https://bbc.com/news/world-europe-54747022
Fetching https://bbc.com/news/world-us-canada-54752228
Fetching https://bbc.com/news/world-africa-54733425
Fetching https://bbc.com/news/world-europe-54752194
Fetching https://bbc.com/news/uk-54759343
Fetching https://bbc.com/news/election-us-2020-54731172
Fetching https://bbc.com/news/election-us-2020-54736083
Fetching https://bbc.com/news/world-54553132
Fetching https://bbc.com/news/world-europe-54747020
Fetching https://bbc.com/news/world-asia-54717686
Fetching https://bbc.com/news/health-54661843
Fetching https://bbc.com/news/world-asia-54097609
Fetching https://

##Saving to MongoDB

In [58]:
#connect to client
client = MongoClient("mongodb+srv://aly:a@cluster0.4pfcp.mongodb.net/db?retryWrites=true&w=majority")

In [59]:
#create a database
db = client["news_database"]

In [60]:
#create a collection (a table)
bbc_news = db["bbc_news"]

In [61]:
db['bbc_news'].delete_many({})

<pymongo.results.DeleteResult at 0x7ff3883e22c8>

In [62]:
bbc_news.insert_many(documents)

<pymongo.results.InsertManyResult at 0x7ff3886be748>

##Retrieving content from MongoDB

In [63]:
def retrieve_documents(database, collection):
  ''' Take a MongoDB database and collection name, and returns all documents in collection to a pandas dataframe'''
  retrieved_documents = database[collection].find() #do not use database.collection, it is a string :)
  retrieved_df = pd.DataFrame(retrieved_documents)
  return retrieved_df

db = client["news_database"]
df = retrieve_documents(database = db , collection = 'bbc_news')

## Keyword search in article text

####Create a text index -required for text search

In [64]:
db = client["news_database"]
db.bbc_news.create_index([
          ("Article", "text"),
          ("Heading", "text"),
  ],
  name="search_index",
  weights={
      'Article':25,
      'Heading':100
  }
)

'search_index'

#### Search function

In [65]:
def search(search_text, database, collection):
  '''
  Take a a string containing keywords and outputs all relevant articles to any of them
  Args:
    -search_text: a string of keywords (<string>)
    -database: MongoDB database name (<pymongo.database.Database>)
    -collection: string with the name of MongoDB collection (<string>)
  Returns:
    -A pandas dataframe containing the MongoDB contents of the returned news articles
  '''
  returned_cursor = database[collection].find({"$text": {"$search": search_text}}).limit(SEARCH_LIMIT)
  df = pd.DataFrame(returned_cursor)
  return df


In [66]:
output = search("macron", db, 'bbc_news')

In [68]:
output['URL']

0    https://bbc.com/news/world-europe-54747020
Name: URL, dtype: object

In [56]:
df

Unnamed: 0,_id,Document_Index,Heading,Article,DateTime
0,5f9d4416ccc52a3bd06163bd,0,US Election 2020: Biden and Trump in tug-of-wa...,[.css-14iz86j-BoldText{font-weight:bold;}In a ...,2020-10-31T01:27:07.000Z
1,5f9d4416ccc52a3bd06163be,1,US election 2020 polls: Who is ahead - Trump o...,[By The Visual and Data Journalism TeamBBC New...,2020-10-31T09:35:58.000Z
2,5f9d4416ccc52a3bd06163bf,2,US election 2020: How many Americans have vote...,[.css-14iz86j-BoldText{font-weight:bold;}Early...,2020-10-30T19:55:20.000Z
3,5f9d4416ccc52a3bd06163c0,3,Coronavirus: US sees record-high daily Covid n...,[.css-14iz86j-BoldText{font-weight:bold;}The U...,2020-10-31T04:12:15.000Z
4,5f9d4416ccc52a3bd06163c1,4,Turkey-Greece quake: Search for survivors unde...,[.css-14iz86j-BoldText{font-weight:bold;}Rescu...,2020-10-31T10:18:11.000Z
5,5f9d4416ccc52a3bd06163c2,5,Coronavirus: PM considering England lockdown n...,[.css-14iz86j-BoldText{font-weight:bold;}The p...,2020-10-31T10:10:34.000Z
6,5f9d4416ccc52a3bd06163c3,6,Rare 'blue moon' to enchant Halloween stargazers,"[By Barra BestBBC News NI Weather Presenter, ....",2020-10-31T06:10:58.000Z
7,5f9d4416ccc52a3bd06163c4,7,Coronavirus: Slovakia holds national test but ...,"[By Rob CameronBBC News, Prague, .css-14iz86j-...",2020-10-31T09:34:23.000Z
8,5f9d4416ccc52a3bd06163c5,8,Breonna Taylor: Police officer sues shot black...,[.css-14iz86j-BoldText{font-weight:bold;}A pol...,2020-10-30T23:13:42.000Z
9,5f9d4416ccc52a3bd06163c6,9,Ivory Coast elections: Voters go to the polls ...,[.css-14iz86j-BoldText{font-weight:bold;}Votes...,2020-10-31T05:06:48.000Z
