<a href="https://colab.research.google.com/github/alyrazik/News_content_collect_store/blob/main/CrawlingBBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  News Content Collect and Store
## by Aly Abdelrazek
 
 


## Setup and importing libraries

In [71]:
!python -m pip install pymongo[srv]



In [72]:
import os
import pickle
import time
from datetime import datetime
from time import sleep

import numpy as np
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient


In [73]:
# Display the version string of the installed pymongo package.
pymongo.version

'3.11.0'

## Global variables

In [74]:
# Root address of the BBC site.
parent = "https://bbc.com"
# Address of the BBC news front page.
news_page_url = "https://bbc.com/news"
# Upper bound on the number of articles returned for one keyword search.
SEARCH_LIMIT = 10


## Crawl


In [75]:
# Obtains a list of links from a page_url

def give_links(url, parent, timeout=10):
  '''
  Download a page and return the absolute web links found in its anchor tags.

  Args:
    -url: address of the page to download (<string>)
    -parent: site root that is prepended to site-relative links starting with "/news" (<string>)
    -timeout: seconds to wait for the server before giving up (<int> or <float>)
  Returns:
    -A python list of link strings starting with "http://" or "https://", in page order.
     Duplicates are kept; relative links that do not start with "/news" are dropped.
  Raises:
    -requests.exceptions.RequestException if the page cannot be downloaded in time
  '''
  # Without a timeout, requests can wait forever on a stalled connection.
  page_html = requests.get(url, timeout=timeout).text
  page_soup = BeautifulSoup(page_html, "html.parser")

  absolute_links = []
  for anchor in page_soup.find_all("a"):
    href = anchor.get("href")
    if href is None:  # anchor without a target
      continue
    if href.startswith("/news"):
      href = parent + href
    # Test the scheme at the start of the address: a substring test would also accept
    # addresses such as "mailto:?body=http://..." or "/share?u=http://...".
    if href.startswith(("http://", "https://")):
      absolute_links.append(href)

  return absolute_links
 



In [76]:

def is_news_article(link, timeout=10):
  '''
  Test whether a web link points to a BBC news article.

  Links without "/news" in the address are rejected without being downloaded.
  Any other link is downloaded and checked for the CSS class of the article layout.

  Args:
    -link: address of the page to test (<string>)
    -timeout: seconds to wait for the server before giving up (<int> or <float>)
  Returns:
    -1 if the page is a news article, 0 otherwise. 0 is also returned when the page
     cannot be downloaded or parsed, after a message has been printed.
  '''
  if "/news" not in link: #news articles have /news in address. if not, it is considered not a news article.
    return 0

  try:
    s = BeautifulSoup(requests.get(link, timeout=timeout).text , "html.parser")
    #this class is only available in news articles
    found = s.find(class_= "css-16rg7hm-ContainerWithSidebarWrapper e1jl38b40") is not None
  except requests.exceptions.ConnectionError:
    print("Connection to page refused")
    return 0
  except Exception:
    # Best effort: one broken page must not stop the crawl. Catching Exception rather
    # than using a bare except lets KeyboardInterrupt/SystemExit still end the run.
    print("An error occured while trying to connect to link")
    return 0

  return 1 if found else 0


In [78]:
def follow_links(origin_link, test_func, n_other_links_to_follow = 1):
  '''
  Crawl outwards from a link, sorting every discovered link with test_func.

  Pages are crawled in the order they were discovered, starting with origin_link.
  Links for which test_func is true go to matching_urls; all others are queued in
  other_urls, which is also the list of pages still waiting to be crawled.
  Each link is tested at most once and each page is downloaded at most once.
  Links are fetched with give_links using the module-level `parent` address.

  Args:
    -origin link: a string of the URL address (<string>)
    -test_func: a callable that takes a string of a URL link and outputs 1 if the link matches a specified pattern, 0 otherwise.
    -n_other_links_to_follow : maximum number of pages to crawl, the origin included (<int>)
  Returns:
    -Two python lists without duplicates: matching_urls, and other_urls (the
     non-matching links that have not been crawled)
  '''
  matching_urls = []
  other_urls = [origin_link]  # queue of pages waiting to be crawled
  crawled = set()             # pages already downloaded
  classified = set()          # links already passed through test_func

  start_time = time.time()
  print("following links...")

  pages_followed = 0
  # Stop early when the queue runs dry instead of indexing past its end.
  while pages_followed < n_other_links_to_follow and other_urls:
    # Always take the oldest queued page; indexing by the loop counter after a
    # pop(0) would skip pages and throw away ones that were never crawled.
    current = other_urls.pop(0)
    if current in crawled:  # e.g. a page that links back to the origin
      continue
    crawled.add(current)
    pages_followed += 1

    for link in give_links(current, parent):
      if link in classified:
        continue  # test_func usually costs a download, so never repeat it
      classified.add(link)
      if test_func(link):
        matching_urls.append(link)
      else:
        other_urls.append(link)

  end_time = time.time()
  execution_time = end_time - start_time
  # NOTE(review): "{:2f}" is a minimum width of 2 with the default six decimals, not
  # two decimals; left unchanged so the printed output stays the same.
  print("completed in {:2f} seconds".format(execution_time))

  return matching_urls, other_urls

In [79]:
# Crawl from the news front page with the default of one followed page, splitting the
# discovered links into news-article URLs and all other URLs.
news_urls, other_urls = follow_links(news_page_url, is_news_article)

following links...
completed in 78.790332 seconds


In [80]:
# Number of links that were classified as news articles.
len(news_urls)

25

In [81]:
# Number of links that were not classified as news articles.
len(other_urls)

104

## Scrape

In [82]:
# Download every article URL and collect heading, paragraphs and timestamp as one
# dictionary per article, ready to be inserted into MongoDB.
documents = []
document_index = 0

for url in news_urls:
    print("Fetching {}".format(url))
    # A timeout keeps a single stalled server from blocking the whole scrape.
    response = requests.get(url, timeout=10)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    heading_tag = soup.find('h1')
    article_tag = soup.find('article')
    time_tag = soup.find('time')
    # find() returns None for a missing tag; skip atypical pages instead of letting
    # one of them abort the run with an AttributeError/TypeError.
    if heading_tag is None or article_tag is None or time_tag is None or not time_tag.has_attr('datetime'):
        print("Skipping {}: heading, article body or timestamp not found".format(url))
        continue

    news_heading = heading_tag.text
    news_text = [p.text for p in article_tag.find_all('p')]
    news_time = time_tag['datetime']

    documents.append({"Document_Index":str(document_index), "URL":url, "Heading":news_heading, "Article":news_text, "DateTime":news_time})
    # Incremented only for stored articles, so the index matches the position in `documents`.
    document_index = document_index+1


Fetching https://bbc.com/news/election-us-2020-54756915
Fetching https://bbc.com/news/election-us-2020-53657174
Fetching https://bbc.com/news/world-us-canada-54665375
Fetching https://bbc.com/news/world-us-canada-54751759
Fetching https://bbc.com/news/world-europe-54759443
Fetching https://bbc.com/news/uk-54756950
Fetching https://bbc.com/news/uk-northern-ireland-54750668
Fetching https://bbc.com/news/world-europe-54747022
Fetching https://bbc.com/news/world-us-canada-54752228
Fetching https://bbc.com/news/world-africa-54733425
Fetching https://bbc.com/news/world-europe-54752194
Fetching https://bbc.com/news/uk-54759343
Fetching https://bbc.com/news/election-us-2020-54731172
Fetching https://bbc.com/news/election-us-2020-54736083
Fetching https://bbc.com/news/world-54553132
Fetching https://bbc.com/news/uk-wales-54692567
Fetching https://bbc.com/news/world-europe-54747020
Fetching https://bbc.com/news/world-asia-54717686
Fetching https://bbc.com/news/health-54661843
Fetching https://bb

## Saving to MongoDB

In [83]:
#connect to client
# The connection string contains the database user and password, so it is read from
# the MONGODB_URI environment variable instead of being stored in the notebook.
# Expected form: mongodb+srv://<user>:<password>@<cluster-host>/<db>?retryWrites=true&w=majority
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string")
client = MongoClient(mongodb_uri)

In [84]:
# Get a handle to the "news_database" database on the connected client.
db = client["news_database"]

In [85]:
# Get a handle to the "bbc_news" collection (MongoDB's counterpart of a table).
bbc_news = db["bbc_news"]

In [86]:
# Delete the documents matched by the empty filter {} from the "bbc_news" collection.
db['bbc_news'].delete_many({})

<pymongo.results.DeleteResult at 0x7ff388696788>

In [87]:
# Store all scraped article documents in the collection with a single call.
bbc_news.insert_many(documents)

<pymongo.results.InsertManyResult at 0x7ff3886be788>

## Retrieving content from MongoDB

In [88]:
def retrieve_documents(database, collection):
  '''
  Load every document of a MongoDB collection into a pandas dataframe.
  Args:
    -database: MongoDB database holding the collection
    -collection: name of the collection to read (<string>)
  Returns:
    -A pandas dataframe built from the documents returned by find()
  '''
  # `collection` holds the name as a string, so it is looked up with [] rather than as an attribute.
  every_document = database[collection].find()
  return pd.DataFrame(every_document)

# Read the "bbc_news" collection of "news_database" into the dataframe `df`.
db = client["news_database"]
df = retrieve_documents(database = db , collection = 'bbc_news')

## Keyword search in article text

#### Create a text index - required for text search

In [89]:
# Build one text index named "search_index" over the Article and Heading fields.
db = client["news_database"]
db.bbc_news.create_index([
          ("Article", "text"),
          ("Heading", "text"),
  ],
  name="search_index",
  # Heading is given four times the weight of Article (100 vs 25).
  weights={
      'Article':25,
      'Heading':100
  }
)

'search_index'

#### Search function

In [90]:
def search(search_text, database, collection, limit=None):
  '''
  Run a MongoDB text search and return the matching articles.
  Args:
    -search_text: a string of keywords (<string>)
    -database: MongoDB database holding the collection (<pymongo.database.Database>)
    -collection: string with the name of MongoDB collection (<string>)
    -limit: maximum number of articles to return; None (the default) uses the
     module-level SEARCH_LIMIT (<int>)
  Returns:
    -A pandas dataframe containing the MongoDB contents of the returned news articles
  '''
  # Fall back to the notebook-wide setting only when the caller gives no limit, so a
  # single call can ask for more or fewer results without editing the global.
  if limit is None:
    limit = SEARCH_LIMIT
  # NOTE(review): no sort is applied, so the limit presumably does not keep the most
  # relevant matches first - confirm against the MongoDB $text documentation.
  returned_cursor = database[collection].find({"$text": {"$search": search_text}}).limit(limit)
  return pd.DataFrame(returned_cursor)


In [93]:
# Example query: search the stored articles for the keyword "indian".
output = search("indian", db, 'bbc_news')

In [94]:
# Show the URL column of the search results.
output['URL']

0    https://bbc.com/news/world-asia-india-54759863
1    https://bbc.com/news/world-asia-india-54655948
2       https://www.bbc.co.uk/news/stories-54623417
Name: URL, dtype: object