In [None]:
#Straightforward way of crawling 2 links deep, starting from the front page of The New York Times
import requests
from bs4 import BeautifulSoup

# Upper bound on how many front-page links the crawl loop in this cell follows.
num_of_links=15

def get_links(url):
    """Fetch ``url`` and return the set of anchor hrefs that start with ``https``.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
    ``lxml`` parser. Anchors without an ``href`` (or with an empty one) are
    skipped, as are hrefs that do not begin with ``https``.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'lxml')

    found = set()
    for anchor in parsed.find_all('a'):
        href = anchor.get('href')
        if href and href.startswith('https'):
            found.add(href)
    return found

# First hop: every https link on the front page.
links = get_links('https://www.nytimes.com')

# Second hop: follow front-page links until num_of_links of them were visited.
all_links = set()
i = 0
for link in links:
    if i < num_of_links:
        all_links.update(get_links(link))
        i += 1
    else:
        break

In [None]:
#Using itertools (though the time doesn't improve much)
import itertools as it
import requests
from bs4 import BeautifulSoup

def get_links(url):
    """Return the set of ``https`` hrefs found in the ``<a>`` tags of ``url``.

    The response body is parsed with BeautifulSoup using the ``lxml`` parser;
    missing or empty hrefs are ignored.
    """
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'lxml')
    # Pull every href first, then keep only the non-empty https ones.
    hrefs = (tag.get('href') for tag in document.find_all('a'))
    return {href for href in hrefs if href and href.startswith('https')}

links = get_links('https://www.nytimes.com')
# Lazily fetch the link set of each front-page link ...
links_on_pages = (get_links(page_url) for page_url in links)
# ... and flatten all of those sets into a single set of unique links.
all_links = {found for page_links in links_on_pages for found in page_links}

In [None]:
#using multiprocessing and itertools (still the computation time doesn't improve)
from multiprocessing import Pool
import itertools as it
import requests
from bs4 import BeautifulSoup

def get_links(url):
    """Download ``url`` and collect the distinct ``https`` targets of its anchors.

    Returns a set of href strings; anchors whose href is absent, empty, or does
    not start with ``https`` contribute nothing.
    """
    html = requests.get(url).text
    anchors = BeautifulSoup(html, 'lxml').find_all('a')
    targets = [anchor.get('href') for anchor in anchors]
    return set(filter(lambda href: href and href.startswith('https'), targets))

links = get_links('https://www.nytimes.com')

# This cell imports the class directly (`from multiprocessing import Pool`), so the
# bare name `multiprocessing` is not defined here; use `Pool`.
# The work also has to go through the pool's own `map`: the builtin `map` is lazy
# and would run every request serially in this process, leaving the workers idle.
# NOTE(review): functions defined inside a notebook may not be importable by
# worker processes on platforms that use the "spawn" start method -- confirm on
# the target platform.
with Pool(20) as p:
    links_on_pages = p.map(get_links, links)
all_links = set(it.chain.from_iterable(links_on_pages))

In [None]:
#For multiprocessing to work in Jupyter notebooks, the worker function has to live in a .py file (a temporary one in this case),
#and the pool has to be started under the if __name__ == '__main__': guard
from multiprocessing import Pool
import itertools as it
import requests
from bs4 import BeautifulSoup
from functools import partial
import inspect

def parallal_task(func, iterable, processes=20):
    """Map ``func`` over ``iterable`` in a process pool, via a temporary module.

    The source of ``func`` is written to ``./tmp_func.py`` (together with the
    ``requests`` / ``BeautifulSoup`` imports it is expected to need) and exposed
    there under the name ``task``; that module-level function is what gets sent
    to the worker processes.

    Args:
        func: Function whose source ``inspect.getsource`` can retrieve. It may
            only rely on ``requests`` and ``BeautifulSoup`` as globals, because
            those are the only imports written to the temporary module.
        iterable: Items passed one by one to ``func``.
        processes: Number of worker processes (defaults to 20).

    Returns:
        list: Results of ``func`` for each item, in input order.

    Raises:
        RuntimeError: If the defining module is not running as ``__main__``.
    """
    # Local imports: only this helper needs them.
    import importlib
    import sys

    # Check the guard before touching the file system. The original raised a
    # plain string here, which is itself a TypeError in Python 3.
    if __name__ != '__main__':
        raise RuntimeError("Not in Jupyter Notebook")

    with open('./tmp_func.py', 'w') as file:
        # Imports go first so the generated module reads top-down.
        file.write('import requests\n')
        file.write('from bs4 import BeautifulSoup\n\n')
        file.write(inspect.getsource(func))
        # Alias instead of textually renaming: a blind str.replace would also
        # rewrite every other occurrence of the name inside the source.
        file.write(f'\ntask = {func.__name__}\n')

    # A plain `from tmp_func import task` would return the module cached in
    # sys.modules on every call after the first, silently running the
    # previously written function. Reload so the fresh file is picked up.
    importlib.invalidate_caches()
    if 'tmp_func' in sys.modules:
        module = importlib.reload(sys.modules['tmp_func'])
    else:
        module = importlib.import_module('tmp_func')

    # The context manager releases the workers even if map() raises.
    with Pool(processes=processes) as pool:
        return pool.map(module.task, iterable)
        
def get_links(url):
    """Return every distinct ``https`` href found in the anchors of ``url``.

    Uses only ``requests`` and ``BeautifulSoup`` so the source can be copied
    into a separate module unchanged.
    """
    reply = requests.get(url)
    tree = BeautifulSoup(reply.text, 'lxml')

    https_links = set()
    for node in tree.find_all('a'):
        target = node.get('href')
        if not target:
            # No href attribute, or an empty one.
            continue
        if target.startswith('https'):
            https_links.add(target)
    return https_links

links = get_links('https://www.nytimes.com')
# One set of links per front-page link, computed by the worker pool.
links_on_pages = parallal_task(get_links, links)
# Merge the per-page sets into one set of unique links.
all_links = set().union(*links_on_pages)

In [None]:
# Display whatever `all_links` currently holds (set by whichever crawl cell ran last).
all_links

In [90]:
#CRAWLER TO DOWNLOAD PGN CHESS FILES FROM A WEBSITE
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
import inspect

def parallal_task(func, iterable, processes=2):
    """Map ``func`` over ``iterable`` in a process pool, via a temporary module.

    The source of ``func`` is written to ``./tmp_func.py`` (together with the
    ``requests`` / ``BeautifulSoup`` imports it is expected to need) and exposed
    there under the name ``task``; that module-level function is what gets sent
    to the worker processes.

    Args:
        func: Function whose source ``inspect.getsource`` can retrieve. It may
            only rely on ``requests`` and ``BeautifulSoup`` as globals, because
            those are the only imports written to the temporary module.
        iterable: Items passed one by one to ``func``.
        processes: Number of worker processes (defaults to 2).

    Returns:
        list: Results of ``func`` for each item, in input order.

    Raises:
        RuntimeError: If the defining module is not running as ``__main__``.
    """
    # Local imports: only this helper needs them.
    import importlib
    import sys

    # Check the guard before touching the file system. The original raised a
    # plain string here, which is itself a TypeError in Python 3.
    if __name__ != '__main__':
        raise RuntimeError("Not in Jupyter Notebook")

    with open('./tmp_func.py', 'w') as file:
        # Imports go first so the generated module reads top-down.
        file.write('import requests\n')
        file.write('from bs4 import BeautifulSoup\n\n')
        file.write(inspect.getsource(func))
        # Alias instead of textually renaming: a blind str.replace would also
        # rewrite every other occurrence of the name inside the source.
        file.write(f'\ntask = {func.__name__}\n')

    # If tmp_func was already imported in this kernel (e.g. for a different
    # function), a plain `from tmp_func import task` would hand back the stale
    # cached module. Reload so the freshly written file is the one that runs.
    importlib.invalidate_caches()
    if 'tmp_func' in sys.modules:
        module = importlib.reload(sys.modules['tmp_func'])
    else:
        module = importlib.import_module('tmp_func')

    # The context manager releases the workers even if map() raises.
    with Pool(processes=processes) as pool:
        return pool.map(module.task, iterable)
        
def req_links(itera):
    """Download one archive and save it as ``twic<folio>.zip`` in the working directory.

    Args:
        itera: A ``(url, folio)`` pair; ``url`` is fetched and ``folio`` is used
            to build the output file name.

    Returns:
        None. The outcome is reported with ``print``.
    """
    url = itera[0]
    folio = itera[1]
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 OPR/112.0.0.0"}
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, so `requests.get(url, headers)` would put the
    # user-agent into the query string and send no custom header at all.
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        # Save the content to a file
        with open(f"twic{folio}.zip", "wb") as file:
            file.write(r.content)
        print("File downloaded successfully!")
    else:
        print(f"Failed to download file. Status code: {r.status_code}")

# Issue numbers to fetch, newest first: 1549 down to 1546.
folios = [*range(1549, 1545, -1)]
links = [f'https://theweekinchess.com/zips/twic{fol}g.zip' for fol in folios]
# Each work item is a (url, folio) pair.
links_on_pages = parallal_task(req_links, [*zip(links, folios)])

In [87]:
# Preview the (url, folio) pairs built from `links` and `folios`.
a=list(zip(links,folios))
a


[('https://theweekinchess.com/zips/twic1549g.zip', 1549),
 ('https://theweekinchess.com/zips/twic1548g.zip', 1548),
 ('https://theweekinchess.com/zips/twic1547g.zip', 1547),
 ('https://theweekinchess.com/zips/twic1546g.zip', 1546)]

In [1]:
import requests

def req_links(itera):
    """Download one archive given a ``(url, folio)`` pair and report the outcome.

    On HTTP 200 the response body is written to ``twic<folio>.zip`` under the
    hard-coded Downloads path; any other status only prints a failure message.
    Nothing is returned.
    """
    url, folio = itera[0], itera[1]
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 OPR/112.0.0.0"}
    r = requests.get(url, headers=headers)

    # Handle the failure case first so the success path is not nested.
    if r.status_code != 200:
        print(f"Failed to download file twic{folio}.zip. Status code: {r.status_code}")
        return

    with open(fr"C:\Users\Dell-G3\Downloads\twic{folio}.zip", "wb") as file:
        file.write(r.content)
    print(f"File twic{folio}.zip downloaded successfully!")

# Issue numbers to fetch, newest first: 1549 down to 1546.
folios = [*range(1549, 1545, -1)]
links = [f'https://theweekinchess.com/zips/twic{fol}g.zip' for fol in folios]
list_of_urls_and_folios = [*zip(links, folios)]

# Download the archives one after another.
for itera in list_of_urls_and_folios:
    req_links(itera)

File twic1549.zip downloaded successfully!
File twic1548.zip downloaded successfully!
File twic1547.zip downloaded successfully!
File twic1546.zip downloaded successfully!


In [None]:
#IMPORTANT THIS ONLY WORKS AS A STAND ALONE .py FILE. IN GENERAL JUPYTER NOTEBOOK HAS PROBLEMS DEALING WITH MULTIPROCESSING
from multiprocessing import Pool
import requests

def download_file(itera):
    """Download one archive given a ``(url, folio)`` pair and return a status message.

    On HTTP 200 the response body is written to ``twic<folio>.zip`` under the
    hard-coded Downloads path and a success message is returned; otherwise a
    failure message containing the status code is returned.
    """
    url, folio = itera[0], itera[1]
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 OPR/112.0.0.0"}
    r = requests.get(url, headers=headers)

    # Bail out early on anything but a plain 200.
    if r.status_code != 200:
        return f"Failed to download file twic{folio}.zip. Status code: {r.status_code}"

    with open(fr"C:\Users\Dell-G3\Downloads\twic{folio}.zip", "wb") as file:
        file.write(r.content)
    return f"File twic{folio}.zip downloaded successfully!"

if __name__ == '__main__':
    # Issue numbers to fetch, newest first: 1549 down to 1546.
    folios = [*range(1549, 1545, -1)]
    links = [f'https://theweekinchess.com/zips/twic{fol}g.zip' for fol in folios]
    list_of_urls_and_folios = [*zip(links, folios)]

    # One worker per archive; map() returns the status messages in input order.
    with Pool(processes=4) as pool:
        results = pool.map(download_file, list_of_urls_and_folios)

    for result in results:
        print(result)
