![Diwan logo](https://cairowestmag.com/wp-content/uploads/2017/03/Diwan-Bookstores-Logo.png)

# Diwan website books scraping
**Note: All the links were collected on 11-02-2023**

In this notebook, we will scrape the ISBN of every book on the Diwan website, using the links that we collected in the previous notebooks. These ISBNs will be used later to get the books' information from Goodreads.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import threading
import requests
import csv

In [None]:
# Load the CSV of previously collected book links; its 'Link' column is what
# gets split across the scraping threads further down in this notebook.
df = pd.read_csv('my_books_lib_diwan_arabic.csv')

In [None]:
# Shared result store, filled by every scraping thread: maps book page link -> ISBN text.
ISBNs = {}


def collect_books_ISBN(links, thread_number):
    """Fetch each book page in ``links`` and record its ISBN in ``ISBNs``.

    Links whose page cannot be downloaded, or whose page has no
    ``<li class="isbn">`` element, are skipped so that a single bad link
    never stops the rest of the batch.

    Args:
        links: Iterable of book page URLs to scrape.
        thread_number: Label used only to prefix the progress messages.

    Returns:
        None. Results are stored in the module-level ``ISBNs`` dict,
        keyed by link.
    """
    for index, link in enumerate(links):
        print(f'Thread #{thread_number}: Start Index {index}')
        try:
            # Without a timeout a stalled connection would block this thread forever.
            response = requests.get(link, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            # A network/HTTP failure on one link must not kill the whole thread.
            print(f'Thread #{thread_number}: Request failed for {link}: {error}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        isbn_tag = soup.find('li', class_='isbn')
        if isbn_tag is None:
            # This page does not list an ISBN; nothing to record.
            continue
        ISBNs[link] = isbn_tag.text

In [None]:
# Partition the 'Link' column into 8 chunks, one for each worker thread.
dfs = np.array_split(df['Link'], 8)

# Build one scraping thread per chunk, numbering the threads from 1.
threads = [
    threading.Thread(target=collect_books_ISBN, args=(chunk, str(number)))
    for number, chunk in enumerate(dfs, start=1)
]

# Launch every thread, then block until all of them have finished.
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()

In [None]:
# Write every (Link, ISBN) pair that was collected to a CSV file.
with open('Diwan_books_isbn_links.csv', 'w', encoding='utf-8-sig', newline="") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['Link', 'ISBN'])
    # Each dict item is already a (link, isbn) pair, i.e. one CSV row.
    csv_writer.writerows(ISBNs.items())

In [None]:
# Write only the collected ISBN values to a single-column CSV file.
with open('Diwan_books_only_isbn.csv', 'w', encoding='utf-8-sig', newline="") as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['ISBN'])
    # Wrap each value in a list so it is written as one cell per row.
    csv_writer.writerows([isbn_value] for isbn_value in ISBNs.values())