### Web Scraper with Command Line Interface


In [2]:
!pip install requests beautifulsoup4 beautifultable certifi cffi charset-normalizer cryptography h11 idna outcome pycparser pyOpenSSL PySocks requests selenium sniffio sortedcontainers soupsieve trio trio-websocket urllib3 wcwidth wsproto

Collecting beautifultable
  Downloading beautifultable-1.1.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting h11
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting outcome
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting wsproto
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading beautifultable-1.1.0-py2.py3-none-any.whl (28 kB)
Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Downloading selenium-4.24.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the LLaMA model from Hugging Face.
# Both from_pretrained() calls below run as soon as this cell/module executes.
model_name = "meta-llama/Llama-2-7b-hf"
# Placeholder value: create a token on Hugging Face and put it here before running.
# NOTE(review): presumably required because this checkpoint is access-gated — confirm.
access_token = "Your_huggingface_token_here"

# The tokenizer and the model are loaded from the same checkpoint with the same token.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)

# Prompt template for the LLaMA model loaded above.
# Placeholders filled in with str.format():
#   {dom_content}       - the text/HTML the model should read
#   {parse_description} - the user's description of what to extract
# Every numbered instruction ends with a separator so that consecutive items
# do not run together once the adjacent string literals are concatenated.
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

# Load existing scraped data from JSON
def load_json(database_json_file="scraped_data.json"):
    """Read the JSON database file and return its decoded contents.

    Args:
        database_json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document, or ``{"scraped_data": {}}`` when the
        file does not exist.
    """
    try:
        db_handle = open(database_json_file, "r")
    except FileNotFoundError:
        # No database yet: start from an empty one.
        return {"scraped_data": {}}
    with db_handle:
        return json.load(db_handle)

# Save scraped data to JSON
def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """Serialize ``data`` as indented JSON and overwrite the database file.

    Args:
        data: JSON-serializable object to store.
        database_json_file: Path of the file to (re)write.
    """
    with open(database_json_file, "w") as db_handle:
        serialized = json.dumps(data, indent=4)
        db_handle.write(serialized)

# Initialize scraped data in JSON
def existing_scraped_data_init(json_db):
    """Make sure ``json_db`` has a non-None "scraped_data" entry.

    The dictionary is modified in place: when the key is missing or maps to
    None it is set to an empty dict; any other existing value is left alone.

    Args:
        json_db: The loaded database dictionary.
    """
    if json_db.get("scraped_data") is not None:
        return
    json_db['scraped_data'] = {}

# Get current time in a specific format
def scraped_time_is():
    """Return the current local time as a ``DD/MM/YYYY HH:MM:SS`` string."""
    stamp_format = "%d/%m/%Y %H:%M:%S"
    return datetime.now().strftime(stamp_format)

# Make a request to the URL and get the page content
def process_url_request(website_url, timeout=10):
    """Fetch ``website_url`` with HTTP GET and parse the body as HTML.

    Args:
        website_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A BeautifulSoup object built from the response text when the status
        code is 200; None for any other status code or when the request
        itself fails (connection error, timeout, malformed URL, ...).
    """
    try:
        response = requests.get(website_url, timeout=timeout)
    except requests.RequestException as error:
        # Report the failure the same way as a non-200 answer (None) so the
        # interactive loop keeps running instead of crashing.
        print(f"Request to {website_url} failed: {error}")
        return None

    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')

# Process the BeautifulSoup object to extract relevant data
def proccess_beautiful_soup_data(soup):
    """Collect the title, links, images, headings and paragraphs of a page.

    Args:
        soup: Parsed page exposing ``find`` and ``find_all``.

    Returns:
        A dict with the page title (or 'No title found'), the href of every
        anchor that has one, the markup of every anchor and image, every
        image's ``src`` attribute (as returned by ``get``), and the text of
        every h1, h2, h3 and p element.
    """
    title_tag = soup.find('title')
    anchor_tags = soup.find_all('a')
    image_tags = soup.find_all('img')

    def texts_of(tag_name):
        # Text content of every element with the given tag name.
        return [element.text for element in soup.find_all(tag_name)]

    if title_tag:
        page_title = title_tag.text
    else:
        page_title = 'No title found'

    return {
        'title': page_title,
        'all_anchor_href': [link['href'] for link in soup.find_all('a', href=True)],
        'all_anchors': [str(link) for link in anchor_tags],
        'all_images_data': [str(image) for image in image_tags],
        'all_images_source_data': [image.get('src') for image in image_tags],
        'all_h1_data': texts_of('h1'),
        'all_h2_data': texts_of('h2'),
        'all_h3_data': texts_of('h3'),
        'all_p_data': texts_of('p'),
    }

# Function to interact with LLaMA model for parsing
def prompt_model_for_parsing(dom_content, parse_description):
    """Ask the model to extract ``parse_description`` from ``dom_content``.

    The module-level ``template`` is filled with both arguments, tokenized
    with the module-level ``tokenizer`` and passed to ``model.generate`` with
    a limit of 200 new tokens.

    Args:
        dom_content: Page content to embed in the prompt.
        parse_description: Description of the data to extract.

    Returns:
        The model's answer decoded with ``json.loads`` when it is valid JSON;
        otherwise ``{'parsed_data': <raw answer text>}``.
    """
    # Use the template to generate the prompt
    # NOTE(review): the whole page is embedded without truncation; a large
    # page may exceed the model's context window — confirm.
    prompt = template.format(dom_content=dom_content, parse_description=parse_description)
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=200)

    # A causal LM returns the prompt tokens followed by the newly generated
    # ones. Decode only the new tokens; otherwise the "answer" would start
    # with the whole prompt (page content included) and could never be JSON.
    prompt_length = inputs["input_ids"].shape[-1]
    model_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # This is to check if the output is in valid JSON format
    try:
        # Attempt to parse the model's output as JSON
        parsed_output = json.loads(model_output)
    except json.JSONDecodeError:
        print("Model output is not valid JSON. Saving as text.")
        parsed_output = {'parsed_data': model_output}  # Save raw text inside a JSON structure

    return parsed_output

# Main Logic starts here
# Interactive menu loop:
#   1 - list the websites stored in the JSON database
#   2 - scrape one URL, store it in the database and save the model's
#       extraction to a timestamped JSON file
#   3 - exit
while True:
    print("""  ================ Welcome to this scraping program =============
    ==>> press 1 for checking existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)

    # Non-numeric input would make int() raise ValueError and end the
    # session; treat it like any other invalid choice.
    try:
        choice = int(input("==>> Please enter your choice :"))
    except ValueError:
        print("Please enter a valid choice.")
        continue

    # Load data
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)

        for count, record in enumerate(local_json_db['scraped_data'].values()):
            scraped_websites_table.rows.append([count + 1,
                                                record['domain'],
                                                record['title'],
                                                record['scraped_at'],
                                                record['status']])
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)

    elif choice == 2:
        parse_description = input("Enter the specific data you want to extract (e.g., all headers, links, etc.): ")
        url_for_scrap = input("===> Please enter the URL you want to scrape:")

        # Make the request and process the data
        is_accessable = process_url_request(url_for_scrap)
        if is_accessable:
            scraped_data_packet = proccess_beautiful_soup_data(is_accessable)
            print(' =====> Data scraped successfully !!!')

            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['scraped_at'] = scraped_time_is()
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc

            # Persist the scrape so that option 1 can list it later. The URL
            # is the key, so scraping the same URL again replaces its entry.
            # Saved before the model call so the scrape survives a model failure.
            local_json_db['scraped_data'][url_for_scrap] = scraped_data_packet
            save_scraped_data_in_json(local_json_db)

            # Extract the relevant data based on user instructions
            dom_content = str(is_accessable)
            model_output = prompt_model_for_parsing(dom_content, parse_description)

            # Automatically save as JSON; reuse the recorded scrape time so
            # the file name matches the 'scraped_at' value in the database.
            timestamp = scraped_data_packet['scraped_at'].replace("/", "_").replace(" ", "_").replace(":", "_")
            json_filename = f"parsed_data_{timestamp}.json"

            # Save the parsed output
            with open(json_filename, "w") as f:
                json.dump(model_output, f, indent=4)
            print(f'Parsed data saved to {json_filename}.')
        else:
            # process_url_request returned None: tell the user instead of
            # silently returning to the menu.
            print(f' =====> Could not fetch {url_for_scrap}; nothing was scraped.')

    elif choice == 3:
        print('Thank you for using the scraper!')
        break

    else:
        print("Please enter a valid choice.")


    ==>> press 1 for checking existing scraped websites
    ==>> press 2 for scrap a single website
    ==>> press 3 for exit
    
==>> Please enter your choice :2

===> Please enter url you want to scrap:https://google.com/

 =====> Data scraped successfully !!!
enter alias name for saving scraped data :googl.
scraped data is: {'title': 'Google', 'all_anchor_href': ['https://www.google.com/imghp?hl=en&tab=wi', 'https://maps.google.com/maps?hl=en&tab=wl', 'https://play.google.com/?hl=en&tab=w8', 'https://www.youtube.com/?tab=w1', 'https://news.google.com/?tab=wn', 'https://mail.google.com/mail/?tab=wm', 'https://drive.google.com/?tab=wo', 'https://www.google.com/intl/en/about/products?tab=wh', 'http://www.google.com/history/optout?hl=en', '/preferences?hl=en', 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ', '/advanced_search?hl=en&authuser=0', 'https://www.google.com/url?q=https://www.google.com/search%3Fq%3Dhow%2Bto%2Bregister%