In [None]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urlparse

def get_page_load_time(url, timeout=10):
    """Measure how long a complete GET request for ``url`` takes.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            server that never answers cannot block the caller indefinitely.

    Returns:
        Elapsed seconds (float) when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` error,
        including a timeout).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        # perf_counter is monotonic and high resolution, so the interval is
        # not distorted by system clock adjustments the way time.time() is.
        start_time = time.perf_counter()
        response = requests.get(url, headers=headers, timeout=timeout)
        load_time = time.perf_counter() - start_time
    except requests.exceptions.RequestException:
        return None
    return load_time if response.status_code == 200 else None

def check_https(url):
    """Return True when the scheme of ``url`` is ``https``, else False."""
    scheme = urlparse(url).scheme
    return scheme == 'https'

def get_meta_data(url, timeout=10):
    """Fetch ``url`` and extract its <title> text and meta description.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        A ``(title, description)`` tuple of strings. Either element is ``''``
        when the page does not provide it; ``('', '')`` is returned when the
        request itself fails.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        return '', ''

    soup = BeautifulSoup(response.text, 'html.parser')

    # Tag.string is None for an empty <title> or one with nested markup;
    # normalise that to '' so callers always receive a string.
    title = (soup.title.string or '') if soup.title else ''

    # A description tag without a content attribute used to raise KeyError.
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    description = meta_tag.get('content', '') if meta_tag is not None else ''

    return title, description

def check_image_optimization(url, timeout=10):
    """Return the fraction of <img> tags on ``url`` that have alt text.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        A number between 0 and 1: images with non-blank ``alt`` divided by all
        images. A page without images scores 1; a failed request scores 0.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    images = soup.find_all('img')
    if not images:
        return 1

    # Strip before testing so alt="   " is not counted as a description,
    # matching the rule check_alt_attributes applies.
    optimized_images = sum(1 for img in images if (img.get('alt') or '').strip())
    return optimized_images / len(images)

def calculate_seo_score(url):
    """Combine several page signals into a 0-100 SEO score, print it, return it.

    Each signal is scored and multiplied by its weight; the weighted sum is
    then expressed as a percentage of the total weight.
    """
    # Weight given to each signal.
    weights = dict(
        load_time=20,
        https=20,
        meta_tags=20,
        image_optimization=20,
        mobile_friendly=20,
    )

    scores = {}

    # Load time: linear falloff that reaches 0 at two seconds.
    elapsed = get_page_load_time(url)
    if elapsed:
        scores['load_time'] = max(0, (2 - elapsed) / 2)
    else:
        scores['load_time'] = 0

    uses_https = check_https(url)
    scores['https'] = 1 if uses_https else 0

    # Both a title and a description are required for the meta-tag point.
    page_title, page_description = get_meta_data(url)
    scores['meta_tags'] = 1 if (page_title and page_description) else 0

    scores['image_optimization'] = check_image_optimization(url)

    scores['mobile_friendly'] = 0.9  # Replace with actual score from PageSpeed Insights API

    # Summed in the insertion order of `weights`.
    weighted_total = sum(scores[name] * weight for name, weight in weights.items())

    best_possible = sum(weights.values())  # Maximum possible score
    seo_score = weighted_total / best_possible * 100  # Convert to percentage

    print(f"SEO Score for {url}: {seo_score:.2f}/100")
    return seo_score

def get_website_data(url, timeout=10):
    """Fetch ``url`` and return its title and meta description.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        A dict with keys ``'title'`` and ``'meta_description'``. Missing
        values are replaced by the placeholder strings ``'Title not found'``
        and ``'Meta description not found'``. An empty dict is returned (and
        the error printed) when the request fails or the status is an error.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return {}

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Tag.string is None for an empty <title> or one with nested markup, so
    # fall back to the placeholder in that case as well as when it is absent.
    title = (soup.title.string if soup.title else None) or 'Title not found'

    # Get meta description
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if meta_desc is not None and meta_desc.has_attr('content'):
        description = meta_desc['content']
    else:
        description = 'Meta description not found'

    return {
        'title': title,
        'meta_description': description,
    }
from ping3 import ping
from urllib.parse import urlparse

def get_ping(url):
    """Ping the host named in ``url`` and report the round-trip time.

    Args:
        url: URL whose hostname should be pinged.

    Returns:
        Round-trip time in milliseconds (float), or ``None`` when the URL has
        no hostname, the host cannot be pinged, or any error occurs. A
        one-line status message is printed in every case.
    """
    try:
        # Extract hostname from URL (remove http/https)
        hostname = urlparse(url).hostname

        if not hostname:
            print("Invalid URL")
            return None

        # Ping the hostname; the delay is in seconds on success.
        response_time = ping(hostname)

        # ping3 signals a timeout with None and an error (for example an
        # unresolvable host) with False. Checking only for None let False
        # through, which was then reported as a 0.00 ms ping.
        if response_time is None or response_time is False:
            print(f"Failed to ping {hostname}")
            return None

        # Convert response time to milliseconds
        response_time_ms = response_time * 1000
        print(f"Ping to {hostname}: {response_time_ms:.2f} ms")
        return response_time_ms
    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic and any failure
        # is reported rather than raised.
        print(f"Error pinging {url}: {e}")
        return None

import dns.resolver
import time
from urllib.parse import urlparse

def get_dns_response_time(url):
    """Time an A-record lookup for the host named in ``url``.

    Args:
        url: URL whose hostname should be resolved.

    Returns:
        Seconds taken by the lookup (float), or ``None`` when the URL has no
        hostname or the resolution fails. The resolved addresses and the
        timing are printed on success, the error on failure.
    """
    # Extract the hostname from the URL
    hostname = urlparse(url).hostname

    if not hostname:
        print("Invalid URL")
        return None

    try:
        # perf_counter is monotonic and high resolution, which matters for an
        # interval that is typically only a few milliseconds long.
        start_time = time.perf_counter()

        # Perform DNS resolution
        answers = dns.resolver.resolve(hostname, 'A')  # 'A' record for IPv4 addresses

        dns_response_time = time.perf_counter() - start_time

        # Print the resolved IP addresses
        resolved_ips = [answer.to_text() for answer in answers]
        print(f"Resolved IP addresses for {hostname}: {resolved_ips}")
        print(f"DNS Response Time for {hostname}: {dns_response_time:.4f} seconds")

        return dns_response_time
    except Exception as e:
        # Deliberately broad: any resolver failure is reported, not raised.
        print(f"Error resolving {hostname}: {e}")
        return None



def get_ttfb(url, timeout=10):
    """Measure the time until the response headers for ``url`` arrive.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``Session.send`` as its timeout.

    Returns:
        Seconds elapsed between sending the request and ``send`` returning
        (float) when the status is 200, otherwise ``None`` (non-200 status or
        any ``requests`` error, including a timeout).
    """
    try:
        # The context manager closes the session and its pooled connections.
        with requests.Session() as session:
            prepared_request = session.prepare_request(requests.Request("GET", url))
            start = time.perf_counter()

            # stream=True makes send() return without downloading the body.
            response = session.send(prepared_request, stream=True, timeout=timeout)
            ttfb = time.perf_counter() - start

            status_code = response.status_code
            # The body is never read, so release the connection explicitly.
            response.close()
    except requests.exceptions.RequestException:
        return None
    return ttfb if status_code == 200 else None



def check_https(url):
    # NOTE: an identical check_https is defined earlier in this file; this
    # later definition rebinds the same name.
    return urlparse(url).scheme == 'https'

def get_content_size(url, timeout=10):
    """Return the size of the response body for ``url`` in kilobytes.

    Args:
        url: Address to download.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        Body length in KB (bytes / 1024, float) when the status is 200,
        otherwise ``None`` (non-200 status or any ``requests`` error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Report network failures the same way as a non-200 status.
        return None
    if response.status_code != 200:
        return None
    return len(response.content) / 1024  # Return size in KB




# Accessibility checks



def check_alt_attributes(soup):
    """Count <img> tags with non-blank alt text.

    Returns a ``(images_with_alt, total_images)`` tuple.
    """
    all_images = soup.find_all('img')
    described = 0
    for image in all_images:
        # An alt attribute containing only whitespace does not count.
        if image.has_attr('alt') and image['alt'].strip():
            described += 1
    return described, len(all_images)

def check_form_labels(soup):
    """Count form controls that have an associated <label>.

    A control counts as labelled when a ``<label for=...>`` references its
    ``id`` or when it is nested inside a ``<label>`` element. Inputs of type
    hidden, submit, button, reset and image are left out of both numbers.

    The earlier version counted ``<label for>`` tags rather than controls, so
    the first number could exceed the second.

    Returns:
        A ``(labelled_controls, total_controls)`` tuple of ints, with
        ``labelled_controls <= total_controls``.
    """
    # Input types that are not user-editable fields needing a label.
    unlabelled_types = {'hidden', 'submit', 'button', 'reset', 'image'}

    label_targets = {label['for'] for label in soup.find_all('label') if label.get('for')}

    controls = [
        control
        for control in soup.find_all(['input', 'select', 'textarea'])
        if (control.get('type') or '').lower() not in unlabelled_types
    ]

    inputs_with_labels = sum(
        1
        for control in controls
        if control.get('id') in label_targets or control.find_parent('label') is not None
    )
    return inputs_with_labels, len(controls)

def check_headings_structure(soup):
    """Return a six-item list with the number of <h1> ... <h6> tags, in order."""
    counts = []
    for level in range(1, 7):
        tags_at_level = soup.find_all(f"h{level}")
        counts.append(len(tags_at_level))
    return counts

def check_aria_attributes(soup):
    """Return how many elements carry an ``aria-label`` attribute."""
    return len(soup.find_all(attrs={"aria-label": True}))

def analyze_accessibility(url, timeout=10):
    """Run the accessibility checks on ``url``, print a report, return the score.

    Four components contribute up to 25 points each: image alt text, form
    labels, presence of at least one <h1>, and ARIA labels (full marks at ten
    or more labelled elements).

    Args:
        url: Address of the page to analyse.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        The accessibility score (0-100, float), or ``None`` when the page
        could not be loaded.
    """
    print(f"Analyzing accessibility for {url}...\n")

    # Request the page content
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Failed to load the page.")
        return None
    if response.status_code != 200:
        print("Failed to load the page.")
        return None

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # 1. Check alt attributes in images
    alt_ok, total_images = check_alt_attributes(soup)
    print(f"Images with alt text: {alt_ok}/{total_images}")

    # 2. Check form labels
    labels_ok, total_inputs = check_form_labels(soup)
    print(f"Inputs with labels: {labels_ok}/{total_inputs}")

    # 3. Check heading structure (only the presence of an h1 is scored)
    heading_levels = check_headings_structure(soup)
    print(f"Heading structure: {[f'h{i+1}: {heading_levels[i]}' for i in range(6)]}")

    # 4. Check ARIA attributes
    aria_count = check_aria_attributes(soup)
    print(f"ARIA-labeled elements: {aria_count}")

    # Each ratio is clamped to 1 so no component can exceed its 25 points.
    # Previously only the total was capped, so many ARIA labels (e.g. 88 ->
    # 220 points) hid failures in the other three checks.
    alt_ratio = min(alt_ok / total_images, 1) if total_images > 0 else 1
    label_ratio = min(labels_ok / total_inputs, 1) if total_inputs > 0 else 1
    h1_present = 1 if heading_levels[0] > 0 else 0
    aria_ratio = min(aria_count / 10, 1)

    accessibility_score = (
        alt_ratio * 25 +
        label_ratio * 25 +
        h1_present * 25 +  # H1 check
        aria_ratio * 25
    )
    print(f"\nAccessibility Score: {accessibility_score:.2f}/100")
    return accessibility_score

# Run every check against one page and print the results.
url = "https://github.com/topics/seo"
print(get_website_data(url))
calculate_seo_score(url)
get_ping(url)
get_dns_response_time(url)
https_check = check_https(url)
print(f"HTTPS Enabled: {'Yes' if https_check else 'No'}")
# get_content_size returns None on failure; avoid printing "Nonekb" and
# round the size instead of dumping the raw float.
content_size = get_content_size(url)
print(f"size of the content {content_size:.2f}kb" if content_size is not None else "Failed to retrieve content size.")
ttfb = get_ttfb(url)
print(f"Time to First Byte (TTFB): {ttfb:.4f} seconds" if ttfb is not None else "Failed to retrieve TTFB.")
analyze_accessibility(url)

#additions

#1
import requests
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor

def check_link(url, link):
    """Check whether ``link`` (resolved against ``url``) is reachable.

    Args:
        url: Address of the page the link was found on; used as the base for
            relative links.
        link: The raw ``href`` value.

    Returns:
        The offending link when it is considered broken (HTTP status >= 400,
        a request error, or an href that cannot be parsed), otherwise
        ``None``. Links with a non-HTTP scheme such as ``mailto:`` or ``tel:``
        are skipped and return ``None``.
    """
    try:
        full_link = urljoin(url, link)
        scheme = urlparse(full_link).scheme
    except ValueError:
        # A malformed href used to escape as ValueError and abort the scan.
        return link

    # mailto:, tel:, javascript: and similar cannot be fetched over HTTP and
    # were previously reported as broken because requests rejected them.
    if scheme not in ('http', 'https'):
        return None

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        res = requests.head(full_link, headers=headers, timeout=5)
        if res.status_code in (405, 501):
            # The server does not support HEAD; retry with GET before
            # declaring the link broken. stream=True skips the body.
            res = requests.get(full_link, headers=headers, timeout=5, stream=True)
            res.close()
        if res.status_code >= 400:
            return full_link
    except requests.exceptions.RequestException:
        return full_link
    return None

def check_broken_links(url, timeout=10):
    """Find links on ``url`` that appear to be broken.

    Every distinct ``href`` on the page is checked with ``check_link`` using a
    pool of ten worker threads. The number of broken links and each broken
    link are printed.

    Args:
        url: Address of the page to scan.
        timeout: Seconds handed to ``requests.get`` for the page fetch.

    Returns:
        A list of the broken links, or an empty list when the page itself
        could not be retrieved.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Failed to retrieve page content.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # dict.fromkeys drops repeated hrefs while keeping document order, so the
    # same target is neither requested nor reported more than once.
    links = list(dict.fromkeys(a.get('href') for a in soup.find_all('a', href=True)))

    with ThreadPoolExecutor(max_workers=10) as executor:
        results = executor.map(lambda link: check_link(url, link), links)
        broken_links = [link for link in results if link]

    print(f"Broken links found: {len(broken_links)}")
    for bl in broken_links:
        print(bl)
    return broken_links
print(f"Checking broken links for {url}...\n")

broken_links = check_broken_links(url)

if broken_links:
    print("\nBroken links summary:")
    for link in broken_links:
        print(f"Broken link: {link}")
else:
    # This branch belongs to the `if`. It was previously attached to the
    # `for` loop, so the message printed after listing broken links and
    # never when the list was empty.
    print("No broken links found.")

#2
def check_schema_markup(url, timeout=10):
    """Print any JSON-LD schema markup found on ``url``.

    Looks for ``<script type="application/ld+json">`` elements and prints the
    text of each one, or a message when none are present or the page cannot
    be retrieved. Nothing is returned.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds handed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the caller indefinitely.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Failed to retrieve page content.")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    schema_data = soup.find_all('script', attrs={'type': 'application/ld+json'})

    if not schema_data:
        print("No Schema Markup found")
        return

    print("Schema Markup Found")
    for schema in schema_data:
        print(schema.text)
# Run the schema-markup check on the page stored in the notebook-level `url`;
# check_schema_markup prints its own findings.
print(f"Checking schema markup for {url}...\n")
check_schema_markup(url)

#3
from collections import Counter
import re

def keyword_analysis(url, timeout=10):
    """Print and return the ten most frequent words in the text of ``url``.

    The page text is lower-cased and split into ``\\w+`` tokens; no stop-word
    filtering is applied.

    Args:
        url: Address of the page to analyse.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        A list of up to ten ``(word, count)`` tuples ordered from most to
        least frequent, or an empty list when the page cannot be retrieved.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Failed to retrieve page content.")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text().lower()
    words = re.findall(r'\b\w+\b', text)
    common_words = Counter(words).most_common(10)

    print("Most Common Keywords:")
    # The header used to be printed with nothing under it.
    for word, count in common_words:
        print(f"{word}: {count}")
    return common_words
# Run the keyword analysis on the notebook-level `url`; the returned list is
# not stored or printed here.
keyword_analysis(url)

import requests
from bs4 import BeautifulSoup
import re
from collections import Counter

def check_content_duplication(url, threshold=10, phrase_length=1, timeout=10):
    """Report phrases that repeat more than ``threshold`` times on ``url``.

    The page text is lower-cased, split into ``\\w+`` tokens and grouped into
    overlapping phrases of ``phrase_length`` consecutive words. The repeated
    phrases and the resulting percentage are printed.

    Args:
        url: Address of the page to analyse.
        threshold: A phrase is "repeated" when it occurs more than this many
            times.
        phrase_length: Number of consecutive words per phrase; must be >= 1.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        The number of distinct repeated phrases divided by the total number
        of phrases, times 100. ``0`` when the page has no phrases; ``None``
        when the page cannot be retrieved.

    Raises:
        ValueError: If ``phrase_length`` is less than 1.
    """
    if phrase_length < 1:
        # A zero or negative length would only produce empty/garbage phrases.
        raise ValueError("phrase_length must be at least 1")

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Failed to retrieve page content.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text().lower()

    # Create phrases based on the specified phrase length
    words = re.findall(r'\b\w+\b', text)
    phrases = [' '.join(words[i:i + phrase_length]) for i in range(len(words) - phrase_length + 1)]
    word_count = len(phrases)

    repeated_phrases = [phrase for phrase, count in Counter(phrases).items() if count > threshold]
    print(f"Repeated phrases: {repeated_phrases}")

    duplication_percentage = (len(repeated_phrases) / word_count) * 100 if word_count > 0 else 0
    print(f"Content duplication percentage: {duplication_percentage:.2f}%")
    return duplication_percentage

# Ask for the analysis parameters; an empty (or blank) answer keeps the default.
threshold = int(input("Enter the repetition threshold (default 10): ").strip() or 10)
phrase_length = int(input("Enter the phrase length (default 1 for single words): ").strip() or 1)

duplication_percentage = check_content_duplication(url, threshold, phrase_length)
# check_content_duplication returns None when the page cannot be fetched;
# formatting None with :.2f would raise TypeError.
if duplication_percentage is None:
    print("Content duplication could not be calculated.")
else:
    print(f"Final content duplication percentage: {duplication_percentage:.2f}%")


{'title': 'seo · GitHub Topics · GitHub', 'meta_description': 'GitHub is where people build software. More than 100 million people use GitHub to discover, fork, and contribute to over 420 million projects.'}
SEO Score for https://github.com/topics/seo: 89.11/100
Ping to github.com: 8.35 ms
Resolved IP addresses for github.com: ['140.82.116.4']
DNS Response Time for github.com: 0.0030 seconds
HTTPS Enabled: Yes
size of the content 478.599609375kb
Time to First Byte (TTFB): 0.0339 seconds
Analyzing accessibility for https://github.com/topics/seo...

Images with alt text: 3/3
Inputs with labels: 4/11
Heading structure: ['h1: 4', 'h2: 5', 'h3: 21', 'h4: 0', 'h5: 0', 'h6: 0']
ARIA-labeled elements: 88

Accessibility Score: 100.00/100
Checking broken links for https://github.com/topics/seo...

Broken links found: 12
https://github.com/vuesion/vuesion/pulls
https://github.com/topics/best-practises
https://github.com/kpumuk/meta-tags/issues
https://github.com/kpumuk/meta-tags/pulls
https://git

In [None]:
!pip install ping3



In [None]:
!pip install dnspython




In [None]:
# Second run of the checks, this time against a Colab notebook URL.
# Rebinding `url` here also changes the value seen by any later cell.
url = "https://colab.research.google.com/drive/1F_-mWLrNWXrRQIWjs40M9Qpg5v-h3CxC#scrollTo=61tll6yVWiu8"

# Fetch the title and meta description and show the resulting dict
website_data = get_website_data(url)
print(website_data)

# Calculate SEO score (the function prints the score itself)
seo_score = calculate_seo_score(url)

# Check ping time (the function prints the result itself)
ping_time = get_ping(url)

# Check DNS response time (the function prints the result itself)
dns_time = get_dns_response_time(url)

# Check HTTPS -- the result is stored but not printed in this cell
https_check = check_https(url)

# Get content size -- the result is stored but not printed in this cell
content_size = get_content_size(url)

# Check Time to First Byte (TTFB) -- stored but not printed in this cell
ttfb = get_ttfb(url)

# Analyze accessibility (prints its own report)
analyze_accessibility(url)

# Check broken links (prints the count and each broken link)
broken_links = check_broken_links(url)

# Check schema markup (prints its own findings)
check_schema_markup(url)


{'title': 'Google Colab', 'meta_description': 'Meta description not found'}
SEO Score for https://colab.research.google.com/drive/1F_-mWLrNWXrRQIWjs40M9Qpg5v-h3CxC#scrollTo=61tll6yVWiu8: 76.86/100
Ping to colab.research.google.com: 1.72 ms
Resolved IP addresses for colab.research.google.com: ['74.125.135.138', '74.125.135.100', '74.125.135.102', '74.125.135.101', '74.125.135.139', '74.125.135.113']
DNS Response Time for colab.research.google.com: 0.0035 seconds
Analyzing accessibility for https://colab.research.google.com/drive/1F_-mWLrNWXrRQIWjs40M9Qpg5v-h3CxC#scrollTo=61tll6yVWiu8...

Images with alt text: 0/0
Inputs with labels: 0/0
Heading structure: ['h1: 0', 'h2: 0', 'h3: 0', 'h4: 0', 'h5: 0', 'h6: 0']
ARIA-labeled elements: 1

Accessibility Score: 52.50/100
Broken links found: 0
No Schema Markup found
