# SEO Analysis - Web Scraping

This SEO analyzer works is just based on some simple criteria that is and H1 should be present or some other headings all the images should have an ALT attribute and there should be a meta description and the website should have a title so

In [1]:
# import libraries 

from bs4 import BeautifulSoup
import requests
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt') 
from nltk.util import ngrams

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMF29\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AMF29\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def seo_analysis(url):
    # Save the good and the warnings in lists
    good = []
    bad = []
    
    # Send a GET request to the website
    response = requests.get(url)
    
    # Check the response status code
    if response.status_code != 200:
        print("Error: Unable to access the website.")
        return
    
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title and description
    title = soup.find('title').get_text()
    description = soup.find('meta', attrs={'name': 'description'})['content']
    
    
    # Check if the title and description exist
    if title:
        good.append("Title Exists! Great!")
    else:
        bad.append("Title does not exist! Add a Title")

    if description:
        good.append("Description Exists! Great!")
    else:
        bad.append("Description does not exist! Add a Meta Description")    
    
    # Grab the Headings
    hs = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    h_tags = []
    for h in soup.find_all(hs):
        good.append(f"{h.name}-->{h.text.strip()}")
        h_tags.append(h.name)

    if 'h1' not in h_tags:
        bad.append("No H1 found!")

    # Extract the images without Alt
    for i in soup.find_all('img', alt=''):
        bad.append(f"No Alt: {i}") 

    # Extract the images without Alt
    for i in soup.find_all('img', alt=''):
        bad.append(f"No Alt: {i}")    
        
    # Extract keywords
    # Grab the text from the body of html
    body = soup.find('body').text

    # Extract all the words in the body and lowercase them in a list
    words = [i.lower() for i in word_tokenize(body)]
    
    # Extract bigrams from the tokens (each two words that came together) 
    bigrams =  ngrams(words,2)
    freqBigrams = nltk.FreqDist(bigrams)
    Bigrams_keywords= freqBigrams.most_common(10)    

    # Grab a list of English stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    
    new_words = []
    # create a list that have the tokens which are not stopwords and are actual words (no punctuation) in a new list
    for i in words:
        if i not in stop_words and i.isalpha():
            new_words.append(i)

    # Extract the fequency of the words and get the 10 most common ones
    freq = nltk.FreqDist(new_words)
    keywords= freq.most_common(10)

    # Print the results
    print("Keywords: ", keywords)
    print(" ")
    print("Bigrams Keywords: ", Bigrams_keywords)
    print(" ")
    print("The Good: ", good)
    print(" ")
    print("The Bad: ", bad)

### Example form Random Medium Articles

In [3]:
# Call the function to see the results
seo_analysis("https://medium.com/better-marketing/how-to-track-seo-content-performance-and-automate-your-seo-reports-for-free-57b95ba09026")

Keywords:  [('seo', 4), ('upsign', 2), ('content', 2), ('performance', 2), ('people', 2), ('open', 1), ('appsign', 1), ('inwritesign', 1), ('storyhow', 1), ('track', 1)]
 
Bigrams Keywords:  [(('seo', '&'), 2), (('&', 'content'), 2), (('open', 'in'), 1), (('in', 'appsign'), 1), (('appsign', 'upsign'), 1), (('upsign', 'inwritesign'), 1), (('inwritesign', 'upsign'), 1), (('upsign', 'inmember-only'), 1), (('inmember-only', 'storyhow'), 1), (('storyhow', 'to'), 1)]
 
The Good:  ['Title Exists! Great!', 'Description Exists! Great!', 'h1-->How to Track SEO & Content Performance, and Automate Your SEO Reports for Free', 'h2-->Particularly useful for measuring SEO & content optimization efforts', 'h2-->Written by Yiqian']
 
The Bad:  ['No Alt: <img alt="" class="l fc bx by bz cw" height="32" loading="lazy" role="presentation" src="https://miro.medium.com/v2/resize:fill:64:64/1*dmbNkD5D-u45r44go_cf0g.png" width="32"/>', 'No Alt: <img alt="" class="bg mw oc c" height="467" loading="eager" role="

### Example form National Care Group

In [4]:
# Call the function to see the results
seo_analysis("https://national-g.com/en/articles/98/%D8%AF%D9%84%D9%8A%D9%84%D9%83-%D8%A7%D9%84%D9%83%D8%A7%D9%85%D9%84-%D9%84%D9%84%D8%AA%D8%B9%D8%B1%D9%81-%D8%B9%D9%84%D9%89-%D8%B3%D9%84%D8%B3-%D8%A7%D9%84%D8%A8%D9%88%D9%84-%D9%88%D8%B7%D8%B1%D9%82-%D8%B9%D9%84%D8%A7%D8%AC%D9%87")

Keywords:  [('incontinence', 75), ('urinary', 52), ('skin', 28), ('diaper', 26), ('bladder', 25), ('urine', 23), ('leakage', 20), ('care', 19), ('soft', 19), ('comfortable', 19)]
 
Bigrams Keywords:  [(('.', '-'), 51), (('urinary', 'incontinence'), 47), (('of', 'the'), 22), (('the', 'bladder'), 20), ((',', 'and'), 20), (('to', 'the'), 19), (('the', 'diaper'), 19), ((',', 'which'), 18), (('the', 'skin'), 18), (('of', 'urinary'), 16)]
 
The Good:  ['Title Exists! Great!', 'h4-->Your Complete Guide to Understanding Urinary Incontinence and Its Treatment Methods', 'h4-->Your Complete Guide to Understanding Urinary Incontinence and Its Treatment Methods', 'h1-->Urinary incontinence is a disease that refers to the inability to control or control the bladder, which leads to leakage of urine, although the likelihood of developing it increases with age; however, it is not related to a specific age, gender, or even the stage of aging.', 'h2-->What is the reason for the occurrence of urinary inco