Import necessary packages

In [None]:
import requests
import json
import urllib
from urllib.parse import urlparse, urljoin, quote
from requests_html import HTML, HTMLSession
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import date
import time
import pandas as pd
import numpy as np
import whois
import datetime
import warnings

Read selected keywords to collect URL and URL information

In [None]:
# Load the keyword sheet; the scraping loop later in this notebook reads its 'Keyword' column.
data = pd.read_excel('Final_kw.xlsx')
# Preview the first rows as the cell output.
data.head()

Function to process keyword search in Google incognito mode

In [None]:
def incognito(keyword):
    """Build the Google search URL for ``keyword``.

    The keyword is converted to ``str`` and percent-encoded, then the fixed
    query parameters ``pws=0``, ``gl=uk`` and ``gws_rd=cr`` are appended.
    """
    encoded_query = urllib.parse.quote(str(keyword))
    return f'https://www.google.com/search?q={encoded_query}&pws=0&gl=uk&gws_rd=cr'

Function to extract age of domain

In [None]:
def domain_age(url):
    """Return the age of a domain in whole days, or ``None`` when unknown.

    The WHOIS record for ``url`` is looked up and its creation date is
    subtracted from the current time.

    Args:
        url: Domain name or URL handed to ``whois.whois``.

    Returns:
        Number of days since the creation date as an ``int``, or ``None``
        when the lookup fails, the record has no status, or no usable
        creation date is present.
    """
    try:
        record = whois.whois(url)
        if record.status is None:
            return None
        created = record.creation_date
    except Exception:
        # Best-effort metric: any lookup or parsing failure means "unknown".
        return None

    # Some records carry several creation dates; the first one is used.
    if isinstance(created, (list, tuple)):
        created = created[0] if created else None
    if not isinstance(created, datetime.datetime):
        return None

    # Use the creation date's own timezone for "now" so that aware and naive
    # datetimes are never mixed (mixing them raises TypeError).
    now = datetime.datetime.now(created.tzinfo)
    return (now - created).days

Function to extract the number of meta tags and the page, total and HTML sizes

In [None]:
def info(url):
    """Collect size and meta-tag figures for a page using a Chrome session.

    Args:
        url: Address of the page to load.

    Returns:
        dict with the keys:
            'page_size'  - UTF-8 size of the page source in KB,
            'meta_tag'   - number of <meta> elements,
            'total_size' - length of the page source plus the length of every
                           resource body that could be fetched (characters),
            'html_size'  - length of the page source (characters).

    Raises:
        selenium TimeoutException if no <meta> element appears within
        10 seconds; the browser is still shut down in that case.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Page size, measured on the source as first delivered.
        page_size = len(driver.page_source.encode('utf-8')) / 1024

        # Meta tags
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'meta')))
        num_meta_tags = len(driver.find_elements(By.TAG_NAME, 'meta'))

        # Resources referenced by the page
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        resource_urls = []
        resource_tags = ['img', 'script', 'link', 'audio', 'video', 'iframe',
                         'object', 'embed', 'form', 'canvas']
        for element in soup.find_all(resource_tags):
            # 'src' takes precedence over 'href' when an element has both.
            if 'src' in element.attrs:
                target = element['src']
            elif 'href' in element.attrs:
                target = element['href']
            else:
                continue
            # Only absolute http(s) references are measured; relative ones are skipped.
            if target.startswith('http'):
                resource_urls.append(target)

        html_size = len(html)
        total_size = html_size
        # The URL is passed as a script argument instead of being pasted into
        # the JavaScript text, so quotes or backslashes in it cannot break or
        # alter the script.
        fetch_script = """
            var xhr = new XMLHttpRequest();
            xhr.open('GET', arguments[0], false);
            xhr.send(null);
            return xhr.responseText;
        """
        for resource_url in resource_urls:
            try:
                resource_content = driver.execute_script(fetch_script, resource_url)
                total_size += len(resource_content)
            except Exception:
                # Best-effort: a resource that cannot be fetched is left out of the total.
                continue

        return {
            'page_size': page_size,
            'meta_tag': num_meta_tags,
            'total_size': total_size,
            'html_size': html_size
        }
    finally:
        # quit() ends the whole driver session, even when a step above raised.
        driver.quit()

Function to extract page size

In [None]:
def pagesize_KB(url):
    """Return the size of a page's HTML source in KB.

    The page is loaded in a Chrome session and the UTF-8 encoded length of
    ``driver.page_source`` is divided by 1024.

    Args:
        url: Address of the page to load.

    Returns:
        Size of the page source in KB as a float.
    """
    # Ignore DeprecationWarning
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return len(driver.page_source.encode('utf-8')) / 1024
    finally:
        # quit() ends the whole driver session, even when loading the page raised.
        driver.quit()

Function to extract number of links

In [None]:
def _tally_links(driver, domain):
    """Count the anchors of the current page as (total, internal, external)."""
    total = internal = external = 0
    for anchor in driver.find_elements(By.TAG_NAME, 'a'):
        href = anchor.get_attribute('href')
        if href is None or href == 'javascript:void(0)':
            continue
        total += 1
        if not href.startswith('http'):
            # Non-http targets count towards the total only.
            continue
        if domain in href:
            internal += 1
        else:
            external += 1
    return total, internal, external


def internal_external_link_count(url):
    """Count the links on a page, split into internal and external ones.

    A link is internal when its href starts with 'http' and contains the
    network location of ``url``; other 'http' links are external. Anchors
    without an href, or with 'javascript:void(0)', are ignored.

    Args:
        url: Address of the page to load.

    Returns:
        Tuple ``(total_link, internal_link, external_link)``.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    domain = urlparse(url).netloc
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        try:
            return _tally_links(driver, domain)
        except Exception:
            # Reading the anchors failed part-way; wait until a link is usable
            # and recount from zero so a partial first pass is not counted twice.
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.TAG_NAME, "a")))
            return _tally_links(driver, domain)
    finally:
        # quit() ends the whole driver session, even when counting raised.
        driver.quit()

Function to extract meta tag number

In [None]:
def meta_count(url):
    """Return the number of <meta> elements on a page.

    Args:
        url: Address of the page to load.

    Returns:
        Count of <meta> elements found once at least one is present.

    Raises:
        selenium TimeoutException if no <meta> element appears within
        10 seconds; the browser is still shut down in that case.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'meta')))
        return len(driver.find_elements(By.TAG_NAME, 'meta'))
    finally:
        # quit() ends the whole driver session, even when the wait timed out.
        driver.quit()

Function to check if URL is https

In [None]:
def is_secure(url):
    """Return 1 when ``url`` begins with ``https://`` and 0 otherwise."""
    return 1 if url.startswith('https://') else 0

Function to extract total size and html size

In [None]:
def size_measure(url):
    """Measure the HTML size of a page and the combined size of its resources.

    The page source is captured with a Chrome session; every absolute http(s)
    'src' or 'href' found on resource-bearing elements is then downloaded
    with ``requests``.

    Args:
        url: Address of the page to load.

    Returns:
        Tuple ``(total_size, html_size)``. ``html_size`` is the length of the
        page source in characters; ``total_size`` adds the byte length of
        every resource that could be downloaded.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # The browser is only needed for the page source; release it before
        # the downloads start and also when loading failed.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    resource_urls = []
    resource_tags = ['img', 'script', 'link', 'audio', 'video', 'iframe',
                     'object', 'embed', 'form', 'canvas']
    for element in soup.find_all(resource_tags):
        # 'src' takes precedence over 'href' when an element has both.
        if 'src' in element.attrs:
            target = element['src']
        elif 'href' in element.attrs:
            target = element['href']
        else:
            continue
        # Only absolute http(s) references are measured; relative ones are skipped.
        if target.startswith('http'):
            resource_urls.append(target)

    html_size = len(html)
    total_size = html_size
    for resource_url in resource_urls:
        try:
            resource_response = requests.get(resource_url, timeout=30)
        except requests.RequestException:
            # Second attempt with an explicit Accept-Encoding header.
            try:
                resource_response = requests.get(
                    resource_url,
                    headers={'Accept-Encoding': 'gzip, deflate'},
                    timeout=30,
                )
            except requests.RequestException:
                # Best-effort: an unreachable resource is left out of the total.
                continue
        total_size += len(resource_response.content)
    return total_size, html_size

Function to extract content download time and response time

In [None]:
def response_time_measure(url):
    """Time a GET request to ``url``.

    Args:
        url: Address to request (30 second timeout).

    Returns:
        Tuple ``(content_download_time, response_time)`` in seconds:
            content_download_time - ``Response.elapsed`` reported by requests,
            response_time         - time measured around the whole
                                    ``requests.get`` call.
        ``(0, 0)`` is returned when the request fails.

    NOTE(review): according to the requests documentation ``elapsed`` stops
    when the response headers have been parsed, while the timing around
    ``requests.get`` also includes reading the body, so the two names look
    swapped. They are left as they are to keep the output columns stable;
    confirm before interpreting them.
    """
    try:
        # perf_counter is monotonic, so clock adjustments cannot skew the interval.
        started = time.perf_counter()
        response = requests.get(url, timeout=30)
        response_time = time.perf_counter() - started
        content_download_time = response.elapsed.total_seconds()
    except Exception:
        # Best-effort metric: a failed request is reported as zeros.
        # (Exception rather than a bare except, so Ctrl-C still interrupts.)
        return 0, 0
    return content_download_time, response_time

Function to extract Page Experience metrics

In [None]:
class cwv_measure():
    """Fetch a PageSpeed Insights report for a URL and flatten it to one dict.

    Args:
        url: Page to analyse.
        key: PageSpeed Insights API key.
        strategy: Value of the ``strategy`` query parameter ('desktop' by default).

    NOTE(review): an API key is committed as the default value of ``key``.
    It is kept so existing calls keep working, but it should be rotated and
    supplied from configuration instead of source code.
    """

    _ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
    # Upper bound for one API call, so a stalled request cannot hang the run.
    _TIMEOUT_SECONDS = 120

    # (output key, Lighthouse audit id) for audits exported as score * 100.
    _SCORE_AUDITS = (
        ('speed_index', 'speed-index'),
        ('first_meaningful_paint_index', 'first-meaningful-paint'),
        ('first_contentful_paint_index', 'first-contentful-paint'),
        ('time_to_interactive_index', 'interactive'),
        ('total_blocking_time_index', 'total-blocking-time'),
        ('largest_contentful_paint_index', 'largest-contentful-paint'),
        ('first_input_delay_index', 'max-potential-fid'),
        ('cumulative_layout_shift_index', 'cumulative-layout-shift'),
        # Serve static assets with an efficient cache policy
        ('uses_long_cache_ttl_index', 'uses-long-cache-ttl'),
        # Reduce unused JavaScript
        ('unused_javascript_index', 'unused-javascript'),
    )

    # (output key, Lighthouse audit id) for audits exported as their numericValue.
    _VALUE_AUDITS = (
        ('speed', 'speed-index'),
        ('time_to_interactive', 'interactive'),
        ('total_byte_weight', 'total-byte-weight'),
        ('largest_contentful_paint', 'largest-contentful-paint'),
        ('first_input_delay', 'max-potential-fid'),
        ('cumulative_layout_shift', 'cumulative-layout-shift'),
        ('first_meaningful_paint', 'first-meaningful-paint'),
        ('first_contentful_paint', 'first-contentful-paint'),
        ('total_blocking_time', 'total-blocking-time'),
        ('server_response_time', 'server-response-time'),
    )

    # Fields copied from the first item of the 'diagnostics' audit details.
    _DIAGNOSTIC_FIELDS = (
        'numTasks', 'maxRtt', 'mainDocumentTransferSize', 'numScripts',
        'totalTaskTime', 'numTasksOver500ms', 'numTasksOver100ms',
        'numTasksOver50ms', 'numTasksOver25ms', 'numTasksOver10ms',
        'numRequests', 'numStylesheets',
    )

    # numericValue audits that come after the diagnostics in the output.
    _TRAILING_VALUE_AUDITS = (
        ('uses_long_cache_ttl', 'uses-long-cache-ttl'),
        ('unused_javascript', 'unused-javascript'),
    )

    def __init__(self, url, key="AIzaSyA_QVKsQY9IQWqZSpVj3rTwtnWxzSzTIUQ", strategy='desktop'):
        self.url = url
        self.key = key
        self.strategy = strategy

    def request_url(self):
        """Return the full API request URL with every parameter percent-encoded.

        Encoding matters for ``url``: an unescaped '&', '?' or '#' inside the
        page address would otherwise be read as part of the API query string.
        """
        from urllib.parse import urlencode

        query_string = urlencode([
            ('strategy', self.strategy),
            ('url', self.url),
            ('key', self.key),
        ])
        return self._ENDPOINT + "?" + query_string

    def query(self):
        """Call the API and return the decoded JSON report as a dict.

        Raises:
            urllib.error.URLError (including HTTPError) when the request fails.
        """
        # Imported here so the submodule is guaranteed to be loaded;
        # a plain ``import urllib`` does not load ``urllib.request``.
        import urllib.request

        with urllib.request.urlopen(self.request_url(), timeout=self._TIMEOUT_SECONDS) as response:
            payload = response.read().decode('UTF-8')
        return json.loads(payload)

    @staticmethod
    def _scaled_score(section):
        """Return ``section['score'] * 100``, or None when the score is absent or null."""
        score = (section or {}).get('score')
        return None if score is None else score * 100

    @staticmethod
    def _numeric_value(section):
        """Return ``section['numericValue']``, or None when it is absent."""
        return (section or {}).get('numericValue')

    @classmethod
    def parse_report(cls, report):
        """Flatten a decoded PageSpeed report into the metrics dict.

        Audits that are missing from the report, or whose score is null,
        yield ``None`` for their entries instead of raising, so one
        unavailable audit does not discard the whole measurement.

        Args:
            report: dict as returned by ``query()``.

        Returns:
            dict with the report metadata, the 0-100 scores, the numeric
            values and the diagnostics counters, in a fixed key order.

        Raises:
            KeyError: if ``report`` has no 'lighthouseResult' entry.
        """
        lighthouse = report['lighthouseResult']
        audits = lighthouse.get('audits') or {}
        performance = (lighthouse.get('categories') or {}).get('performance')

        diagnostic_details = (audits.get('diagnostics') or {}).get('details') or {}
        diagnostic_items = diagnostic_details.get('items') or [{}]
        diagnostics = diagnostic_items[0]

        data = {
            'final_url': lighthouse.get('finalUrl'),
            'fetch_time': lighthouse.get('fetchTime'),
            'form_factor': (lighthouse.get('configSettings') or {}).get('formFactor'),
            'overall_score': cls._scaled_score(performance),
        }
        for column, audit_id in cls._SCORE_AUDITS:
            data[column] = cls._scaled_score(audits.get(audit_id))
        for column, audit_id in cls._VALUE_AUDITS:
            data[column] = cls._numeric_value(audits.get(audit_id))
        for field in cls._DIAGNOSTIC_FIELDS:
            data[field] = diagnostics.get(field)
        for column, audit_id in cls._TRAILING_VALUE_AUDITS:
            data[column] = cls._numeric_value(audits.get(audit_id))
        return data

    def get_core_web_vitals(self):
        """Query the API for ``self.url`` and return the flattened metrics dict."""
        return self.parse_report(self.query())

Function to extract content-related metrics

In [None]:
def _element_text(element):
    """Return the visible text of ``element``, or '' when it cannot be read."""
    try:
        return element.text
    except Exception:
        return ''


def _element_attribute(element, name):
    """Return attribute ``name`` of ``element``, or None when absent or unreadable."""
    try:
        return element.get_attribute(name)
    except Exception:
        return None


def _heading_stats(elements, keyword):
    """Return (count, mean text length, number of elements containing ``keyword``)."""
    texts = [_element_text(element) for element in elements]
    count = len(elements)
    mean_length = sum(len(text) for text in texts) / count if count else 0
    needle = keyword.lower()
    keyword_hits = sum(1 for text in texts if needle in text.lower())
    return count, mean_length, keyword_hits


def html_content_crawl(url, keyword):
    """Extract content-related metrics of a page with respect to ``keyword``.

    The page is loaded in a Chrome session and its headings, paragraphs,
    anchors, the element with id 'footer', images, meta tags and title are
    inspected. Keyword matching is case-insensitive throughout; this
    includes the heading counters, which previously compared
    case-sensitively unlike every other check in this function.

    Args:
        url: Address of the page to load.
        keyword: Search phrase; the 'any_kw_*' flags test each
            whitespace-separated word of it individually.

    Returns:
        dict of counts, mean heading lengths and 0/1 flags; the keys are
        listed in the literal at the end of the function.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        needle = keyword.lower()
        words = [word.lower() for word in keyword.split()]

        heading1_cnt, heading1_length, h1_kw_cnt = _heading_stats(
            driver.find_elements(By.TAG_NAME, 'h1'), keyword)
        heading2_cnt, heading2_length, h2_kw_cnt = _heading_stats(
            driver.find_elements(By.TAG_NAME, 'h2'), keyword)
        heading3_cnt, heading3_length, h3_kw_cnt = _heading_stats(
            driver.find_elements(By.TAG_NAME, 'h3'), keyword)
        total_heading_cnt, total_heading_length, heading_kw_cnt = _heading_stats(
            driver.find_elements(By.XPATH, "//h1 | //h2 | //h3 | //h4 | //h5 | //h6"), keyword)

        # Occurrences of the whole keyword in the concatenated paragraph text.
        paragraphs_text = ''.join(
            _element_text(p) for p in driver.find_elements(By.TAG_NAME, 'p'))
        p_kw_cnt = paragraphs_text.lower().count(needle)

        anchor_text = ''.join(
            _element_text(a) for a in driver.find_elements(By.TAG_NAME, 'a')).lower()
        is_kw_in_anchor = 1 if needle in anchor_text else 0
        is_any_kw_in_anchor = 1 if any(word in anchor_text for word in words) else 0

        footer_text = ''.join(
            _element_text(f) for f in driver.find_elements(By.ID, 'footer')).lower()
        is_kw_in_footer = 1 if needle in footer_text else 0
        is_any_kw_in_footer = 1 if any(word in footer_text for word in words) else 0

        lowered_url = url.lower()
        is_kw_in_url = 1 if needle in lowered_url else 0
        is_any_kw_in_url = 1 if any(word in lowered_url for word in words) else 0

        # Amount of text: combined length of all heading and paragraph text.
        body_elements = driver.find_elements(
            By.XPATH, "//h1 | //h2 | //h3 | //h4 | //h5 | //h6 | //p")
        text_amount = len(''.join(_element_text(element) for element in body_elements))

        images = driver.find_elements(By.TAG_NAME, 'img')
        img_cnt = len(images)
        img_alt_kw = 0
        for image in images:
            alt_text = _element_attribute(image, 'alt')
            if alt_text is not None:
                img_alt_kw += alt_text.lower().count(needle)

        # Meta tags contribute the length of their 'name' and 'content' values.
        meta_length = 0
        meta_kw = 0
        for tag in driver.find_elements(By.TAG_NAME, 'meta'):
            for attribute in ('name', 'content'):
                value = _element_attribute(tag, attribute)
                if value is None:
                    continue
                meta_length += len(value)
                meta_kw += value.lower().count(needle)

        is_title_tag_used = 1 if driver.find_elements(By.TAG_NAME, 'title') else 0

        return {
            'amount_of_text': text_amount,
            'total_heading': total_heading_cnt,
            'total_heading_length': total_heading_length,
            'keyword_in_total_heading': heading_kw_cnt,
            'heading1': heading1_cnt,
            'heading1_length': heading1_length,
            'keyword_in_heading1': h1_kw_cnt,
            'heading2': heading2_cnt,
            'heading2_length': heading2_length,
            'keyword_in_heading2': h2_kw_cnt,
            'heading3': heading3_cnt,
            'heading3_length': heading3_length,
            'keyword_in_heading3': h3_kw_cnt,
            'total_img': img_cnt,
            'keyword_in_img_alt': img_alt_kw,
            'keyword_in_anchor': is_kw_in_anchor,
            'any_kw_in_anchor': is_any_kw_in_anchor,
            'keyword_in_footer': is_kw_in_footer,
            'any_kw_in_footer': is_any_kw_in_footer,
            'keyword_in_url': is_kw_in_url,
            'any_kw_in_url': is_any_kw_in_url,
            'keyword_in_body': p_kw_cnt,
            'meta_length': meta_length,
            'keyword_in_meta': meta_kw,
            'is_title_tag_used': is_title_tag_used
        }
    finally:
        # quit() ends the whole driver session, even when a step above raised.
        driver.quit()

Extract data using all of the functions above

In [None]:
# Keys of the PageSpeed result that are not exported to the final table.
CWV_EXCLUDED_KEYS = ('final_url', 'fetch_time', 'form_factor')

se_results = []

# Process one batch of keywords: fetch the Google result page for each, then
# measure every landing page listed on it.
for raw_keyword in data['Keyword'][600:800]:
    # Keywords may come out of the sheet as non-strings (e.g. numbers).
    keyword = str(raw_keyword)
    gg_srp = incognito(keyword)
    print(keyword, ': ', gg_srp)

    session = HTMLSession()
    try:
        rep = session.get(gg_srp)
        soup = BeautifulSoup(rep.text, 'lxml')
    except Exception as exc:
        # Report the failure instead of dropping the keyword silently.
        print(f'Skipping keyword {keyword!r}: {exc!r}')
        continue
    finally:
        session.close()

    # 'rank' is the position of the result block on the page, so a landing
    # page that fails to be measured does not shift the ranks after it.
    for rank, result_selector in enumerate(soup.select('div[class*="yuRUbf"]'), start=1):
        try:
            if result_selector['class'][0].startswith('v5yQqb'):
                result_type = 'SEM'
            else:
                result_type = 'Organic'
            domain_name = result_selector.cite.get_text().split(' › ')[0]
            link = result_selector.select('a')[0]['href']
            print(link)
            srp_title = result_selector.find('h3', class_='LC20lb').get_text()
            lowered_title = srp_title.lower()
            is_kw_in_srp_title = 1 if keyword.lower() in lowered_title else 0
            is_any_kw_in_srp_title = 1 if any(kw.lower() in lowered_title for kw in keyword.split()) else 0

            info_ = info(link)
            total_link, internal_link, external_link = internal_external_link_count(link)
            content_download_time, response_time = response_time_measure(link)
            cwv = cwv_measure(link).get_core_web_vitals()
            html_content = html_content_crawl(link, keyword=keyword)

            row = {
                'query_date': date.today().strftime('%Y%m%d'),
                'keyword': keyword,
                'gg_srp': gg_srp,
                'rank': rank,
                'result_type': result_type,
                'srp_title': srp_title,
                'srp_title_length': len(srp_title),
                'keyword_in_srp_title': is_kw_in_srp_title,
                'any_kw_in_srp_title': is_any_kw_in_srp_title,
                'domain_name': domain_name,
                # One WHOIS lookup per result, on the domain shown in the result.
                'domain_age': domain_age(domain_name),
                'LP_link': link,
                'LP_pagesize': info_['page_size'],
                'total_links': total_link,
                'internal_links': internal_link,
                'external_links': external_link,
                'meta_tag': info_['meta_tag'],
                'is_secure': is_secure(link),
                'content_download_time': content_download_time,
                'response_time': response_time,
                'content_size': info_['total_size'],
                'html_size': info_['html_size'],
            }
            # The PageSpeed and content metrics are exported under the same
            # names, and in the same order, as the dicts that produce them.
            row.update((key, value) for key, value in cwv.items() if key not in CWV_EXCLUDED_KEYS)
            row.update(html_content)
        except Exception as exc:
            # A single failing landing page is reported and skipped; the
            # remaining results of this keyword are still processed.
            print(f'Skipping result {rank} for {keyword!r}: {exc!r}')
            continue
        se_results.append(row)

Show data

In [None]:
# Turn the collected per-result dictionaries into one table, one row per dictionary.
result = pd.DataFrame(se_results)
# Display the table as the cell output.
result

Export data to csv

In [None]:
# Save the table for this batch; to_csv is called with its defaults, so the row index is written too.
result.to_csv('result_601-800.csv')