notes:
https://myxalandri.gr/sitemap.xml

In [71]:
from bs4 import BeautifulSoup
import requests
import re
import json
import os
import shutil
import zipfile

### Create directories

In [79]:
urls = []
site_maps_file = 'sitemaps.txt'

# choice to delete or not delete the existing directories
delete_existing_directories = False


# Check if the file exists
if os.path.exists(site_maps_file):
    # Read one URL per line. Surrounding whitespace (including the trailing
    # newline) is stripped and blank lines are dropped, so an empty line in
    # the file never reaches the domain regex below.
    with open(site_maps_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
else:
    # Warn that the file does not exist
    print('File does not exist')


for url in urls:
    # Extract the domain name and top-level domain (TLD) from the URL.
    # re.search returns None when the line is not an https:// URL followed by
    # a path, so the match is checked before .group() is called.
    domain_match = re.search(r'(?<=https://)(.*?)(?=/)', url)
    if domain_match is None or not domain_match.group(0):
        print(f'Skipping line without a recognisable https:// URL: {url}')
        continue
    domain_dot_tld = domain_match.group(0)

    # Check if the directory exists (directory name contains the domain name and TLD)
    if not os.path.exists(domain_dot_tld):
        print(f'Creating new directory {domain_dot_tld}')
        os.makedirs(domain_dot_tld)
    elif delete_existing_directories:
        print(f'Directory {domain_dot_tld} already exists. Deleting it and creating a new one')
        # Delete the directory and create a new one
        shutil.rmtree(domain_dot_tld)
        os.makedirs(domain_dot_tld)
    else:
        print(f'Directory {domain_dot_tld} already exists')

    # create 3 directories under the domain_dot_tld directory: html_files, txt_files,  metadata_files
    html_files = os.path.join(domain_dot_tld, 'html_files')
    txt_files = os.path.join(domain_dot_tld, 'txt_files')
    metadata_files = os.path.join(domain_dot_tld, 'metadata_files')

    # Create each sub-directory that is missing and report what happened
    for sub_directory in (html_files, txt_files, metadata_files):
        if not os.path.exists(sub_directory):
            print(f'Creating new directory {sub_directory}')
            os.makedirs(sub_directory)
        else:
            print(f'Directory {sub_directory} already exists')
    
    


Creating new directory myxalandri.gr
Creating new directory myxalandri.gr/html_files
Creating new directory myxalandri.gr/txt_files
Creating new directory myxalandri.gr/metadata_files


### Filter out the index pages and extract the relevant information from the articles 
Note: this section takes around 8-9 minutes to execute.

In [132]:
def is_index_page(soup, tag):
    """
    Report whether a parsed page looks like an index page.

    A page that carries at least one <meta name="author"> tag is treated as
    an article; any other page is treated as an index page. The URL read
    from `tag.text` and the verdict are printed as a side effect.

    Returns 1 for an index page and 0 for an article page.
    """
    author_metas = soup.find_all('meta', attrs={'name': 'author'})
    has_author = bool(author_metas)

    print(f'Website: {tag.text}')
    if not has_author:
        print("This is an index page. No meta tags with name author")
    print('--' * 80)

    # 1 -> index page, 0 -> article page
    return 0 if has_author else 1


def extract_html(html_document, domain_dot_tld, idx_file):
    """
    Save the text of a fetched page as `<domain_dot_tld>/html_files/<idx_file>.html`.

    Parameters:
        html_document: object exposing the page source as a `.text` string.
        domain_dot_tld: directory that contains the `html_files` sub-directory.
        idx_file: value converted with `str` and used as the file name.

    The `html_files` directory must already exist; it is not created here.
    """
    html_path = domain_dot_tld + "/html_files/" + str(idx_file) + '.html'

    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent non-ASCII page text. The `with` block closes the file.
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_document.text)

# def extract_headline_and_description(soup, domain_dot_tld, idx_file):
#     """
#     Extract the headline and description from the JSON data
#     """
    
#     script_tags = soup.find_all('script', attrs={'data-type': 'gsd'})

#     # Extract the headline and description from each script tag
#     for script_tag in script_tags:
#         # Extract the JSON data
#         json_data = script_tag.string.strip()

#         # Parse the JSON data
#         data = json.loads(json_data)

#         # Extract the headline and description using the get() method
#         headline = data.get('headline')
#         description = data.get('description')

#         # Print the extracted values
#         if headline is not None and description is not None:
#             with open(domain_dot_tld + "/txt_files/" +  str(idx_file) + '.txt', 'w') as f:
#                 f.write('Title:'+ '\n' + headline + '\n')
#                 f.write('Description:' + '\n' + description + '\n')
#                 f.close()



### Alternative method to extract headline and description (doesn't work yet, and probably not optimal for this website) ### 
# def extract_headline_and_description(soup, domain_dot_tld, idx_file):
#     """
#     Extract the headline and description from the JSON data
#     """

    
#     h1_tags = soup.find_all('h1', class_='uk-article-title uk-margin-top-remove')
#     # extracted_h1_content = [tag.get_text() for tag in h1_tags]

#     # p_tags = soup.find_all('p')
#     p_tags = soup.find_all('span')
#     # extracted_p_content = [tag.find('span', style=re.compile(r'font-size: medium;')).get_text() for tag in p_tags]

#     for h1_tag in h1_tags:
#         print(h1_tag.get_text())

#     for p_tag in p_tags:
#         print(p_tag.get_text())
#         # print(p_tag)


def extract_metadata(soup, domain_dot_tld, idx_file):
    """
    Write the named <meta> tags of a page to a JSON `.meta` file.

    Parameters:
        soup: parsed document providing `find_all('meta')`.
        domain_dot_tld: directory that contains the `metadata_files` sub-directory.
        idx_file: value converted with `str` and used as the file name.

    Only tags that have both a non-empty `name` and a non-empty `content`
    attribute are kept; when a name repeats, the last tag wins. The output
    goes to `<domain_dot_tld>/metadata_files/<idx_file>.meta`.
    """
    # Collect name -> content for every usable <meta> tag
    metadata = {}
    for meta_tag in soup.find_all('meta'):
        name = meta_tag.get('name')
        content = meta_tag.get('content')
        if name and content:
            metadata[name] = content

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly instead of with the platform default encoding.
    meta_file = domain_dot_tld + "/metadata_files/" + str(idx_file) + '.meta'
    with open(meta_file, 'w', encoding='utf-8') as f:
        f.write(json.dumps(metadata, indent=4, ensure_ascii=False))

    

In [133]:
idx_article_page = 0
idx_index_page = 0

# NOTE(review): `xml` is not assigned anywhere in the visible notebook; it is
# presumably a list of sitemap URLs left over from an earlier kernel
# session -- TODO confirm and define it explicitly.
sitemap_url = xml[1]

# Retrieve the sitemap.xml file
xml_document = requests.get(sitemap_url)
# Parse the XML document
soup = BeautifulSoup(xml_document.text, 'html.parser')
# Find all the <loc> tags
loc_tags = soup.find_all('loc')

# The output directory depends only on the sitemap URL, so the domain name
# and TLD are extracted once instead of on every loop iteration.
domain_dot_tld = re.search(r'(?<=https://)(.*?)(?=/)', sitemap_url).group(0)


# Loop over each tag and extract the data
for tag in loc_tags:
    page_url = tag.text

    # if url is front page, skip (checked before downloading, so no request
    # is spent on a page that is discarded anyway)
    if page_url.split('/')[-1] == '':
        continue

    # Retrieve the HTML document using the get() method of the requests module
    html_document = requests.get(page_url)

    # Create a BeautifulSoup object for parsing the HTML document
    soup = BeautifulSoup(html_document.text, 'html.parser')

    # if url is index page, skip
    if is_index_page(soup, tag):
        idx_index_page += 1
        continue

    # Extracting phase
    extract_html(html_document, domain_dot_tld, idx_article_page)

    # NOTE(review): the only three-argument extract_headline_and_description
    # visible in this notebook is commented out; this call needs that
    # definition to be active.
    extract_headline_and_description(soup, domain_dot_tld, idx_article_page)

    extract_metadata(soup, domain_dot_tld, idx_article_page)

    idx_article_page += 1

# Show information about the files extracted
total_pages = idx_article_page + idx_index_page
print('###' * 30)
print(f'Number of all pages: {total_pages}')
print(f'Number of article pages: {idx_article_page}')
print(f'Number of index pages: {idx_index_page}')
# The share of index pages is taken over all processed pages; the guard
# avoids a ZeroDivisionError when nothing was processed.
if total_pages:
    print(f'Percentage of index pages: {round(idx_index_page/total_pages*100, 2)}%')
else:
    print('Percentage of index pages: n/a (no pages processed)')


Website: https://myxalandri.gr/eidiseis
This is an index page. No meta tags with name author
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Website: https://myxalandri.gr/athlitismos
This is an index page. No meta tags with name author
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Website: https://myxalandri.gr/politismos
This is an index page. No meta tags with name author
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Website: https://myxalandri.gr/apopseis
This is an index page. No meta tags with name author
--------------------------------------------------------------------------------------------------------------------------------------------

KeyboardInterrupt: 

In [68]:
import zipfile
import os

# List of directories to be included in the zip file
# NOTE(review): `directory_list` is not assigned anywhere in the visible
# notebook; it has to be defined before this cell can run.


# Name of the output zip file
zip_file_name = "mydirectories.zip"

# Create a new zip file and open it in write mode
with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
    # Add each directory and its contents to the zip file
    for directory_name in directory_list:
        # Get the absolute path of the directory
        directory_path = os.path.abspath(directory_name)

        # Archive names are taken relative to the directory's parent so every
        # entry keeps its top-level directory name. Relative to the directory
        # itself, files with the same relative path in different directories
        # would end up under the same name in the archive.
        archive_root = os.path.dirname(directory_path)

        # Iterate over all the files and folders in the directory
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                # Get the absolute path of each file
                file_path = os.path.join(root, file)

                # Add the file to the zip file using the relative path
                zip_file.write(file_path, os.path.relpath(file_path, archive_root))

print("Zip file created successfully: ", zip_file_name)


Zip file created successfully:  mydirectories.zip


----

In [38]:
def extract_websites_from_sitemap(url_to_scrape):
    """
    Collect the names of the first 50 sitemap entries that can be fetched.

    Each of the first 50 <loc> entries is requested. For a 200 response the
    last path segment of its URL is recorded; otherwise a failure message is
    printed. Nothing is written to disk.

    Returns the list of recorded names.
    """
    sitemap_response = requests.get(url_to_scrape)
    sitemap_soup = BeautifulSoup(sitemap_response.text, 'html.parser')

    reachable_names = []
    for entry in sitemap_soup.find_all('loc')[:50]:
        page_url = entry.text
        page_response = requests.get(page_url)

        # Anything other than HTTP 200 is reported and skipped
        if page_response.status_code != 200:
            print(f"Failed to retrieve HTML document for website: {page_url}")
            continue

        # The text after the final '/' serves as the page name
        reachable_names.append(page_url.split('/')[-1])

    return reachable_names


# Sitemap to read the page URLs from
url_to_scrape = "https://myxalandri.gr/sitemap.xml"
# Keep the returned list in `websites`
websites = extract_websites_from_sitemap(url_to_scrape)
# Show the collected list
print(websites)


Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/18600-halbike-i-halari-podilatovolta-tou-xalandriou-stis-9-9
Failed to retrieve HTML document for website: https://myxalandri.gr/out-xalandri/18354-pame-sinema-sto-xalandri-3-9-9-2020
['', 'eidiseis', 'athlitismos', 'politismos', 'apopseis', 'out-xalandri', 'mymag', '18694-to-neo-orario-kai-o-tropos-leitourgias-ton-dimotikon-vivliothikon-xalandriou', '18688-kalosorisma-tis-enosis-goneon-xalandriou-gia-ti-nea-sxoliki-xronia', '18691-me-156-687-88-epixorigeitai-o-dimos-xalandriou-gia-pliromi-misthomaton-sxolikon-monadon', '18693-fotia-se-aporrimmatoforo-tou-dimou-xalandriou-pos-proklithike', '18690-synelifthi-sto-xalandri-50xronos-allodapos-gia-tilefonikes-apates-me-to-prosxima-troxaiou-atyximatos', '18689-xalandri-orizontas-2023-o-athlitismos-exei-koinoniko-prosimo-kai-einai-to-xamogelo-kathe-topikis-koinonias', '18687-dimos-xalandriou-nea-sxoliki-xronia-se-kainoyrgia-sxoleia', '18686-i-eggrafi-apantisi-tou-

In [7]:
urls = []
site_maps_file = 'sitemaps.txt'

# choice to delete or not delete the existing directories
delete_existing_directories = True


# Check if the file exists
if os.path.exists(site_maps_file):
    # Read one URL per line; whitespace is stripped and blank lines are
    # dropped so an empty line cannot reach the domain regex below.
    with open(site_maps_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
else:
    # Warn that the file does not exist
    print('File does not exist')


for url in urls:
    # Extract the domain name and top-level domain (TLD) from the URL.
    # re.search returns None for lines that are not an https:// URL followed
    # by a path, so the match is checked before .group() is called.
    domain_match = re.search(r'(?<=https://)(.*?)(?=/)', url)
    if domain_match is None or not domain_match.group(0):
        print(f'Skipping line without a recognisable https:// URL: {url}')
        continue
    domain_dot_tld = domain_match.group(0)

    # Check if the directory exists (directory name contains the domain name and TLD)
    if not os.path.exists(domain_dot_tld):
        print(domain_dot_tld)
        os.makedirs(domain_dot_tld)
    elif delete_existing_directories:
        print(f'Directory {domain_dot_tld} already exists. Deleting it and creating a new one')
        # Delete the directory and create a new one
        shutil.rmtree(domain_dot_tld)
        os.makedirs(domain_dot_tld)
    else:
        print(f'Directory {domain_dot_tld} already exists')

    # create 3 directories under the domain_dot_tld directory: html_files, txt_files,  metadata_files
    html_files = os.path.join(domain_dot_tld, 'html_files')
    txt_files = os.path.join(domain_dot_tld, 'txt_files')
    metadata_files = os.path.join(domain_dot_tld, 'metadata_files')

    # Create each missing sub-directory; only pre-existing ones are reported
    for sub_directory in (html_files, txt_files, metadata_files):
        if not os.path.exists(sub_directory):
            os.makedirs(sub_directory)
        else:
            print(f'Directory {sub_directory} already exists')
    


Directory myxalandri.gr already exists. Deleting it and creating a new one


### Cells that extract data from urls in the sitemap (without the index pages) - accuracy optimized

In [87]:
def extract_websites_from_sitemap(url_to_scrape):
    """
    Download every page listed in a sitemap into the `html/` folder.

    Each <loc> entry of the sitemap at `url_to_scrape` is requested. A page
    answered with HTTP 200 is written to `html/<name>.html`, where `<name>`
    is the text after the last '/' of its URL; any other status is reported
    with a printed message and the page is skipped.

    Returns the list of names of the pages that were saved.
    """
    # Retrieve the sitemap.xml file
    sitemap_response = requests.get(url_to_scrape)
    # Parse the XML document
    soup = BeautifulSoup(sitemap_response.text, 'html.parser')
    # Find all the <loc> tags
    loc_tags = soup.find_all('loc')
    websites = []

    # open() does not create missing directories, so make sure the target
    # folder exists before the first page is written.
    os.makedirs('html', exist_ok=True)

    for loc_tag in loc_tags:
        # Extract the URL from the <loc> tag
        website_url = loc_tag.text

        # Extract the last part of the URL as the website name
        website_name = website_url.split('/')[-1]

        response = requests.get(website_url)
        if response.status_code != 200:
            print(f"Failed to retrieve HTML document for website: {website_url}")
            continue

        # Save the HTML document with the website name
        with open(f"html/{website_name}.html", "w", encoding="utf-8") as file:
            file.write(response.text)
        websites.append(website_name)

    return websites


# Sitemap to read the page URLs from
url_to_scrape = "https://myxalandri.gr/sitemap.xml"
# Keep the returned list in `websites`; the following cells pass it on
websites = extract_websites_from_sitemap(url_to_scrape)
# Show the collected list
print(websites)




Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/18600-halbike-i-halari-podilatovolta-tou-xalandriou-stis-9-9
Failed to retrieve HTML document for website: https://myxalandri.gr/out-xalandri/18354-pame-sinema-sto-xalandri-3-9-9-2020
Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/18646-protoporiako-programma-proponiseon-ston-athlitiko-omilo-dioni
Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/11573-halbike-podilatovolta-tin-tetarti-29-11
Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/11662-halbike-podilatovolta-tin-tetarti13-12
Failed to retrieve HTML document for website: https://myxalandri.gr/athlitismos/11622-halbike-podilatovolta-tin-tetarti-6-12
['', 'eidiseis', 'athlitismos', 'politismos', 'apopseis', 'out-xalandri', 'mymag', '18694-to-neo-orario-kai-o-tropos-leitourgias-ton-dimotikon-vivliothikon-xalandriou', '18688-kalosorisma-tis-enosis-goneon-xaland

In [88]:
def extract_headline_and_description(links):
    """
    Write the headline and description of saved pages to `txt/<name>.txt`.

    For each entry of `links`, the text after its last '/' is taken as the
    page name and `html/<name>.html` is read. Every
    <script data-type="gsd"> tag in that file is parsed as JSON; when the
    JSON object holds both `headline` and `description`, they are written
    to `txt/<name>.txt`, so a later matching tag overwrites an earlier one.

    Missing HTML files, empty script tags and unparsable JSON are skipped;
    missing files and unparsable JSON are reported with a printed message.
    """
    # open() does not create missing directories
    os.makedirs('txt', exist_ok=True)

    for link in links:
        page_name = link.split('/')[-1]
        html_file = 'html/' + page_name + '.html'

        # Check if the file exists
        if not os.path.isfile(html_file):
            print(f"File {html_file} does not exist.")
            continue

        # Read as UTF-8 explicitly instead of relying on the platform default
        with open(html_file, "r", encoding="utf-8") as file:
            html_data = file.read()

        soup = BeautifulSoup(html_data, 'html.parser')

        # Find all <script> tags with data-type="gsd"
        script_tags = soup.find_all('script', attrs={'data-type': 'gsd'})

        # Extract the headline and description from each script tag
        for script_tag in script_tags:
            # .string is None when the tag has no single text child
            if not script_tag.string:
                continue

            # Parse the JSON data; one broken tag must not abort the whole run
            try:
                data = json.loads(script_tag.string.strip())
            except json.JSONDecodeError:
                print(f"Skipping unparsable JSON in {html_file}")
                continue

            # Only a JSON object supports the key lookups below
            if not isinstance(data, dict):
                continue

            headline = data.get('headline')
            description = data.get('description')

            if headline is not None and description is not None:
                # write to a .txt file the headline and description
                with open('txt/' + page_name + '.txt', 'w', encoding='utf-8') as f:
                    f.write('Title:' + '\n' + headline + '\n')
                    f.write('Description:' + '\n' + description + '\n')


# Process every entry of the `websites` list
extract_headline_and_description(websites)

In [89]:
def extract_metadata(links):
    """
    Extracts the metadata from the saved HTML documents to JSON files.

    For each entry of `links`, the text after its last '/' is taken as the
    page name and `html/<name>.html` is read. Every <meta> tag that has both
    a non-empty `name` and a non-empty `content` attribute is collected
    (the last tag wins when a name repeats) and the result is written to
    `meta/<name>.meta`. Pages without a saved HTML file are reported with a
    printed message and skipped.
    """
    # open() does not create missing directories
    os.makedirs('meta', exist_ok=True)

    for link in links:
        # Same name derivation as the headline/description step: keep only
        # the last path segment, which leaves plain page names unchanged.
        page_name = link.split('/')[-1]
        html_file = 'html/' + page_name + '.html'

        # Check if the file exists
        if not os.path.isfile(html_file):
            print(f"File {html_file} does not exist.")
            continue

        # Read as UTF-8 explicitly instead of relying on the platform default
        with open(html_file, "r", encoding="utf-8") as file:
            html_data = file.read()

        soup = BeautifulSoup(html_data, 'html.parser')

        # Extract the desired metadata from each meta tag
        metadata = {}
        for meta_tag in soup.find_all('meta'):
            name = meta_tag.get('name')
            content = meta_tag.get('content')
            if name and content:
                metadata[name] = content

        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # output file is opened as UTF-8 explicitly.
        meta_file = 'meta/' + page_name + '.meta'
        with open(meta_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(metadata, indent=4, ensure_ascii=False))

# Process every entry of the `websites` list
extract_metadata(websites)

### Cells that extract data from urls in the sitemap (without the index pages) - storage optimized

In [5]:

def extract_href_from_html_files(file_list):
    """
    Collect the link targets found in saved HTML files and print per-file counts.

    For each name in `file_list`, `html/<name>.html` is parsed and the
    non-empty `href` values of its <a> tags are gathered. The total count,
    the count of hrefs containing 'myxalandri' and the count of the rest are
    printed. A file with more than 50 hrefs of the latter kind is reported
    as an index page. Missing files are reported and skipped. The number of
    index pages is printed at the end.

    Returns one flat list with the non-empty hrefs of all files, in order.
    """
    href_list_all = []
    index_page_count = 0

    for file_path in file_list:
        html_path = 'html/' + file_path + '.html'

        if not os.path.isfile(html_path):
            print(f"File not found: {file_path}")
            continue

        # Read as UTF-8 explicitly instead of relying on the platform default
        with open(html_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # Anchors without an href (or with an empty one) are left out, so the
        # returned list never contains None entries.
        href_list = [tag.get('href') for tag in soup.find_all('a') if tag.get('href')]
        href_list_xal = [href for href in href_list if 'myxalandri' in href]
        href_list_other = [href for href in href_list if 'myxalandri' not in href]
        href_list_all.extend(href_list)

        # Print the number of hrefs
        print(file_path+":\t"+ "total\t" +str(len(href_list)))
        print(file_path+":\t"+ "xalandri\t" +str(len(href_list_xal)))
        print(file_path+":\t"+ "other\t" +str(len(href_list_other)))
        if len(href_list_other) > 50:
            print(file_path+":\t"+ "is index page")
            index_page_count += 1

        print("--------------------------------------------------"*2)

    print(index_page_count)
    return href_list_all


# Needs `websites` to be defined by an earlier cell; the recorded run of this
# cell failed with a NameError because it was not.
hrefs = extract_href_from_html_files(websites)

NameError: name 'websites' is not defined

In [None]:
def extract_headline_and_description(links):
    """
    Fetch pages and write their headline and description to `txt/<name>.txt`.

    Each entry of `links` is either a complete http(s) URL, used as is, or a
    path that is appended to 'https://myxalandri.gr/'. The text after the
    last '/' of the entry is the page name. Every <script data-type="gsd">
    tag of the fetched page is parsed as JSON; when the JSON object holds
    both `headline` and `description`, they are written to `txt/<name>.txt`,
    so a later matching tag overwrites an earlier one.

    Empty script tags and unparsable JSON are skipped; the latter is
    reported with a printed message.
    """
    # open() does not create missing directories
    os.makedirs('txt', exist_ok=True)

    for link in links:
        # Prefixing an entry that is already a complete URL would produce an
        # unusable address, so only bare paths get the site prefix.
        if link.startswith(('http://', 'https://')):
            html_link = link
        else:
            html_link = 'https://myxalandri.gr/' + link

        # Only the last path segment is used for the output file name
        page_name = link.split('/')[-1]

        html_document = requests.get(html_link)
        # BeautifulSoup needs the markup text, not the Response object
        soup = BeautifulSoup(html_document.text, 'html.parser')

        # Find all <script> tags with data-type="gsd"
        script_tags = soup.find_all('script', attrs={'data-type': 'gsd'})

        # Extract the headline and description from each script tag
        for script_tag in script_tags:
            # .string is None when the tag has no single text child
            if not script_tag.string:
                continue

            # Parse the JSON data; one broken tag must not abort the whole run
            try:
                data = json.loads(script_tag.string.strip())
            except json.JSONDecodeError:
                print(f"Skipping unparsable JSON in {html_link}")
                continue

            # Only a JSON object supports the key lookups below
            if not isinstance(data, dict):
                continue

            headline = data.get('headline')
            description = data.get('description')

            if headline is not None and description is not None:
                # write to a .txt file the headline and description
                with open('txt/' + page_name + '.txt', 'w', encoding='utf-8') as f:
                    f.write('Title:' + '\n' + headline + '\n')
                    f.write('Description:' + '\n' + description + '\n')

# Process every entry of the `websites` list
extract_headline_and_description(websites)

In [None]:

def extract_websites_from_sitemap(url_to_scrape):
    """
    Extracts the non-index pages listed in the sitemap of the given URL.

    Every <loc> entry of the sitemap is requested and its <a> tags are
    inspected. A page with more than 50 hrefs that do not contain
    'myxalandri' is reported as an index page and skipped. Any other page
    answered with HTTP 200 is saved to `html/<last URL segment>.html` and
    its URL is recorded. The number of index pages is printed at the end.

    Returns the list of URLs of the saved pages.
    """
    # Retrieve the HTML document of the sitemap
    html_document = requests.get(url_to_scrape)
    html_document.encoding = 'utf-8'

    # Create a BeautifulSoup object for parsing the HTML document
    soup = BeautifulSoup(html_document.text, 'html.parser')

    # Find all <loc> tags in the sitemap
    loc_tags = soup.find_all('loc')
    websites = []
    index_count = 0

    # open() does not create missing directories
    os.makedirs('html', exist_ok=True)

    # Extract the website from each <loc> tag
    for loc_tag in loc_tags:
        website_url = loc_tag.text

        # Retrieve the HTML document of the website
        response = requests.get(website_url)

        # Create a BeautifulSoup object for parsing the HTML document
        page_soup = BeautifulSoup(response.text, 'html.parser')

        # hrefs of the page that do not mention the site itself
        href_list_other = []
        for tag in page_soup.find_all('a'):
            href = tag.get('href')
            if href and 'myxalandri' not in href:
                href_list_other.append(href)

        # Print the number of hrefs and if the website is an index
        if len(href_list_other) > 50:
            print(website_url+":\t"+ "is index page")
            print("--------------------------------------------------"*2)
            index_count += 1
        elif response.status_code == 200:
            # Write the HTML document to a .html file, as UTF-8 explicitly so
            # the result does not depend on the platform default encoding
            with open('html/' + website_url.split('/')[-1] + '.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            # Append the website to the list of websites
            websites.append(website_url)

    print(index_count)
    return websites


# Sitemap to read the page URLs from
url_to_scrape = "https://myxalandri.gr/sitemap.xml"
# Keep the returned list in `websites`
websites = extract_websites_from_sitemap(url_to_scrape)




https://myxalandri.gr/:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/eidiseis:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/athlitismos:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/politismos:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/apopseis:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/out-xalandri:	is index page
----------------------------------------------------------------------------------------------------
https://myxalandri.gr/mymag:	is index page
--------------------------------------------------------------------------

In [None]:
def save_html_files(websites):
    """
    Saves the HTML files of the given websites.

    Each URL in `websites` is requested, its body is decoded as UTF-8 and
    written to `html/<last URL segment>.html`. The `html` directory is
    created when it is missing. The response status is not checked.
    """
    # open() does not create missing directories
    os.makedirs('html', exist_ok=True)

    for website in websites:
        # Retrieve the HTML document of the website
        response = requests.get(website)
        response.encoding = 'utf-8'

        # The text was decoded as UTF-8 above, so it is written back as UTF-8
        # instead of with the platform default encoding.
        with open('html/' + website.split('/')[-1] + '.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

# Download and store every URL in `websites`
save_html_files(websites)

In [None]:
import requests
from bs4 import BeautifulSoup

def extract_websites_from_sitemap(url_to_scrape, max_external_links=50):
    """
    Extracts the non-index pages listed in the sitemap of the given URL.

    Parameters:
        url_to_scrape: URL of the sitemap whose <loc> entries are visited.
        max_external_links: a page with more hrefs that do not contain
            'myxalandri' than this value is counted as an index page and
            skipped. Defaults to 50.

    Every other page answered with HTTP 200 is saved to
    `html/<last URL segment>.html` and its URL is recorded. The number of
    index pages is printed at the end.

    Returns the list of URLs of the saved pages.
    """
    # Retrieve the HTML document of the sitemap
    html_document = requests.get(url_to_scrape)
    html_document.encoding = 'utf-8'

    # Create a BeautifulSoup object for parsing the HTML document
    soup = BeautifulSoup(html_document.text, 'html.parser')

    # Find all <loc> tags in the sitemap
    loc_tags = soup.find_all('loc')
    websites = []
    index_count = 0

    # open() does not create missing directories
    os.makedirs('html', exist_ok=True)

    # Extract the website from each <loc> tag
    for loc_tag in loc_tags:
        website_url = loc_tag.text

        # Retrieve the HTML document of the website
        response = requests.get(website_url)

        # Create a BeautifulSoup object for parsing the HTML document
        page_soup = BeautifulSoup(response.text, 'html.parser')

        # hrefs of the page that do not mention the site itself
        href_list_other = []
        for tag in page_soup.find_all('a'):
            href = tag.get('href')
            if href and 'myxalandri' not in href:
                href_list_other.append(href)

        if len(href_list_other) > max_external_links:
            # Too many such links: count the page as an index page
            index_count += 1
        elif response.status_code == 200:
            # Write the HTML document to a .html file, as UTF-8 explicitly so
            # the result does not depend on the platform default encoding
            with open('html/' + website_url.split('/')[-1] + '.html', 'w', encoding='utf-8') as f:
                f.write(response.text)

            # Append the website to the list of websites
            websites.append(website_url)

    print(index_count)
    return websites


# Sitemap to read the page URLs from
url_to_scrape = "https://myxalandri.gr/sitemap.xml"
# Keep the returned list in `websites`
websites = extract_websites_from_sitemap(url_to_scrape)


106


In [None]:
# Show the full list held in `websites`
print(websites)

['https://myxalandri.gr/politismos/18694-to-neo-orario-kai-o-tropos-leitourgias-ton-dimotikon-vivliothikon-xalandriou', 'https://myxalandri.gr/eidiseis/18688-kalosorisma-tis-enosis-goneon-xalandriou-gia-ti-nea-sxoliki-xronia', 'https://myxalandri.gr/eidiseis/18691-me-156-687-88-epixorigeitai-o-dimos-xalandriou-gia-pliromi-misthomaton-sxolikon-monadon', 'https://myxalandri.gr/eidiseis/18693-fotia-se-aporrimmatoforo-tou-dimou-xalandriou-pos-proklithike', 'https://myxalandri.gr/eidiseis/18690-synelifthi-sto-xalandri-50xronos-allodapos-gia-tilefonikes-apates-me-to-prosxima-troxaiou-atyximatos', 'https://myxalandri.gr/athlitismos/18689-xalandri-orizontas-2023-o-athlitismos-exei-koinoniko-prosimo-kai-einai-to-xamogelo-kathe-topikis-koinonias', 'https://myxalandri.gr/eidiseis/18687-dimos-xalandriou-nea-sxoliki-xronia-se-kainoyrgia-sxoleia', 'https://myxalandri.gr/eidiseis/18686-i-eggrafi-apantisi-tou-deddie-gia-ti-revmatoklopi-ston-katavlismo-roma-tou-nomismatokopeiou', 'https://myxalandri.gr

In [None]:
# Show the text after the last '/' of the first entry
print(websites[0].split('/')[-1])

18694-to-neo-orario-kai-o-tropos-leitourgias-ton-dimotikon-vivliothikon-xalandriou
