In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# Note, Issues with .htm extension starts from https://www.resmigazete.gov.tr/eskiler/2000/06/20000627.htm
# From the first issue to https://www.resmigazete.gov.tr/eskiler/2005/01/20050102.htm, inside links direct to footers in page.
# Example: https://www.resmigazete.gov.tr/eskiler/2004/12/20041231.htm#1 etc.
# After this issue, inside links direct to separate pages https://www.resmigazete.gov.tr/eskiler/2005/01/20050102-1.htm etc.



# Until there, the URL style is https://www.resmigazete.gov.tr/arsiv/23894.pdf where the last number indicates Issue number
# The first issue is https://www.resmigazete.gov.tr/arsiv/1.pdf , in Arabic
# The first Turkish issue is https://www.resmigazete.gov.tr/arsiv/1054.pdf 


# Most of the content is in "a" or "p" attribute. Some instances also have "h1" attribute for the first title. (shown in 20 Feb 2006)

In [5]:
# Data storage
data_RG = []

# URL content for looping
base_url = 'https://www.resmigazete.gov.tr/eskiler'
year = [] # Use a specific year for demonstration
month = []  # Specific month for demonstration
day = []  # Specific days for demonstration

# Get URL
url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
response = requests.get(url)

# The function for getting the XPATH
def get_xpath(element):
    """Generate XPath for a BeautifulSoup element by walking up its tree."""
    components = []
    child = element if element.name else element.parent
    while child is not None and child.name != '[document]':
        siblings = list(child.parent.children) if child.parent else []
        # Count occurrences of this tag type before the current element
        count = 1 + sum(1 for sib in siblings[:siblings.index(child)] if sib.name == child.name)
        components.append(f"{child.name}[{count}]")
        child = child.parent
    components.reverse()
    return '/' + '/'.join(components)


## Working note

Before I use the loop, I am trying to understand how to parse efficiently and effectively. The problem is, subclasses are not explicative in html. There are differences throughout the years. Provided links for the contents might be a footer or a redirection. Here are the examples.

The first block shows different Xpath styles used.

The second part shows full texts of 4 days

### Xpaths from sample dates

In [9]:
# Sample URLS with sample dates
base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2000, 2002, 2004, 2006, 2007, 2011] # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in [2, 6, 7]]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in [2, 14, 24, 27]]  # Specific days for demonstration

content = ""  # Initialize empty string to collect data

# Iterate through sample dates for html styles
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            time.sleep(5)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if 'charset' in response.headers.get('Content-Type', ''):
                encoding = response.headers['Content-Type'].split('charset=')[-1]
            else:
                encoding = 'utf-8'  # Default to UTF-8 if charset is not specified
                
            # Set the encoding of the response
            response.encoding = encoding


            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'lxml')
                
                for element in soup.find_all(["h1", 'p', 'a']): 
                    xpath = get_xpath(element)
                    text_preview = element.get_text(strip=True)[:20]  # Get first 20 chars of text
                    if text_preview:  # Check if there is any text
                        if element.name == 'a':
                            href = element['href'] if 'href' in element.attrs else 'No href available'
                            href = urljoin(url, href)  # Make href absolute
                            content += f"URL: {url}, XPath: {xpath}, Tag: {element.name}, Link: {href}, Text: {text_preview}\n"
                        else:
                            content += f"URL: {url}, XPath: {xpath}, Tag: {element.name}, Text: {text_preview}\n"
                
            else:
                print(f"Failed to retrieve the page: Status code {response.status_code}")

            content += "\n, \n, \n, \n, \n"
# Save to a file
filename = "Xpaths_resmigazete.txt"
with open(filename, "w", encoding="utf-8") as file:
    file.write(content)

# Reopen and read some lines
with open(filename, "r", encoding="utf-8") as file:
    lines = file.readlines()
    for line in lines[2619:2820]: 
        print(line.strip())

Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404
Failed to retrieve the page: Status code 404

Ziyaret s�ras�nd
URL: https://www.resmigazete.gov.tr/eskiler/2000/07/20000727.htm, XPath: /html[1]/body[1]/font[2]/p[1], Tag: p, Text: 3.

Bakanlar,
URL: https://www.resmigazete.gov.tr/eskiler/2000/07/20000727.htm, XPath: /html[1]/body[1]/font[4]/p[1], Tag: p, Text: 4.
URL: https://www.resmigazete.gov.tr/eskiler/2000/07/20000727.htm, XPath: /html[1]/body[1]/font[7]/p[1], Tag: p, Text: 5.

Taraflar g�r��me
URL: https://www.resmigazete.gov.tr/eskiler/2000/07/20000727.htm, XPath: /html[1]/body[1]/font[8]/p[1], Tag: p, Text: (i)

�ifte Vergilend
URL: https://www.resmigazete.gov.tr/eskiler/2000/07/20000727.htm, XPath: /html[1]/body[1]/font[9]/p[1], Tag: p, Text: (ii)

�ki �lke ara

### Full text examples

In [6]:
# Test first htm issue, 27 June 2000

url = "https://www.resmigazete.gov.tr/eskiler/2000/06/20000627.htm"
time.sleep(2)
response = requests.get(url)


if 'charset' in response.headers.get('Content-Type', ''):
    encoding = response.headers['Content-Type'].split('charset=')[-1]
else:
    encoding = 'utf-8'  # Default to UTF-8 if charset is not specified

# Set the encoding of the response
response.encoding = encoding

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with Beautiful Soup using the lxml parser
    soup = BeautifulSoup(response.content, 'lxml')

    # Make all 'a' tags' href attributes absolute
    

    # Print selected tag content (p and a) and absolute links with their full XPath
    for element in soup.find_all(["h1", 'p', 'a']):
        xpath = get_xpath(element)
        if element.name == 'a':
            href = element['href'] if 'href' in element.attrs else 'No href available'
            href = urljoin(url, href)  # Make href absolute
            print(f"XPath: {xpath}, Link: {href}, Text: {element.get_text(strip=True)}")
        else:
            print(f"XPath: {xpath}, Tag: {element.name}, Text: {element.get_text(strip=True)}")
else:
    print("Failed to retrieve the page: Status code", response.status_code)

XPath: /html[1]/body[1]/div[1]/p[1], Tag: p, Text: 
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: T.C.Resmî Gazete
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[2]/td[1]/p[1], Tag: p, Text: Başbakanlık
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[2]/td[1]/p[2], Tag: p, Text: Mevzuatı Geliştirme ve Yayın Genel Müdürlüğünce Yayımlanır
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[3]/td[1]/p[1], Tag: p, Text: Kuruluşu: 7 Ekim
  1920
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[3]/td[2]/p[1], Tag: p, Text: 27 Haziran 2000SALI
XPath: /html[1]/body[1]/div[1]/div[1]/table[1]/tr[3]/td[3]/p[1], Tag: p, Text: Sayı : 24092
XPath: /html[1]/body[1]/div[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: Å
XPath: /html[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/p[1]/span[1]/a[1], Link: http://www.resmigazete.gov.tr/eskiler/2000/06/20000627.htm, Text: Å
XPath: /html[1]/body[1]/div[1]/table[1]/tr[1]/td[2]/p[1], 

In [35]:
# Issue of 02 Feb 2002

url = "https://www.resmigazete.gov.tr/eskiler/2002/02/20020202.htm"
time.sleep(2)
response = requests.get(url)


if 'charset' in response.headers.get('Content-Type', ''):
    encoding = response.headers['Content-Type'].split('charset=')[-1]
else:
    encoding = 'utf-8'  # Default to UTF-8 if charset is not specified

# Set the encoding of the response
response.encoding = encoding

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with Beautiful Soup using the lxml parser
    soup = BeautifulSoup(response.content, 'lxml')

    # Make all 'a' tags' href attributes absolute
    links = soup.find_all('a', href=True)
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(url, link['href'])

    # Print selected tag content (p and a) and absolute links with their full XPath
    for element in soup.find_all(["h1", 'p', 'a']):
        xpath = get_xpath(element)
        if element.name == 'a':
            href = element['href'] if 'href' in element.attrs else 'No href available'
            print(f"XPath: {xpath}, Link: {href}, Text: {element.get_text(strip=True)}")
        else:
            print(f"XPath: {xpath}, Tag: {element.name}, Text: {element.get_text(strip=True)}")
else:
    print("Failed to retrieve the page: Status code", response.status_code)

XPath: /html[1]/body[1]/p[1], Tag: p, Text: 
XPath: /html[1]/body[1]/p[1]/font[1]/a[1], Link: No href available, Text: 
XPath: /html[1]/body[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: Başbakanlık
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/p[2], Tag: p, Text: Mevzuatı
      Geliştirme ve Yayın Genel Müdürlüğünce Yayımlanır
XPath: /html[1]/body[1]/table[1]/tr[2]/td[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: Kuruluş : 7 Ekim 1920
XPath: /html[1]/body[1]/table[1]/tr[2]/td[1]/table[1]/tr[1]/td[2]/p[1], Tag: p, Text: 2
            Şubat 2002
XPath: /html[1]/body[1]/table[1]/tr[2]/td[1]/table[1]/tr[1]/td[2]/p[2], Tag: p, Text: CUMARTESİ
XPath: /html[1]/body[1]/table[1]/tr[2]/td[1]/table[1]/tr[1]/td[3]/p[1], Tag: p, Text: Sayı : 24659
XPath: /html[1]/body[1]/p[7], Tag:

In [36]:
# Issue of 02 Feb 2006

url = "https://www.resmigazete.gov.tr/eskiler/2006/02/20060202.htm"
time.sleep(2)
response = requests.get(url)


if 'charset' in response.headers.get('Content-Type', ''):
    encoding = response.headers['Content-Type'].split('charset=')[-1]
else:
    encoding = 'utf-8'  # Default to UTF-8 if charset is not specified

# Set the encoding of the response
response.encoding = encoding

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with Beautiful Soup using the lxml parser
    soup = BeautifulSoup(response.content, 'lxml')

    # Make all 'a' tags' href attributes absolute
    links = soup.find_all('a', href=True)
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(url, link['href'])

    # Print selected tag content (p and a) and absolute links with their full XPath
    for element in soup.find_all(["h1", 'p', 'a']):
        xpath = get_xpath(element)
        if element.name == 'a':
            href = element['href'] if 'href' in element.attrs else 'No href available'
            print(f"XPath: {xpath}, Link: {href}, Text: {element.get_text(strip=True)}")
        else:
            print(f"XPath: {xpath}, Tag: {element.name}, Text: {element.get_text(strip=True)}")
else:
    print("Failed to retrieve the page: Status code", response.status_code)

XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/table[1]/tr[1]/td[1]/span[1]/a[1], Link: http://www.resmigazete.gov.tr/eskiler/2006/02/20060201.htm, Text: Å
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/table[1]/tr[1]/td[3]/p[1], Tag: p, Text: SONRAKİ
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/table[1]/tr[1]/td[4]/p[1], Tag: p, Text: Æ
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/table[1]/tr[1]/td[4]/p[1]/span[1]/font[1]/span[1]/a[1], Link: http://www.resmigazete.gov.tr/eskiler/2006/02/20060203.htm, Text: Æ
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/h1[1], Tag: h1, Text: YÜRÜTME VE İDARE BÖLÜMÜ
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/p[1], Tag: p, Text: 
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/p[2], Tag: p, Text: Yönetmelikler
XPath: /html[1]/body[1]/table[1]/tr[4]/td[1]/p[3], Tag: p, Text: — Türkiye Büyük 
    Millet Meclisi Milli Saray, Köşk, Kasır ve Müştemilatının Bakım, Onarım ve 
    Korunmasına İlişkin Yönetmelikte Değişiklik Yapılmasına Dair Yönetmelik
XPath: /html[1]

In [37]:
# Issue of 02 Feb 2008

url = "https://www.resmigazete.gov.tr/eskiler/2008/02/20080202.htm"
time.sleep(2)
response = requests.get(url)


if 'charset' in response.headers.get('Content-Type', ''):
    encoding = response.headers['Content-Type'].split('charset=')[-1]
else:
    encoding = 'utf-8'  # Default to UTF-8 if charset is not specified

# Set the encoding of the response
response.encoding = encoding

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with Beautiful Soup using the lxml parser
    soup = BeautifulSoup(response.content, 'lxml')

    # Make all 'a' tags' href attributes absolute
    links = soup.find_all('a', href=True)
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(url, link['href'])

    # Print selected tag content (p and a) and absolute links with their full XPath
    for element in soup.find_all(["h1", 'p', 'a']):
        xpath = get_xpath(element)
        if element.name == 'a':
            href = element['href'] if 'href' in element.attrs else 'No href available'
            print(f"XPath: {xpath}, Link: {href}, Text: {element.get_text(strip=True)}")
        else:
            print(f"XPath: {xpath}, Tag: {element.name}, Text: {element.get_text(strip=True)}")
else:
    print("Failed to retrieve the page: Status code", response.status_code)

XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: 2 Şubat 2008 Tarihli ve 26775 Sayılı 
			Resmî Gazete
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[1]/tr[1]/td[2]/p[1], Tag: p, Text: MEVZUAT
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[1]/span[1]/a[1], Link: http://www.resmigazete.gov.tr/eskiler/2008/02/20080201.htm, Text: Å
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[3]/p[1], Tag: p, Text: SONRAKİ
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[4]/p[1], Tag: p, Text: Æ
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[4]/p[1]/span[1]/font[1]/span[1]/a[1], Link: http://www.resmigazete.gov.tr/eskiler/2008/02/20080203.htm, Text: Æ
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/p[1], Tag: p, Text: YÜRÜTME VE İDARE BÖLÜMÜ
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/p[2], Tag: p, Text: 
XPath: /html[1]/body[1]/table[1]/tr[1]/td[1]/p[3], Tag: p, Text: MİLLETLERARASI 
	ANDLAŞMA
XPath: /h