In [None]:
# title: osmand_ap_stats.ipynb
# Tracks the size of the India (Andhra Pradesh) OsmAnd download file over time, using archive.org (Wayback Machine) captures.
# WIP: HTML read errors are not yet handled gracefully; for now, abort the cell and save capture_df to file manually.
# input:  "data/osmand_ap_stats.csv"
# input: "data/osmand_ap_captures.csv"
# output: "data/osmand_ap_stats.csv", updated when there is new data



In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import requests
from pathlib import Path
import io
from bs4 import BeautifulSoup
import time
import wayback
from datetime import datetime
from datetime import date
from dateutil.relativedelta import relativedelta

In [2]:
# Load the list of Wayback Machine captures of the OsmAnd download page.
# The list is cached in a CSV file so the CDX API is queried only when the
# cache is missing; delete the file and rerun this cell to refresh it.
arx_file_path=Path("data/osmand_ap_captures.csv")
if arx_file_path.exists():
    print(f"The path '{arx_file_path}' exists, will not load new captures_data, If updated needed delete the existing file and rerun.")
    # Timestamps are YYYYMMDDHHMMSS keys, so keep them as text rather than numbers.
    arx_df = pd.read_csv(arx_file_path, dtype={'timestamp': str})

else:
    print(f"The path '{arx_file_path}' does not exist, will create.")
    # Page whose archived captures are listed.
    osmand_downloads_uri= 'download.osmand.net/list.php'
    # Wayback Machine CDX search URL; output=json returns a list of rows whose
    # first row holds the column names.
    wayback_url='https://web.archive.org/cdx/search/cdx?url='+osmand_downloads_uri+'&from=2015&to=2025&output=json'

    # Send the request to the Wayback Machine CDX API.
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(wayback_url, timeout=30)

    # Check for successful response
    if response.status_code != 200:
        raise Exception(f"Wayback machine request failed with status code {response.status_code}")
    print("✅ Query executed successfully.")

    cdx_rows = response.json()
    # Fail early with a clear message instead of caching an empty capture list.
    if len(cdx_rows) < 2:
        raise RuntimeError(f"Wayback machine returned no captures for {osmand_downloads_uri}")

    # The first row is the header, the remaining rows are the captures.
    arx_df = pd.DataFrame(cdx_rows[1:], columns=cdx_rows[0])

    # Keep only captures that were archived with HTTP status 200 (exact match,
    # not a substring test).
    arx_df = arx_df[arx_df['statuscode'].astype('str') == '200']
    # Reset the index so it starts from 0 again
    arx_df = arx_df.reset_index(drop=True)
    arx_df['timestamp']=arx_df['timestamp'].astype('str')

    # Save the data so that a reattempt is not required; create the target
    # directory first so a fresh checkout does not fail on the write.
    arx_file_path.parent.mkdir(parents=True, exist_ok=True)
    arx_df.to_csv(arx_file_path, index=False, encoding="utf-8")
    print(arx_df.head())

The path 'data/osmand_ap_captures.csv' exists, will not load new captures_data, If updated needed delete the existing file and rerun.


In [3]:


# Work out which time window still has to be captured.
# strptime/strftime pattern for Wayback timestamps (YYYYMMDDHHMMSS).
format_pattern = '%Y%m%d%H%M%S'

# Reset on every run: a later cell reassigns min_ts, so a value left over in
# the kernel from an earlier execution must not be mistaken for a resume point.
min_ts = None

# check for the data that is already written during prior runs
file_path=Path("data/osmand_ap_stats.csv")
if file_path.exists():
    print(f"The path '{file_path}' exists, will append any newdata.")
    capture_df = pd.read_csv(file_path)
    capture_df['File']=capture_df['File'].astype('str')
    capture_df['Date']=capture_df['Date'].astype('str')
    capture_df['Description']=capture_df['Description'].astype('str')

    # An existing but empty stats file has no last row to resume from.
    if not capture_df.empty:
        min_ts_str=capture_df['Date'].iloc[-1]
        # assumes Date looks like DD.MM.YYYY (month at [3:5], year in the
        # last four characters) — TODO confirm against the stats CSV
        min_ts_yy=min_ts_str[-4:]
        min_ts_mm=min_ts_str[3:5]
        min_ts=min_ts_yy+min_ts_mm+'01000000'
        # Resume three months after the last stored record.
        min_ts_object= datetime.strptime(min_ts, format_pattern)+relativedelta(months=3)
        min_ts=min_ts_object.strftime(format_pattern)

else:
    # initialise capture_df to gather relevant data from archive captures.
    capture_df = pd.DataFrame({'File': pd.Series(dtype='str'),
                'Date': pd.Series(dtype='str'),
                'Size': pd.Series(dtype='float'),
                'Description': pd.Series(dtype='str')
                })
    print(f"The path '{file_path}' does not exist, will create if I get new data.")

# No stored records: start from the first capture in the list.
# Positional access is used so the result does not depend on index labels.
if min_ts is None:
    min_ts=arx_df['timestamp'].iloc[0]
max_ts=arx_df['timestamp'].iloc[-1]

print(f' data to be captured: min_timestamp:{min_ts}; max_timestamp:{max_ts}')


The path 'data/osmand_ap_stats.csv' exists, will append any newdata.
 data to be captured: min_timestamp:20250501000000; max_timestamp:20251207200921


In [4]:
# Snap the lower bound to the first day of its month (only year and month
# are retained), then parse both bounds into datetime objects.
min_ts = f"{min_ts[:6]}01000000"
min_ts_object, max_ts_object = (
    datetime.strptime(bound, format_pattern) for bound in (min_ts, max_ts)
)

# Show the parsed lower bound and confirm its type.
print(f"Datetime object: {min_ts_object}")
print(f"Type: {type(min_ts_object)}")

Datetime object: 2025-05-01 00:00:00
Type: <class 'datetime.datetime'>


In [5]:
from requests.exceptions import HTTPError, RequestException

# Fetch one archived copy of the download page per quarter and collect the
# rows for the Andhra Pradesh file into capture_df.
MAX_ATTEMPTS = 3         # tries per capture before giving up on it
RETRY_DELAY_S = 5        # pause between tries, in seconds
REQUEST_TIMEOUT_S = 10   # per-request timeout, to prevent hanging

# '3MS' steps through the start of every third month.
for dt in pd.date_range(start=min_ts_object, end=max_ts_object, freq='3MS'):
    dt_yymm=dt.strftime("%Y%m")
    dt_pattern=r'^'+dt_yymm
    print(f'Checking archived page for {dt_pattern}')

    # Index labels of all captures whose timestamp starts with this year+month.
    matches=arx_df.index[arx_df['timestamp'].str.contains(dt_pattern,regex=True,na=False)]
    if len(matches)==0:
        print("capture for timestamp not available, move to next")
        continue

    # Use the last matching capture of the month.
    last_match_index = matches[-1]
    print(f"The index of the last match is: {last_match_index}")

    capture_ts = arx_df.loc[last_match_index, 'timestamp']
    capture_url= arx_df.loc[last_match_index, 'original']
    wayback_url='https://web.archive.org/web/'+capture_ts+'id_/'+capture_url
    print(wayback_url)

    # Bounded retry: request failures are retried after a pause; the loop
    # always terminates, so one bad capture cannot stall the whole cell.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(wayback_url, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
        except HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}") # e.g., 404 Not Found, 503 Service Unavailable
        except RequestException as req_err:
            print(f"A general request error occurred: {req_err}") # e.g., Connection error, Timeout
        else:
            print("✅ Query executed successfully.")
            try:
                # Parse the first HTML table of the page into a DataFrame.
                soup = BeautifulSoup(response.content, 'html.parser')
                table = soup.find("table")
                page_df = pd.read_html(io.StringIO(str(table)))[0]
                # Keep only the Andhra Pradesh file rows; na=False keeps the
                # mask boolean when a File cell is empty.
                page_df=page_df[page_df['File'].str.contains(r'^India_andhra-pradesh',case=False, regex=True, na=False)]
            except (ValueError, KeyError, AttributeError) as parse_err:
                # The page was fetched but holds no usable file table; fetching
                # the same capture again would not change that, so skip it.
                print(f"Could not read the file table from the capture: {parse_err}")
            else:
                if len(page_df)>0 :
                    capture_df = pd.concat([capture_df, page_df], ignore_index=True)
                    print(f"record(s) added {dt_pattern}")
            break

        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY_S)  # wait before retrying
    else:
        print(f"Giving up on {wayback_url} after {MAX_ATTEMPTS} attempts, move to next")

# Persist everything gathered so far; make sure the target directory exists.
file_path.parent.mkdir(parents=True, exist_ok=True)
capture_df.to_csv(file_path, index=False, encoding="utf-8")

Checking archived page for ^202505
The index of the last match is: 168
https://web.archive.org/web/20250502001157id_/http://download.osmand.net/list.php
✅ Query executed successfully.
record(s) added ^202505
Checking archived page for ^202508
The index of the last match is: 174
https://web.archive.org/web/20250823105239id_/http://download.osmand.net/list.php
✅ Query executed successfully.
record(s) added ^202508
Checking archived page for ^202511
The index of the last match is: 179
https://web.archive.org/web/20251126025324id_/http://download.osmand.net/list.php
✅ Query executed successfully.
record(s) added ^202511


In [None]:
# Manual save: writes capture_df to file_path as UTF-8 CSV without the index column.
# Run this cell by hand when the capture cell was aborted before reaching its own save.
capture_df.to_csv(file_path, index=False, encoding="utf-8")