In [None]:
# title: osmand_ap_stats.ipynb
# tracks the size of the India_andhra-pradesh OsmAnd download file over time, using captures from archive.org
# input:  "data/osmand_ap_stats.csv"
# output: visualisation of stats
# output: updated input file (newly collected data is appended)

In [None]:
%pip install pandas
%pip install matplotlib


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
from pathlib import Path

# Location of the CSV holding previously collected stats (relative to the
# notebook's working directory).
file_path = Path("data/osmand_ap_stats.csv")

if file_path.exists():
    print(f"The path '{file_path}' exists, will append any newdata.")
    # pandas has no `read_file`; CSV files are loaded with `read_csv`.
    df = pd.read_csv(file_path)
    # A bare `df.head()` nested inside an `if` block is not rendered by Jupyter,
    # so print the preview explicitly.
    print(df.head())
else:
    print(f"The path '{file_path}' does not exist, will create if I get new data.")


In [8]:
# Ask the Internet Archive which captures of the OsmAnd download listing exist.
osmand_downloads_uri = 'download.osmand.net/list.php'

# Wayback Machine CDX search endpoint. Example query:
#   http://web.archive.org/cdx/search/cdx?url=archive.org&from=2010&to=2011
# Here: all captures of the OsmAnd download page from 2015 to 2025, as JSON.
wayback_url = 'http://web.archive.org/cdx/search/cdx?url=' + osmand_downloads_uri + '&from=2015&to=2025&output=json'

# Regex matching file names that start with 'India_andhra-pradesh'
# (defined here, not used in this cell).
row_filterexp = r'^India_andhra-pradesh'

# Send the request to the Wayback CDX API. Without a timeout, requests waits
# indefinitely when the archive is slow or unreachable.
response = requests.get(wayback_url, timeout=60)

# Stop the notebook early on any non-200 answer (the archive regularly returns
# 503 when overloaded) so later cells do not try to parse an error page.
if response.status_code == 200:
    print("✅ Query executed successfully.")
else:
    raise Exception(f"Wayback CDX API request failed with status code {response.status_code}")
response

Exception: Overpass API request failed with status code 503

In [None]:
import io
# Decode the CDX JSON payload: a list of rows in which the first row carries
# the column names and every following row describes one capture.
cdx_rows = response.json()

# An empty result (or a header with no data rows) would otherwise surface as an
# opaque IndexError further down; fail here with a message that names the cause.
if len(cdx_rows) < 2:
    raise ValueError("Wayback CDX response contains no captures; cannot build the capture table.")

# Build the frame in one step: first row -> column headers, remaining rows -> data.
# The resulting index already starts at 0, so no reset is needed.
# NOTE: this rebinds `df`, which an earlier cell may have set to the saved stats CSV.
df = pd.DataFrame(cdx_rows[1:], columns=cdx_rows[0])

# Show a short preview instead of dumping every capture row.
print(f"{len(df)} capture rows loaded")
df.head()

In [None]:
# Grab the timestamp of the first row (label 0) and of the last row, then
# report them together with the total number of capture rows.
timestamps = df['timestamp']
min_ts = timestamps.loc[0]
max_ts = timestamps.iloc[-1]
capture_count = len(df)
print(f'number of captures: {capture_count}; min_timestamp:{min_ts}; max_timestamp:{max_ts}')


In [None]:
from datetime import datetime


# Capture timestamps are 14-digit strings laid out as YYYYMMDDHHMMSS.
format_pattern = '%Y%m%d%H%M%S'

# Parse both boundary timestamps into datetime objects in one pass.
min_ts_object, max_ts_object = (
    datetime.strptime(raw_ts, format_pattern) for raw_ts in (min_ts, max_ts)
)

# Show the parsed lower bound and confirm what type it is.
print(f"Datetime object: {min_ts_object}")
print(f"Type: {type(min_ts_object)}")

In [None]:
%pip install pandas lxml beautifulsoup4
from bs4 import BeautifulSoup
# For every month between the first and last capture, fetch the earliest archived
# copy of the download listing and keep its 'India_andhra-pradesh' rows.
import time  # needed for the pause between archive requests

# Seconds to wait after each Wayback Machine request so the archive is not hammered.
REQUEST_PAUSE_SECONDS = 5
# Seconds before a single archive request is abandoned instead of hanging the cell.
REQUEST_TIMEOUT_SECONDS = 60

# pd.date_range(freq='MS') rolls a mid-month start forward to the NEXT month
# start, which would skip the month of the earliest capture. Anchor the start at
# the first instant of that month so every month in the range is visited.
first_month_start = min_ts_object.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

# Collect the per-page frames in a list and concatenate once at the end rather
# than re-copying a growing DataFrame on every iteration.
page_frames = []

# 'MS' frequency generates the start of each month (Month Start)
for dt in pd.date_range(start=first_month_start, end=max_ts_object, freq='MS'):
    dt_pattern = dt.strftime("^%Y%m")
    print(f'processing archived page {dt_pattern}')

    # Boolean Series marking the captures whose timestamp starts with this year+month.
    matches = df['timestamp'].str.contains(dt_pattern, regex=True, na=False)
    if not matches.any():
        print("No matching archive page for timestamp found.")
        continue

    # idxmax() returns the index of the first True value, i.e. the first capture
    # listed for this month. Only meaningful because matches.any() was checked above.
    first_capture = df.loc[matches.idxmax()]
    capture_ts = first_capture['timestamp']
    capture_url = first_capture['original']
    wayback_url = 'http://web.archive.org/web/' + capture_ts + '/' + capture_url
    print(wayback_url)

    # Fetch the archived page from the Wayback Machine. A network failure for one
    # month is reported and skipped so the remaining months are still processed.
    try:
        response = requests.get(wayback_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"request to archive failed: {exc}")
        continue
    finally:
        # Pause after every request, successful or not.
        time.sleep(REQUEST_PAUSE_SECONDS)

    if response.status_code != 200:
        print(f"response-code from archive {response.status_code}")
        continue
    print("✅ Query executed successfully.")

    # Parse the HTML and locate the table that follows the first <h1>.
    soup = BeautifulSoup(response.content, 'html.parser')
    first_h1 = soup.find('h1')
    # find_next_sibling() only sees direct siblings; a page without an <h1>
    # (first_h1 is None) is treated the same as a page without the table.
    table_after_first_h1 = first_h1.find_next_sibling('table') if first_h1 is not None else None
    if not table_after_first_h1:
        print("No immediate sibling table found after the first h1 using find_next_sibling.")
        continue
    print("Found table after the first h1 (using find_next_sibling):")

    # read_html returns a list of frames; the single table passed in is element [0].
    page_df = pd.read_html(io.StringIO(str(table_after_first_h1)))[0]
    if 'File' not in page_df.columns:
        print(f"archived table for {dt_pattern} has no 'File' column, skipping")
        continue

    # Keep only the Andhra Pradesh rows; astype(str) makes the match safe when
    # the column contains missing values.
    page_df = page_df[page_df['File'].astype(str).str.contains(r'^India_andhra-pradesh', case=False, regex=True)]
    if len(page_df) >= 1:
        page_frames.append(page_df)
        print(f"record(s) added {dt_pattern}")

# Same result as appending row blocks one by one: an empty frame when nothing was found.
capture_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
print(f"collected {len(capture_df)} row(s) from {len(page_frames)} archived page(s)")