## Web Scraper to scrape data from Sierra Avalanche Center
(and potentially other avalanche centers in USA)


* Observations can be searched on the website
* The search results are returned to the browser as JSON data
* The returned JSON data contains most of the information needed to download the images and store their metadata


### Web Scraping Process:

* We will first manually go to the Avalanche Center website and search for observations. It is important not to pull too much data in one go (due to the JSON response size limit), so we will limit each search to 3 months at a time.

* The resulting JSON data will be stored in our S3 bucket at s3://s3-avalanche-guard/data/avalanche_center/webscrape/SAC/json/unprocessed

* We will run a Python script to download the images listed in the JSON and flatten their metadata. After processing, the original JSON file will be moved to s3://s3-avalanche-guard/data/avalanche_center/webscrape/SAC/json/processed

>* The downloaded image files will be kept at s3://s3-avalanche-guard/data/avalanche_center/webscrape/Images

>* The corresponding metadata for each image will be kept at s3://s3-avalanche-guard/data/avalanche_center/webscrape/Images-metadata





To access S3 or any other AWS service we need the AWS SDK for Python.
The SDK is composed of two key Python packages: Botocore (the library providing the low-level functionality shared between the Python SDK and the AWS CLI) and Boto3 (the package implementing the Python SDK itself).

In [99]:
import boto3
import random
import shutil
import json
from io import BytesIO, StringIO
import pandas as pd


In [100]:

def move_s3_object(bucket: str, old_key: str, new_key: str) -> None:
    """Move an S3 object to a new key within the same bucket.

    The move is implemented as a copy to ``new_key`` followed by a delete of
    ``old_key``. If the copy raises, the delete is never reached, so the
    source object is left in place.

    Args:
        bucket: Name of the bucket holding both the source and destination.
        old_key: Full key of the existing object.
        new_key: Full key the object should end up under.

    Example:
        move_s3_object('my_bucket', old_key='tmp/test.txt', new_key='tmp/tmp2/test.txt')
    """
    # The dict form keeps bucket and key separate, so a key that happens to
    # contain '?versionId=' cannot be misread as a version suffix the way the
    # 'bucket/key' string form can.
    copy_source = {'Bucket': bucket, 'Key': old_key}
    boto3.resource('s3').Object(bucket, new_key).copy_from(CopySource=copy_source)
    boto3.client('s3').delete_object(Bucket=bucket, Key=old_key)

def s3_move_to_another_folder(bucket: str, old_folder: str, new_folder: str, object_name:str) -> None:
    """Move an object from one folder (key prefix) to another in the same bucket.

    The source and destination keys are formed by plain concatenation of the
    folder and the object name, so both folders must already end with '/'.

    Args:
        bucket: Name of the bucket holding the object.
        old_folder: Key prefix the object currently lives under.
        new_folder: Key prefix the object should be moved to.
        object_name: Object name relative to the folder.

    Example:
        s3_move_to_another_folder('my_bucket', old_folder='tmp/', new_folder='tmp/tmp2/', object_name='test.txt')
    """
    # Delegate to the single-object helper instead of repeating its
    # copy-then-delete sequence here.
    move_s3_object(
        bucket,
        old_key=old_folder + object_name,
        new_key=new_folder + object_name,
    )

    

def print_all_s3_objects(bucket_name:str, prefix:str):
    """Print the key and size of every object under ``prefix`` in the bucket.

    Results are fetched page by page with the ``list_objects_v2`` paginator.
    A page that carries no 'Contents' entry prints a notice instead.
    """
    pages = boto3.client('s3').get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    for page in pages:
        if 'Contents' not in page:
            print("Bucket is empty or does not exist")
            continue
        for entry in page['Contents']:
            print(f'Name: {entry["Key"]} | Size: {entry["Size"]}')


def list_all_s3_objects(bucket_name:str, prefix:str) -> list:
    """Return the keys under ``prefix`` in the bucket, relative to that prefix.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix to list under; it is stripped from each result.

    Returns:
        A list of str, one per object, with the leading ``prefix`` removed.
        An object whose key equals ``prefix`` exactly (which would leave an
        empty name) is omitted.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')

    relative_keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without any matching object carry no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj["Key"]
            # Strip only the leading prefix. str.replace would also delete any
            # later occurrence of the same text inside the key, producing a
            # name that no longer maps back to the real object.
            relative = key[len(prefix):] if key.startswith(prefix) else key
            if relative:
                relative_keys.append(relative)

    return relative_keys
            

    
def send_image_to_s3(url, foldername, filename, bucket_name ):
    """Download the image at ``url`` and upload it to S3.

    Args:
        url: HTTP(S) address of the image to fetch.
        foldername: Destination key prefix, without a trailing '/'.
        filename: Object name to store the image under.
        bucket_name: Destination bucket.

    Returns:
        The S3 key the image was uploaded to (``foldername/filename``).

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Imported locally: this function uses requests, but the notebook's
    # import cell does not bring it into scope.
    import requests

    print(f"sending image {url}")

    # Credentials are resolved by boto3's default configuration.
    s3 = boto3.client('s3')

    # A timeout stops one unresponsive host from hanging the whole run.
    response = requests.get(url, timeout=60)
    # Fail loudly rather than uploading an HTTP error page as an image.
    response.raise_for_status()

    file_name = f'{foldername}/{filename}'
    print('sending {}'.format(file_name))
    s3.upload_fileobj(BytesIO(response.content), bucket_name, file_name)

    return file_name
    
###################################################################

def process_json_download():
    """Process every manually downloaded Avalanche Center JSON file.

    For each JSON file found under the 'unprocessed' prefix, the file is
    read and parsed, handed to ``proc_avalanche_center_json`` together with
    the image and metadata destinations, and then moved to the 'processed'
    prefix.
    """
    bucket = "s3-avalanche-guard"
    unprocessed_prefix = "data/avalanche_center/webscrape/SAC/json/unprocessed/"
    processed_prefix = "data/avalanche_center/webscrape/SAC/json/processed/"
    image_prefix = "data/avalanche_center/webscrape/Images"
    metadata_prefix = "data/avalanche_center/webscrape/Images-metadata"

    s3 = boto3.resource('s3')
    for json_name in list_all_s3_objects(bucket, unprocessed_prefix):
        print(json_name)

        raw_bytes = s3.Object(bucket, unprocessed_prefix + json_name).get()['Body'].read()
        payload = json.loads(raw_bytes.decode('utf-8'))

        proc_avalanche_center_json(
            json_name, payload, bucket, image_prefix, metadata_prefix
        )
        # Only reached when processing did not raise, so a failed file stays
        # under the unprocessed prefix.
        s3_move_to_another_folder(bucket, unprocessed_prefix, processed_prefix, json_name)

    return None

def _flatten_media_record(obs, media):
    """Build one flat metadata row for a single media item of an observation."""
    instability = obs["instability"]
    location = obs["locationPoint"]
    return dict(
        # Observation id plus media id, joined with '-'.
        Media_id=obs["id"] + "-" + str(media["id"]),
        Media_type=media["type"],
        ObservationId=obs["id"],
        Obs_startDate=obs["startDate"],
        Obs_createdAt=obs["createdAt"],
        Obs_source=obs["obsSource"],
        Obs_loc_lat=location["lat"],
        Obs_loc_lng=location["lng"],
        Obs_loc_Name=obs["locationName"],
        Obs_submitterName=obs["name"],
        Obs_route=obs["route"],
        Obs_private=obs["private"],
        Instability_cracking=instability["cracking"],
        Instability_collapsing=instability["collapsing"],
        Avalanches_caught=instability["avalanches_caught"],
        Avalanches_observed=instability["avalanches_observed"],
        Avalanches_triggered=instability["avalanches_triggered"],
        Cracking_description=instability["cracking_description"],
        Collapsing_description=instability["collapsing_description"],
        ObserverType=obs["observerType"],
        Organization=obs["organization"],
        Obs_status=obs["status"],
        Media_Title=media["title"],
        Media_Caption=media["caption"],
        Media_Orig_Url=media["url"]["original"],
    )


def proc_avalanche_center_json(jsonfile, jsondata, bucket_name, folder_name, csv_folder_name):
    """Upload the images referenced by one Avalanche Center JSON and save their metadata.

    Each image media item of each observation is flattened into one metadata
    row and its original image is uploaded to S3 as
    ``{folder_name}/{media_id}.jpg``. Media items whose type is not 'image'
    are skipped. All rows are then written as one CSV to
    ``{csv_folder_name}/{jsonfile}.csv``.

    Args:
        jsonfile: Name of the source JSON file; used to name the CSV.
        jsondata: Parsed JSON; observations are read from
            ``jsondata["data"]["getObservationList"]``.
        bucket_name: Bucket receiving both the images and the CSV.
        folder_name: Key prefix for uploaded images.
        csv_folder_name: Key prefix for the metadata CSV.

    Raises:
        KeyError: If an observation or media item lacks an expected field.
    """
    print(str(jsondata)[:100])
    observation_list = jsondata["data"]["getObservationList"]
    print(len(observation_list))

    # Upper bound on the number of observations handled from a single file.
    MAX_COUNT = 10000

    records = []
    for obs in observation_list[:MAX_COUNT]:
        for media in obs["media"]:
            print(f'MediaType = {media["type"]}')
            if media["type"] != "image":
                print(f'Skipping MediaType = {media["type"]}')
                # Actually skip: falling through would download a non-image
                # and store it under a .jpg name.
                continue

            record = _flatten_media_record(obs, media)
            media_id = record["Media_id"]
            print("***************************************************")
            print(f"Media Id = {media_id}")
            records.append(record)
            send_image_to_s3(record["Media_Orig_Url"], folder_name, media_id + ".jpg", bucket_name)

    # Serialise the rows in memory and upload them as a single CSV object.
    csv_buffer = StringIO()
    pd.DataFrame(records).to_csv(csv_buffer)
    csv_file_name = f"{csv_folder_name}/{jsonfile}.csv"
    boto3.resource('s3').Object(bucket_name, csv_file_name).put(Body=csv_buffer.getvalue())
            
    

###########################################################################################




### Now Process

In [101]:

##print_all_s3_objects(s3_bucket_name,"data/cv/uibk/ResNetClassify/train/negative/")

# Set SKIP to True to re-run this cell without triggering the S3 processing.
SKIP = False
print(SKIP)
if not SKIP:
    process_json_download()
else:
    print('Skipping the Process entirely')

False
SAC_240124_TO_240331.json
{'data': {'getObservationList': [{'id': '035e6998-f107-4d3b-a19d-5300a1ce5623', 'startDate': '2024-0
313
MediaType = image
***************************************************
Media Id = 035e6998-f107-4d3b-a19d-5300a1ce5623-66ce6700-c387-11ee-8959-23bb16b7c4e3
sending image https://avalanche-org-media.s3.us-west-2.amazonaws.com/IMG_0683_65bfd0ee25aa2.jpeg
sending data/avalanche_center/webscrape/Images/035e6998-f107-4d3b-a19d-5300a1ce5623-66ce6700-c387-11ee-8959-23bb16b7c4e3.jpg
MediaType = image
***************************************************
Media Id = 035e6998-f107-4d3b-a19d-5300a1ce5623-67242d16-c387-11ee-a429-834da70d4466
sending image https://avalanche-org-media.s3.us-west-2.amazonaws.com/IMG_0673_65bfd0ee4c58d.jpeg
sending data/avalanche_center/webscrape/Images/035e6998-f107-4d3b-a19d-5300a1ce5623-67242d16-c387-11ee-a429-834da70d4466.jpg
MediaType = image
***************************************************
Media Id = fbfbc17a-753d-4936-b127-999a