<a href="https://colab.research.google.com/github/abdalrahmenyousifMohamed/LLM/blob/main/USCIS_Sitemap_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# USCIS Sitemap Downloader

## Package Downloads and Imports

In [1]:
!wget https://github.com/jgm/pandoc/releases/download/3.1.9/pandoc-3.1.9-1-amd64.deb
!sudo dpkg -i pandoc-3.1.9-1-amd64.deb
!pip install pypandoc

--2023-12-23 13:08:53--  https://github.com/jgm/pandoc/releases/download/3.1.9/pandoc-3.1.9-1-amd64.deb
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/65edb7bd-6b98-46d3-9508-a9ab84f29d30?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20231223%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231223T130853Z&X-Amz-Expires=300&X-Amz-Signature=5a9bcca80848d33ba75f0896f785bd99ea42a6c9ade2933fb4d4ffe0539c93fb&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=571770&response-content-disposition=attachment%3B%20filename%3Dpandoc-3.1.9-1-amd64.deb&response-content-type=application%2Foctet-stream [following]
--2023-12-23 13:08:53--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/65edb7bd-6b98-46d3-9508-a9ab84f29d30?X-Amz-

In [2]:
from bs4 import BeautifulSoup
import requests
import json
import pickle
import pypandoc
from tqdm import tqdm
import pandas as pd
import os

## Download Process

### Sitemap Download

In [3]:
def augment_link(link):
    if link.startswith("/"):
        return "https://www.uscis.gov" + link
    return link

In [4]:
def parse_li_item(li_item):
    link_tag = li_item.find_all("a", recursive=False)[0]
    item_text = link_tag.text.strip()
    item_link = link_tag["href"]
    sub_ul_items = li_item.find_all("ul", recursive=False)
    if len(sub_ul_items) == 0:
        return item_text, augment_link(item_link)
    return {
        "text": item_text,
        "link": augment_link(item_link),
        "sub_list": parse_ul_item(sub_ul_items[0])
    }

In [5]:
def parse_ul_item(ul_item):
    li_items = ul_item.find_all("li", recursive=False)
    return [parse_li_item(item) for item in li_items]

In [6]:
def parse_sitemap_item(sitemap_item):
    top_lv_ul = sitemap_item.find_all("div", recursive=False)[0].find_all("ul", recursive=False)[0]
    return parse_ul_item(top_lv_ul)

In [8]:
uscis_sitemap_url = "https://www.uscis.gov/sitemap"

In [9]:
page = requests.get(uscis_sitemap_url)
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
soup

In [11]:
results = soup.find_all("div", class_="sitemap")
if len(results) != 1:
    raise Exception("This scraper is out of date.")

In [16]:
type(results[0])

bs4.element.Tag

In [17]:
sitemap_items = results[0].find_all("div", class_="sitemap-item", recursive=False)
sitemap_data = [parse_sitemap_item(item) for item in sitemap_items]

In [None]:
sitemap_data

In [19]:
with open("sitemap.json", "w") as f:
    f.write(json.dumps(sitemap_data))

### Sitemap Pages Download

In [20]:
with open("sitemap.json", "r") as f:
    sitemap_data = json.loads(f.read())

In [21]:
def cleanup_soup(link, page, md=True):
    soup = BeautifulSoup(page.content, "html.parser")

    article = soup.find("article", class_="node")
    if article is None:
        return [link, None]

    if not md:
        return [link, article.text]

    decompose_list = []
    for tag in article.find_all("div", class_="accordion__header"):
        new_tag = soup.new_tag("h3")
        new_tag.string = tag.text.strip()
        tag.replaceWith(new_tag)
    for tag in article.find_all(True):
        new_attrs = {}
        if "src" in tag.attrs:
            new_attrs["src"] = augment_link(tag.attrs["src"])
        if 'href' in tag.attrs:
            tag_link = augment_link(tag.attrs["href"])
            if tag_link == "#":
                decompose_list.append(tag)
            new_attrs["href"] = tag_link
        tag.attrs = new_attrs
    for tag in decompose_list:
        try:
            tag.decompose()
        except Exception as e:
            pass
    cleaned_html = str(article)
    md = pypandoc.convert_text(cleaned_html, 'md', format='html')
    return [link, md]

In [22]:
def download_link(link):
    cur_page = requests.get(link)
    if cur_page is None:
        return [link, None]
    return cleanup_soup(link, cur_page)

In [29]:
def unwrap_list(sitemap_list):
    output = []
    for item in sitemap_list:
        if type(item) is list:
            output.append(item)
        else:
            output.append([item["text"], item["link"]])
            output += unwrap_list(item["sub_list"])
    return output

In [None]:
(sitemap_data[0][0])

In [31]:
unwrapped = unwrap_list(sitemap_data[0])

In [36]:
unwrapped

[['About Us', 'https://www.uscis.gov/about-us'],
 ['Mission and Core Values',
  'https://www.uscis.gov/about-us/mission-and-core-values'],
 ['What We Do',
  'https://www.uscis.gov/about-us/mission-and-core-values/what-we-do'],
 ['Organization', 'https://www.uscis.gov/about-us/organization/organization'],
 ['Leadership', 'https://www.uscis.gov/about-us/organization/leadership'],
 ['Directorates and Program Offices',
  'https://www.uscis.gov/about-us/organization/directorates-and-program-offices'],
 ['Administrative Appeals Office (AAO)',
  'https://www.uscis.gov/about-us/organization/directorates-and-program-offices/the-administrative-appeals-office-aao'],
 ['Office of Performance and Quality',
  'https://www.uscis.gov/about-us/organization/directorates-and-program-offices/office-of-performance-and-quality'],
 ['External Affairs Directorate',
  'https://www.uscis.gov/about-us/organization/directorates-and-program-offices/external-affairs-directorate'],
 ['Office of Citizenship',
  'http

In [38]:
data = []
for item in tqdm(unwrapped):
    link_text = item[0]
    link_url = item[1]
    try:
        data.append(download_link(link_url))
    except Exception as e:
        print(link_text, link_url)
        raise e

100%|██████████| 564/564 [06:38<00:00,  1.42it/s]


In [None]:
data

In [39]:
sitemap_download = {item_text: item_data for item_text, item_data in data}

In [40]:
with open("sitemap_data.pkl", "wb") as pickle_file:
    pickle.dump(sitemap_download, pickle_file)

## Save in ZIP

In [41]:
with open("sitemap_data.pkl", "rb") as pickle_file:
    sitemap_download = pickle.load(pickle_file)

In [42]:
def make_path(path):
    isExist = os.path.exists(path)
    if not isExist:
        os.makedirs(path)

In [43]:
def populate_dir(path, item_list):
    for item in item_list:
        if type(item) is dict:
            text = item["text"]
            link = item["link"]
            dir = path + text + "/"
            make_path(dir)
            populate_dir(dir, item["sub_list"])
        else:
            text = item[0]
            link = item[1]
        file_name_noext = text
        file_name_noext = "".join(x for x in file_name_noext if x.isalnum() or x == " ")
        with open(path + file_name_noext + ".md", "w") as mdfile:
            cur_data = sitemap_download[link]
            if cur_data is None:
                cur_data = "Unable to parse"
            mdfile.write(cur_data)

In [44]:
base_dir = "uscis/"
make_path(base_dir)
populate_dir(base_dir, sitemap_data[0])

In [None]:
sitemap_data[0]

In [None]:
!zip -r uscis.zip -r uscis/

In [None]:
unwrapped

In [49]:
sitemap_download[0]

KeyError: ignored

In [51]:
site_df_raw = {
    "heading": [],
    "link": [],
    "text": []
}
for item in unwrapped:
    site_df_raw["link"].append(item[1])
    site_df_raw["heading"].append(item[0])
    site_df_raw["text"].append(sitemap_download[item[1]])

In [52]:
df = pd.DataFrame.from_dict(site_df_raw)

In [53]:
df.to_csv("uscis.csv")

In [54]:
df.head()

Unnamed: 0,heading,link,text
0,About Us,https://www.uscis.gov/about-us,<div>\n\n# About Us\n\n<div>\n\n## Topics\n\n:...
1,Mission and Core Values,https://www.uscis.gov/about-us/mission-and-cor...,<div>\n\n# Mission and Core Values\n\n<div>\n\...
2,What We Do,https://www.uscis.gov/about-us/mission-and-cor...,<div>\n\n# What We Do\n\n<div>\n\n<div>\n\nU.S...
3,Organization,https://www.uscis.gov/about-us/organization/or...,<div>\n\n# Organization\n\n<div>\n\n<div>\n\nT...
4,Leadership,https://www.uscis.gov/about-us/organization/le...,<div>\n\n# Leadership\n\n<div>\n\n<div>\n\n- ...
