In [34]:
import requests
import argparse
import time
import json
from io import StringIO
import gzip
import csv
import codecs
from bs4 import BeautifulSoup
import sys
from io import BytesIO

In [35]:
index_list = ["2019-04","2019-09","2019-13"]
domain= "https://www.uscis.gov/news/all-news"

In [36]:
def search_domain(domain):

    record_list = []
    
    print ("[*] Trying target domain: %s" % domain)
    
    for index in index_list:
        
        print("[*] Trying index %s" % index)
        
        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain
        
        response = requests.get(cc_url)
        
        if response.status_code == 200:
            
            records = response.content.splitlines()
            
            for record in records:
                record_list.append(json.loads(record))
            
            print ("[*] Added %d results." % len(records))
            
    
    print ("[*] Found a total of %d hits." % len(record_list))
    
    return record_list       

In [37]:
def download_page(record):

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    
    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    
    # The page is stored compressed (gzip) to save space
    # We can extract it using the GZIP library
    raw_data =BytesIO(resp.content)
    f = gzip.GzipFile(fileobj=raw_data)
    
    # What we have now is just the WARC response, formatted:
    data = f.read()
    return data


In [38]:
def extract_external_links(html_content,link_list):

    parser = BeautifulSoup(html_content)
        
    links = parser.find_all("a")
    
    if links:
        
        for link in links:
            href = link.attrs.get("href")
            
            if href is not None:
                
                if domain not in href:
                    if href not in link_list and href.startswith("http"):
                        print("[*] Discovered external link: %s" % href)
                        link_list.append(href)

    return link_list

In [39]:
record_list = search_domain(domain)
link_list   = []
loop= 0
for record in record_list:
    
    html_content = download_page(record)
    loop+=1
    #print("[*] Retrieved %d bytes for %s" % (len(html_content),record['url']))
    if loop==200:
        break
    link_list = extract_external_links(html_content,link_list)
    

print ("[*] Total external links discovered: %d" % len(link_list))



[*] Trying target domain: https://www.uscis.gov/news/all-news
[*] Trying index 2019-04
[*] Added 8122 results.
[*] Trying index 2019-09
[*] Added 8667 results.
[*] Trying index 2019-13
[*] Added 10343 results.
[*] Found a total of 27132 hits.
[*] Discovered external link: https://www.uscis.gov/
[*] Discovered external link: https://www.facebook.com/uscis
[*] Discovered external link: https://www.twitter.com/uscis
[*] Discovered external link: https://www.instagram.com/uscis
[*] Discovered external link: https://www.youtube.com/uscis
[*] Discovered external link: https://public.govdelivery.com/accounts/USDHSCIS/subscriber/new
[*] Discovered external link: https://www.linkedin.com/company/uscis
[*] Discovered external link: https://my.uscis.gov/account/onboarding
[*] Discovered external link: https://egov.uscis.gov/formsbymail/
[*] Discovered external link: http://www.cbp.gov/travel/international-visitors/i-94-instructions
[*] Discovered external link: http://travel.state.gov/passport/pa

[*] Discovered external link: https://help.usajobs.gov/index.php/Main_Page
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/files/FY18/FY18NOFO_CINAS.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/files/FY18/FY18NOFO_Refugee_and_Asylee.pdf
[*] Discovered external link: http://www.uscis.gov/grants
[*] Discovered external link: http://www.grants.gov
[*] Discovered external link: https://www.uscis.gov/about-us/leadership
[*] Discovered external link: http://www.uscis.gov/i-290b
[*] Discovered external link: https://egov.uscis.gov/coa/displayCOAForm.do
[*] Discovered external link: https://www.uscis.gov/about-us/directorates-and-program-offices/administrative-appeals-office-aao/administrative-appeals-forms
[*] Discovered external link: https://www.uscis.gov/about-us/directorates-and-program-offices/administrative-appeals-office-aao/contact-us
[*] Discovered external link: https://www.uscis.gov/about-us/directorates-and-program

[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/About%20Us/Electronic%20Reading%20Room/Applicant%20Service%20Reference%20Guide/File_Forms.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/About%20Us/Electronic%20Reading%20Room/Applicant%20Service%20Reference%20Guide/Forms_Orders.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/About%20Us/Electronic%20Reading%20Room/Applicant%20Service%20Reference%20Guide/US_Citizens.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/About%20Us/Electronic%20Reading%20Room/Applicant%20Service%20Reference%20Guide/Permanent-Residents4-3.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/USCIS/About%20Us/Electronic%20Reading%20Room/Applicant%20Service%20Reference%20Guide/Nonimmigrant_Services.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Asylees_Refugees.pdf

[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/New_England_Center_for_Business_Development_-_Senator_Collins.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Unlawful_presence_-_Senator_Murray.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/O_Visa_Program_-_Chairman_Grassley.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/H-2B_2018_Visa_cap_-_Representative_Rothfus_10-16-2018.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/H-4_Rule_-_Senator_Harris.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Public_Charge_Rule_-_Representative_Blake.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/U.S._Refugee_Admissions_Program_USRAP_-_Chairman_Grassley.pdf
[

[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Central_American_Minors_CAM_refugee_program_-_Canny.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Asylum_Workload_Statistics_for_Approval_Rates_and_Credible_Fear_Found_Rates_FY17_-_FY18_Q3.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_El_Salvador_Honduras_and_Haiti_-_Representative_King.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/N-400_processing_times_-_Representative_Lofgren.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/STEM_OPT_employers_roles_and_responsibilities_-_Filipiak.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Somalia_-_Representative_Ellison.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/file

[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Nepal_-_Mayor_de_Blasio.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Honduras_Nicaragua_and_Haiti_-_Bussey.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_-_Darkangelo.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_El_Salvador_Haiti_Honduras_Nepal_Nicaragua_Somalia_Sudan_South_Sudan_Syria_and_Yemen_-_Mark-Viverito.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Honduras_-_Mayor_Walsh.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Honduras_-_Sister_Kvale.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/CFGI_Enforcement_Panel_Transcript_11_June_2018.pdf
[*] Discovered exter

[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/H-2B_Visa_cap_-_Representative_Visclosky.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Encore_Boston_Regional_Center.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Encore_S._CA_Regional_Center_LLC.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Encore_Washington-Oregon_Regional_Center_LLC.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/Home_Paradise_Texas_Regional_Center.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/SAA_Cedisus_EB-5_Projects_-_SW_Indiana_Regional_Center_LLC.pdf
[*] Discovered external link: https://www.uscis.gov/sites/default/files/files/nativedocuments/TPS_Nepal_-_Representative_Crowley.pdf
[*] Discovered external link: https

[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/washington-seattle-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/washington-spokane-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/washington-yakima-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/oregon-portland-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/alaska-anchorage-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/california-san-francisco-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/california-san-jose-field-office
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/california-los-angeles-field-of

[*] Discovered external link: http://maps.apple.com/?daddr=790 Sandhill Road,Reno,Nevada,89521
[*] Discovered external link: https://www.google.com/maps/search/parking/@39.445361,-119.757511,14z
[*] Discovered external link: http://maps.apple.com/?daddr=9 Ridgewood Road,Bedford,New Hampshire,03110
[*] Discovered external link: https://www.google.com/maps/search/parking/@42.960228,-71.479954,14z
[*] Discovered external link: http://maps.apple.com/?daddr=Peter Rodino Federal Building, 970 Broad Street,Newark,New Jersey,07102
[*] Discovered external link: https://www.google.com/maps/search/parking/@40.734152,-74.173891,14z
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/field-offices/district-4-newark-district-office
[*] Discovered external link: http://maps.apple.com/?daddr=Jacob Javits Federal Building, 26 Federal Plaza, 8th Floor, Room 8-800,New York,New York,10278
[*] Discovered external link: https://www.google.com/maps/search/parking/@40.714643,-74.004

[*] Discovered external link: https://gr.usembassy.gov/holiday-calendar/
[*] Discovered external link: http://www.unhcr.org/cgi-bin/texis/vtx/contact?iso=GRC
[*] Discovered external link: http://travel.state.gov/content/adoptionsabroad/en/country-information/learn-about-a-country/greece.html
[*] Discovered external link: https://gr.usembassy.gov/visas/
[*] Discovered external link: https://www.uscis.gov/about-us/find-uscis-office/international-offices/contact-us-uscis-nairobi-field-office
[*] Discovered external link: http://www.unhcr.org/cgi-bin/texis/vtx/contact?iso=KEN
[*] Discovered external link: http://travel.state.gov/content/adoptionsabroad/en/country-information/learn-about-a-country/kenya.html
[*] Discovered external link: https://travel.state.gov/content/travel/en/Intercountry-Adoption/Intercountry-Adoption-Country-Information/Kenya.html
[*] Discovered external link: https://ke.usembassy.gov/visas/
[*] Discovered external link: http://www.uscis.gov/about-us/find-uscis-office

In [42]:
with codecs.open("common_crawl/uscis.csv","wb",encoding="utf-8") as output:

    fields = ["URL"]
    
    logger = csv.DictWriter(output,fieldnames=fields)
    logger.writeheader()
    
    for link in link_list:
        logger.writerow({"URL":link})
        

In [41]:
len(link_list)

711

## As we have stated in the read me document,extracting data from common crawl was extremely challenging. Hence, we took the help of a blog post src:https://www.bellingcat.com/resources/2015/08/13/using-python-to-mine-common-crawl/. We obviously modified the code to fit our requirement.