# Naval Vessel Register (NVR) Crawl & Scrape
Useful tutorial: https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL  

Crawl and scrape the Navy's ship registry for current and historical ship info: http://www.nvr.navy.mil  

In [14]:
import requests
from bs4 import BeautifulSoup
import json
import datetime
import time
import pandas as pd

In [2]:
# Site root, and the hull-list index page the crawl starts from.
base_url = 'http://www.nvr.navy.mil/'
start_url = base_url + 'QUICKFIND/HULLLIST_SHIPS.HTML'

In [3]:
def nvr_links(url, target_string, timeout=30):
    """Return every link on a page whose href contains a given substring.

    Args:
        url (str): URL of the page to pull html links from.
        target_string (str): Substring an href must contain to be kept.
        timeout (float): Seconds to wait on the server before requests
            gives up and raises a timeout error. Defaults to 30.

    Returns:
        list: Matching href values in document order, exactly as written
            in the page (relative links are not resolved).

    """
    response = requests.get(url, timeout=timeout)
    # Pull the raw html and parse it into a searchable tree.
    soup = BeautifulSoup(response.text, 'lxml')

    # href=True skips anchors that have no href attribute at all, so the
    # substring test never sees None and no blanket except is needed.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if target_string in anchor['href']
    ]


In [4]:
# Collect the per-category hull pages linked from the start page.
top_level_links = nvr_links(url=start_url, target_string='/NVRSHIPS/HULL_')

In [21]:
# Display the collected links (still relative, starting with '../').
top_level_links

['../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CV_90.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CVA_91.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CVHE_94.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CVN_95.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CVS_96.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_BB_82.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CA_83.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CC_84.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CG_85.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CGN_86.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CL_87.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_CLG_88.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_DD_98.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_DDG_99.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_DE_102.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_DER_104.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_DL_105.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_EDD_111.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEGORY_EDDG_112.HTML',
 '../NVRSHIPS/HULL_SHIPS_BY_CATEG

In [5]:
# Element ids on a ship detail page share this prefix/suffix.
_NVR_ID_TEMPLATE = 'MainContent_Repeater1_{}_0'

# (output key, html tag, element-id stem, occurrence) for every scraped field,
# in output column order. The page reuses the DeliveryDate and
# lblInServiceDate ids for a second element, hence the occurrence index.
_NVR_FIELDS = (
    ('class', 'span', 'PrototypeClassNumber', 0),
    ('uic', 'span', 'UIC', 0),
    ('status', 'a', 'HyperLink3', 0),
    ('fleet', 'span', 'Fleet', 0),
    ('date_status_change', 'span', 'DateStatusChanged', 0),
    ('homeport', 'span', 'Homeport', 0),
    ('maintenance_category', 'span', 'rfc', 0),
    ('berth', 'span', 'BerthName', 0),
    ('force', 'a', 'Force', 0),
    ('builder', 'span', 'builder', 0),
    ('award_date', 'span', 'AwardDate', 0),
    ('commission_date', 'span', 'CommissionDate', 0),
    ('keel_date', 'span', 'KeelDate', 0),
    ('inactivation_date', 'span', 'InactivationDate', 0),
    ('launch_date', 'span', 'LaunchDate', 0),
    ('decommission_date', 'span', 'DecommissionDate', 0),
    ('age_since_launch', 'span', 'LaunchAge', 0),
    ('years_commission_decommission', 'span', 'YearsOfService', 0),
    ('delivery_date', 'span', 'DeliveryDate', 0),
    ('in-service_date', 'span', 'lblInServiceDate', 0),
    ('age_since_delivery', 'span', 'DeliveryDate', 1),
    ('out_of_service_date', 'span', 'lblInServiceDate', 1),
    ('stricken_date', 'span', 'StrickenDate', 0),
    ('overall_length', 'span', 'OverallLength', 0),
    ('waterline_length', 'span', 'WaterlineLength', 0),
    ('extreme_beam', 'span', 'ExtremeBeam', 0),
    ('waterline_beam', 'span', 'WaterlineBeam', 0),
    ('max_navigational_draft', 'span', 'MaxNavigationalDraft', 0),
    ('draft_limit', 'span', 'FullLoadDraft', 0),
    ('light_displacement', 'span', 'LightDisplacement', 0),
    ('full_displacement', 'span', 'FullDisplacement', 0),
    ('dead_weight', 'span', 'DeadWeight', 0),
    ('hull material', 'span', 'hullMaterial', 0),
    ('num_propellers', 'span', 'NumberOfPropellers', 0),
    ('num_waterjet', 'span', 'NumberOfWaterJets', 0),
    ('propulsion_type', 'span', 'PropulsionName', 0),
    ('officer_accom', 'span', 'NumberOfOfficers', 0),
    ('enlisted_accom', 'span', 'NumberOfEnlisted', 0),
    ('custodian', 'span', 'CustodianName', 0),
    ('planning_yard', 'span', 'PlanningYardName', 0),
    ('nuclear_planning_yard', 'span', 'NukePlanningYardName', 0),
    ('ship_program_mgr', 'span', 'shapmName', 0),
    ('comments', 'span', 'ExternalComments', 0),
    ('last_updated', 'span', 'ModifiedDate', 0),
)


def _nvr_field_text(soup, tag, element_id, occurrence=0):
    """Return the text of the occurrence-th matching element, or None if absent."""
    matches = soup.find_all(tag, {'id': element_id})
    if occurrence < len(matches):
        return matches[occurrence].get_text()
    return None


def nvm_scraper(url, timeout=30):
    """Return a dictionary of info for the requested ship detail page.

    Args:
        url (str): URL to scrape.
        timeout (float): Seconds to wait on the server before requests
            gives up and raises a timeout error. Defaults to 30.

    Returns:
        dict: Single-entry dict with key = ship name and value = dict of
            ship info. A field whose element is not on the page is None
            rather than aborting the whole scrape.

    Raises:
        ValueError: If the page has no 'ShipName' cell, i.e. it does not
            look like a ship detail page.

    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    name_cell = soup.find('td', {'class': 'ShipName'})
    if name_cell is None:
        # Name the offending URL instead of failing with an opaque
        # "'NoneType' object has no attribute 'get_text'".
        raise ValueError('No ShipName cell found at {}'.format(url))
    ship_name = name_cell.get_text()

    details = {}
    for key, tag, id_stem, occurrence in _NVR_FIELDS:
        element_id = _NVR_ID_TEMPLATE.format(id_stem)
        details[key] = _nvr_field_text(soup, tag, element_id, occurrence)

    return {ship_name: details}

# Test Single Ship

In [19]:
# Smoke test: scrape one ship detail page and show it as a table.
# orient='index' makes the ship name the row label and each field a column.
pd.DataFrame.from_dict(nvm_scraper('http://www.nvr.navy.mil/SHIPDETAILS/SHIPSDETAIL_CVN_76_5300.HTML'), orient='index')

Unnamed: 0,class,uic,status,fleet,date_status_change,homeport,maintenance_category,berth,force,builder,...,num_waterjet,propulsion_type,officer_accom,enlisted_accom,custodian,planning_yard,nuclear_planning_yard,ship_program_mgr,comments,last_updated
USS RONALD REAGAN (CVN 76),CVN 68,22178,"Active, in commission",U.S. PACIFIC FLEET,07/12/2003,"YOKOSUKA, JAPAN",,,Battle Force,Newport News Shipbuilding,...,,Steam Turbines (Nuclear),200,6075,US NAVY,Norfolk Naval Shipyard,"Newport News Shipbuilding and Drydock Co., New...",PMS 312,,06/27/2018


In [12]:
# Main scraping loop.
# Requires top_level_links above.
# Walks every hull-category page, follows each ship detail link on it and
# accumulates the scraped records in ship_info (keyed by ship name).

ship_info = {}
count = 0
start_time = time.time()

for top_link in top_level_links:
    # Category links are relative ('../NVRSHIPS/...'); anchor them to the site root.
    clean_link = top_link.replace('../', base_url)
    second_level_links = nvr_links(clean_link, 'SHIPDETAILS')

    # Go to each link.
    for second_link in second_level_links:
        # Rewrite the backslash-separated relative prefix into an absolute URL.
        clean_second_link = second_link.replace('..\\SHIPDETAILS\\', 'http://www.nvr.navy.mil/SHIPDETAILS/')
        scraped_info = nvm_scraper(clean_second_link)  # dict
        ship_info.update(scraped_info)  # Merges dict

        # Progress report every 100 ships.
        count += 1
        if count % 100 == 0:
            print(count)
            print('{:.2f} min elapsed'.format((time.time() - start_time) / 60))

        # Take a break to not hammer the site.
        time.sleep(1)

# Report once, after every category has been crawled (not once per category).
print('Completed download of {} records in {:.2f} minutes!'.format(count, (time.time() - start_time) / 60))

100
2.59 min elapsed
200
5.33 min elapsed
300
7.95 min elapsed
400
10.56 min elapsed
500
13.28 min elapsed
600
16.08 min elapsed
700
18.62 min elapsed
800
21.37 min elapsed
900
24.02 min elapsed
1000
26.58 min elapsed
1100
29.37 min elapsed
1200
31.97 min elapsed
1300
34.59 min elapsed
1400
37.25 min elapsed
1500
40.09 min elapsed
1600
42.69 min elapsed
1700
45.34 min elapsed
1800
48.03 min elapsed
1900
50.64 min elapsed
2000
53.31 min elapsed
2100
56.21 min elapsed
2200
58.78 min elapsed
2300
61.36 min elapsed
2400
64.21 min elapsed
2500
66.85 min elapsed
2600
69.47 min elapsed


In [17]:
# Write the scraped records to a csv file stamped with the current time.
current_datetime = datetime.datetime.now()
output_name = ''.join(['ship_list_', current_datetime.strftime("%Y-%m-%d_%H-%M"), '.csv'])

# One row per ship; the dict keys (ship names) become the 'ship_name' column.
pd.DataFrame.from_dict(ship_info, orient='index').to_csv(output_name, index_label='ship_name')