***
# Notebook for scraping popular baby names in the US from the US Social Security Administration website
***

# Importing Libraries

In [9]:
import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

# Getting the URL of the website

In [232]:
# Address of the SSA names-by-state page that the scraper loads.
url = "https://www.ssa.gov/cgi-bin/namesbystate.cgi"

### Defining a helper function for extracting state names from the page

In [250]:
def state_extractor(soup):
    """Collect the text of every ``<option>`` nested inside a ``<form>``.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)
        The page to search.

    Returns
    -------
    list
        The ``.text`` of each option, in the order the forms and their
        options are returned by ``find_all``. Empty when no form is found.
    """
    return [
        choice.text
        for form in soup.find_all('form')
        for choice in form.find_all('option')
    ]

In [252]:
# Download and parse the landing page first so that `soup` is defined on a
# fresh kernel; previously this cell relied on a `soup` left over from an
# earlier, no longer present, cell.
landing_page = requests.get(url, timeout=30)
landing_page.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(landing_page.text, 'html.parser')

# Option labels of the page's form(s); these drive the state loop further down.
states = state_extractor(soup)

### Defining a helper function for extracting names, gender and numbers from the page

In [282]:
def name_extractor(soup, year, state):
    """Turn the nested result tables of a page into a long-format DataFrame.

    For every ``<table>`` in ``soup``, the nested tables after the first one
    are scanned; in each of those, every ``<tr>`` after the first is split on
    newlines and its first five fields are read as
    rank, male name, male number, female name, female number.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)
    year : value stored in the ``year`` column of every returned row
    state : value stored in the ``state`` column of every returned row

    Returns
    -------
    pandas.DataFrame
        Columns ``rank, name, num, gender, year, state``. All male rows come
        first, followed by all female rows; each half keeps its own 0-based
        index, so index labels repeat. Scraped values are kept as strings.

    Raises
    ------
    IndexError
        If a scanned row splits into fewer than five fields.
    """
    # One list of newline-separated fields per data row.
    rows = [
        row.text.split('\n')
        for outer in soup.find_all('table')
        for inner in outer.find_all('table')[1:]   # skip the first nested table
        for row in inner.find_all('tr')[1:]        # skip the first row (presumably the header)
    ]

    def _gender_frame(name_idx, num_idx, label):
        # Build the frame for one gender from the chosen field positions.
        return pd.DataFrame({
            'rank': [fields[0] for fields in rows],
            'name': [fields[name_idx] for fields in rows],
            'num': [fields[num_idx] for fields in rows],
            'gender': [label] * len(rows),
        })

    combined = pd.concat([
        _gender_frame(1, 2, 'male'),
        _gender_frame(3, 4, 'female'),
    ])
    return combined.assign(year=year, state=state)

# Scraping the Web Page

In [286]:
# Location of the chromedriver binary. This is machine-specific: adjust it
# before running the notebook on another computer.
CHROMEDRIVER_PATH = '/Users/anirudhchandra/AnacondaProjects/WebDriver/chromedriver'

# Years to query, as strings because they are typed into the form.
years = [str(year) for year in range(2000, 2019)]

# Per-page frames are collected here and concatenated once at the end
# (concatenating inside the loop re-copies all earlier rows every iteration).
# The empty seed frame keeps the leading column order of the final result.
frames = [pd.DataFrame([], columns=['rank','name','num','year','state'])]

browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
try:
    browser.get(url)

    for year in years:
        for state in states:

            # Select the state in the drop-down by its visible label
            state_path = "//select[@name='state']/option[text()='" + state + "']"
            browser.find_element(By.XPATH, state_path).click()

            # Locate the year entry field, clear it and type the new year
            year_entry = browser.find_element(By.ID, 'year')
            year_entry.clear()
            year_entry.send_keys(year)

            # Click the submit ('GO') button to run the search
            browser.find_element(By.XPATH, "//input[@type='submit']").click()

            # Parse the resulting page for names.
            # NOTE(review): no parser is named, so BeautifulSoup picks whichever
            # is installed; pin one once the expected table layout is confirmed.
            soup = BeautifulSoup(browser.page_source)
            frames.append(name_extractor(soup, year, state))
finally:
    # Always shut the browser down, even when a lookup or click fails midway.
    browser.quit()

result = pd.concat(frames, axis=0)

# Results of Scraped Information

In [287]:
# Print a summary of the combined frame: entry count, columns, non-null counts and dtypes.
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193800 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   rank    193800 non-null  object
 1   name    193800 non-null  object
 2   num     193800 non-null  object
 3   year    193800 non-null  object
 4   state   193800 non-null  object
 5   gender  193800 non-null  object
dtypes: object(6)
memory usage: 10.4+ MB


### Exporting the results as a csv file

In [289]:
# Write the combined results to usa_top_names.csv (path is relative to the working directory).
# NOTE(review): the index is written as the first column (to_csv default) and its
# labels repeat across the concatenated frames — confirm whether downstream
# readers rely on that column before switching to index=False.
result.to_csv('usa_top_names.csv')

***