# Masters In Data Science

In [1]:
# Import Dependencies
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
from sqlalchemy import create_engine
from config import username, password
from datetime import datetime as dt

## State Data
### https://www.mastersindatascience.org/schools/

### Extract

In [2]:
# Ctrl + F "Data Science Graduate Degrees by State" to see the card breakdown on the above page
# Use Requests to call the url and BeautifulSoup to get the state data elements
url = "https://www.mastersindatascience.org/schools/"
# A timeout keeps the notebook from hanging forever on an unresponsive server
page = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
page.raise_for_status()
response = page.text  # raw HTML of the schools page
soup = bs(response, "html.parser")

In [3]:
# Every state card on the page is a <div class="col-md-6">, so that class is the selector
card_list = soup.find_all("div", class_="col-md-6")
# Sanity check: show the first text line of the first and the last card
for sample_card in (card_list[0], card_list[-1]):
    print(sample_card.text.strip().split("\n")[0])

Alabama
District of Columbia


In [4]:
# Iterate through each state card and store the information from each section.
# Ordered (keyword to look for in the entry text, output column) pairs for the
# per-program counts; the order mirrors the original check order.
PROGRAM_COUNT_COLUMNS = (
    ("Business Analytics", "Business Analytics"),
    ("Computer Science", "Computer Science"),
    ("Geospatial Science", "Geospatial Science"),
    ("Informatics", "Health Informatics"),
    ("Certificate", "Certificate"),
    ("Doctorate", "Doctorate"),
)

# Computed once so every row shares the same date and `today` is defined even
# when no cards were found (the load step builds the table name from it).
today = dt.today().strftime("%Y-%m-%d")

state_data = []
for card in card_list:
    # Every field is reset for each card so that a state missing an entry can
    # never inherit the value scraped for the previous state.
    item = {
        "Datetime": today,
        "State": card.find("h3").text,
        # only captures the int, not the full string (i.e. 5, not "5 Schools")
        "School Count": int(card.find("p").text.split(" ")[0]),
        "Business Analytics": 0,
        "Computer Science": 0,
        "Geospatial Science": 0,
        "Health Informatics": 0,
        "Certificate": 0,
        "Doctorate": 0,
        "Annual Mean Wage": "",
        "Projected Job Change": "",
    }

    # capture the number of programs in each type
    for program in card.find_all("div", class_="dept"):
        entry_text = program.text
        for keyword, column in PROGRAM_COUNT_COLUMNS:
            if keyword in entry_text:
                # NOTE: the count is kept as the text before the first ":" in the
                # <strong> tag (a str), while the default for a missing program is int 0.
                item[column] = program.strong.text.split(":")[0]
                break
        else:
            # Not a program count: either one of the two statistics or something new
            if "Annual Mean Wage" in entry_text:
                item["Annual Mean Wage"] = entry_text
            elif "Projected Job" in entry_text:
                item["Projected Job Change"] = entry_text.split(": ")[1]
            else:
                print(f"Program Not Found:\n{program}") # Alert if a new program is found

    state_data.append(item)

### Transform

In [5]:
# One row per state, one column per scraped field
# (BA, CS, Geospatial, Health Informatics, Certificate, Doctorate, etc.)
state_data_df = pd.DataFrame(state_data)
# Key the frame by state name for the remaining transform steps
state_data_updated = state_data_df.set_index(keys="State")
state_data_updated.head(n=5)

Unnamed: 0_level_0,Datetime,School Count,Business Analytics,Computer Science,Geospatial Science,Health Informatics,Certificate,Doctorate,Annual Mean Wage,Projected Job Change
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,2021-02-23,5,7,1,1,0,1,0,"Annual Mean Wage: $138,510 yearly",20.7%
Arizona,2021-02-23,3,6,4,1,1,1,1,"Annual Mean Wage: $111,510 yearly",17.2%
Arkansas,2021-02-23,2,2,0,0,1,1,1,"Annual Mean Wage: $95,220 yearly",
California,2021-02-23,25,29,7,5,7,9,5,"Annual Mean Wage: $136,310 yearly",22%
Colorado,2021-02-23,8,12,5,1,1,3,1,"Annual Mean Wage: $121,180 yearly",22.6%


In [6]:
# Strip the "Annual Mean Wage:" label from the wage column so that only the
# amount text (e.g. "$138,510 yearly") remains. The value is still a string
# after this cell; no numeric conversion happens here.
wage_without_label = (
    state_data_updated['Annual Mean Wage']
    # literal (non-regex) match, so the result does not depend on pandas' default for `regex`
    .str.replace('Annual Mean Wage:', '', regex=False)
    # removing the label leaves the space that followed the colon
    .str.strip()
)
# Drop-then-assign returns a new frame (the input frame is left untouched) and
# keeps the cleaned wage as the last column.
state_data_drop = (
    state_data_updated
    .drop(columns='Annual Mean Wage')
    .assign(**{'Annual Mean Wage': wage_without_label})
)
state_data_drop.head()

Unnamed: 0_level_0,Datetime,School Count,Business Analytics,Computer Science,Geospatial Science,Health Informatics,Certificate,Doctorate,Projected Job Change,Annual Mean Wage
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,2021-02-23,5,7,1,1,0,1,0,20.7%,"$138,510 yearly"
Arizona,2021-02-23,3,6,4,1,1,1,1,17.2%,"$111,510 yearly"
Arkansas,2021-02-23,2,2,0,0,1,1,1,,"$95,220 yearly"
California,2021-02-23,25,29,7,5,7,9,5,22%,"$136,310 yearly"
Colorado,2021-02-23,8,12,5,1,1,3,1,22.6%,"$121,180 yearly"


In [7]:
# Remove the "yearly" suffix from the wage column, leaving just the amount text
# (e.g. "$138,510"). The value remains a string.
wage_amount = (
    state_data_drop['Annual Mean Wage']
    # literal (non-regex) match, so the result does not depend on pandas' default for `regex`
    .str.replace('yearly', '', regex=False)
    # drop the whitespace left around the amount once the suffix is gone
    .str.strip()
)
# Drop-then-assign returns a new frame and keeps the wage as the last column
state_data_final = (
    state_data_drop
    .drop(columns='Annual Mean Wage')
    .assign(**{'Annual Mean Wage': wage_amount})
)
state_data_final.head()

Unnamed: 0_level_0,Datetime,School Count,Business Analytics,Computer Science,Geospatial Science,Health Informatics,Certificate,Doctorate,Projected Job Change,Annual Mean Wage
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,2021-02-23,5,7,1,1,0,1,0,20.7%,"$138,510"
Arizona,2021-02-23,3,6,4,1,1,1,1,17.2%,"$111,510"
Arkansas,2021-02-23,2,2,0,0,1,1,1,,"$95,220"
California,2021-02-23,25,29,7,5,7,9,5,22%,"$136,310"
Colorado,2021-02-23,8,12,5,1,1,3,1,22.6%,"$121,180"


In [8]:
# Display (without removing) every row that has at least one missing (NA) value.
# An empty result means no NA values were detected.
rows_with_na = state_data_final.isna().any(axis=1)
state_data_final[rows_with_na]

Unnamed: 0_level_0,Datetime,School Count,Business Analytics,Computer Science,Geospatial Science,Health Informatics,Certificate,Doctorate,Projected Job Change,Annual Mean Wage
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


### Load

In [9]:
# Build the PostgreSQL connection URL from the credentials imported from config,
# then open a connection to the local datascience_db database
db_url = f'postgresql://{username}:{password}@localhost/datascience_db'
engine = create_engine(db_url)
conn = engine.connect()

In [10]:
# Load the dataframe into PostgreSQL if the table does not already exist for today.
# One table is written per scrape date; `today` is set in the extract step.
table_name = f'programs_by_state-{today}'
try:
    # With the default if_exists='fail', to_sql raises ValueError when the table exists
    state_data_final.to_sql(table_name, conn)
except ValueError as err:
    # Keep the run going, but say that nothing was written instead of hiding it
    print(f"Skipped loading '{table_name}': {err}")

In [11]:
# Close the database connection opened for the load step
conn.close()

### Key Takeaways

In [12]:
# Draft ideas on what this data could be used for in the future.
# How often should we pull the data?
# What can be examined of the data over time?
# Who would benefit from seeing a breakdown of the data?
# What sort of conclusions can be drawn?
# Are there any other sources we could use in conjunction to bolster the data?