# Masters In Data Science

In [1]:
# Import Dependencies
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
from sqlalchemy import create_engine
# from config import username, password

## State Data
### https://www.mastersindatascience.org/schools/

### Extract - Romi

In [2]:
# Ctrl + F "Data Science Graduate Degrees by State" to see the card breakdown on the above page
# Use Requests to call the url and BeautifulSoup to get the state data elements
url = "https://www.mastersindatascience.org/schools/"
response = requests.get(url).text
soup = bs(response, "html.parser")

In [3]:
# Targeting col-md-6 returns only the state cards that we need
card_list = soup.find_all("div", class_="col-md-6")
print(card_list[0].text.strip().split("\n")[0]) # check the first element
print(card_list[-1].text.strip().split("\n")[0]) # check the last element

Alabama
District of Columbia


In [15]:
# Iterate through each state and store the information from each section
state_data = []
for card in card_list:
    state = card.find("h3").text
    school_count = int(card.find("p").text.split(" ")[0]) # only captures the int, not the full string (i.e. 5, not "5 Schools")
    # capture the number of programs in each type
    program_list = card.find_all("div", class_="dept")
    prog_ba = 0
    prog_cs = 0
    prog_gs = 0
    prog_hi = 0
    prog_cert = 0
    prog_doc = 0
    prog_wage = 0
    for program in program_list:
        # check for each type of program
        if "Business Analytics" in program.text:
            prog_ba = program.strong.text.split(":")[0]
        if "Computer Science" in program.text:
            prog_cs = program.strong.text.split(":")[0] 
        if "Geospatial Science" in program.text:
            prog_gs = program.strong.text.split(":")[0]
        if "Health Informatics" in program.text:
            prog_hi = program.strong.text.split(":")[0] 
        if "Certificate" in program.text:
            prog_cert = program.strong.text.split(":")[0] 
        if "Doctorate" in program.text:
            prog_doc = program.strong.text.split(":")[0] 
        if "Annual Mean Wage" in program:
            prog_wage = program.text.split(":")[1]
    item = {
        "state": state,
        "school_count": school_count,
        "prog_ba": prog_ba,
        "prog_cs": prog_cs,
        "prog_gs": prog_gs,
        "prog_hi": prog_hi,
        "prog_cert": prog_cert,
        "prog_doc": prog_doc,
        "prog_wage": prog_wage,
    }

    state_data.append(item)
print (state_data)

[{'state': 'Alabama', 'school_count': 5, 'prog_ba': '7', 'prog_cs': '1', 'prog_gs': '1', 'prog_hi': 0, 'prog_cert': '1', 'prog_doc': 0, 'prog_wage': 0}, {'state': 'Arizona', 'school_count': 3, 'prog_ba': '6', 'prog_cs': '4', 'prog_gs': '1', 'prog_hi': '1', 'prog_cert': '1', 'prog_doc': '1', 'prog_wage': 0}, {'state': 'Arkansas', 'school_count': 2, 'prog_ba': '2', 'prog_cs': 0, 'prog_gs': 0, 'prog_hi': '1', 'prog_cert': '1', 'prog_doc': '1', 'prog_wage': 0}, {'state': 'California', 'school_count': 25, 'prog_ba': '29', 'prog_cs': '7', 'prog_gs': '5', 'prog_hi': '7', 'prog_cert': '9', 'prog_doc': '5', 'prog_wage': 0}, {'state': 'Colorado', 'school_count': 8, 'prog_ba': '12', 'prog_cs': '5', 'prog_gs': '1', 'prog_hi': '1', 'prog_cert': '3', 'prog_doc': '1', 'prog_wage': 0}, {'state': 'Connecticut', 'school_count': 6, 'prog_ba': '6', 'prog_cs': '1', 'prog_gs': 0, 'prog_hi': 0, 'prog_cert': '1', 'prog_doc': 0, 'prog_wage': 0}, {'state': 'Delaware', 'school_count': 1, 'prog_ba': '7', 'prog_cs

In [3]:
# Store each state as a dataframe with all available columns (BA, CS, Geospatial, Health Informatics, Certificate, Doctorate, etc)

### Transform - Fernando & Mays

In [4]:
# Convert the Average Wage to an integer

In [5]:
# Convert the Job Change Percent to a float (remove the percent symbol)

In [6]:
# Remove "NA" values

Any other transformations we could do? (Cleaning, Aggregating, etc.)

### Load

In [8]:
# Create engine to connect with PostgreSQL
engine = create_engine(f'postgresql://{username}:{password}@localhost/datascience_db')

In [9]:
# Load the dataframe into PostgreSQL

### Key Takeaways

In [10]:
# Draft ideas on what this data could be used for in the future.
# Who would benefit from seeing a breakdown of the data?
# What sort of conclusions can be drawn?
# Are there any other sources we could use in conjunction to bolster the data?

## Career Data
### https://www.mastersindatascience.org/careers/

### Extract

In [14]:
# Scrape each of the following URLs for Career Salary data and store the html
## https://www.mastersindatascience.org/careers/business-analyst/
## https://www.mastersindatascience.org/careers/data-analyst/
## https://www.mastersindatascience.org/careers/data-architect/
## https://www.mastersindatascience.org/careers/data-engineer/
## https://www.mastersindatascience.org/careers/data-scientist/
## https://www.mastersindatascience.org/careers/marketing-analyst/
## https://www.mastersindatascience.org/careers/statistician/

### Do we want to just focus on the data-* ones to help simplify this part?

Each page has the salary section formatted differently so each section will likely need to be tailored to the specific page

In [None]:
# Initialize career list which will have salary data dictionaries appended and later be converted to a dataframe

I'm not 100% on the different fields/columns we're going to have for this so that will need to figured out as we code this next part. After we have that list of columns, we'll need to make another table in our database to be able to receive and store the data.

In [15]:
# Append salary data for Business Analyst

In [16]:
# Append salary data for Data Analyst

In [17]:
# Append salary data for Data Architect

In [18]:
# Append salary data for Data Engineer

In [19]:
# Append salary data for Data Scientist

In [20]:
# Append salary data for Marketing Analyst

In [21]:
# Append salary data for Statistician

Maybe make a function for these to streamline and prevent code redundancy?

In [22]:
# Convert career list to dataframe

### Transform

Due to each page being different, there will likely need to be a fair amount of cleaning. I'm not sure what specifically at the moment though

### Load

In [23]:
# Use the engine to load the dataframe into PostgreSQL

### Key Takeaways

In [24]:
# Draft ideas on what this data could be used for in the future.
# Who would benefit from seeing a breakdown of the data?
# What sort of conclusions can be drawn?
# Are there any other sources we could use in conjunction to bolster the data?