In [1]:
import pandas as pd
import numpy as np
import sqlite3 as sql

In this excersice I am using pandas to work on SQLite database with statistics of job data depending on the field of study completed.

### Connecting to database

In [2]:
database = "jobs.db"
connection = sql.connect(database)

In [3]:
query = '''SELECT * FROM recent_grads LIMIT 10'''

In [4]:
query

'SELECT * FROM recent_grads LIMIT 10'

### Getting the result from db

In [5]:
recent_grads = pd.read_sql_query(query, connection)
recent_grads.head(3)

Unnamed: 0,index,Rank,Major_code,Major,Major_category,Total,Sample_size,Men,Women,ShareWomen,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,36,2057,282,0.120564,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,7,679,77,0.101852,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,3,725,131,0.153037,...,133,340,16,0.024096,73000,50000,105000,456,176,0


# Basic SQL Queries

##### SQL query that returns the majors where females were a minority

In [6]:
query = '''SELECT Major, ShareWomen FROM recent_grads
            WHERE ShareWomen < 0.5'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,ShareWomen
0,PETROLEUM ENGINEERING,0.120564
1,MINING AND MINERAL ENGINEERING,0.101852
2,METALLURGICAL ENGINEERING,0.153037
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,0.107313
4,CHEMICAL ENGINEERING,0.341631


SQL query that returns:
- All majors with majority female and
- All majors had a median salary greater than 50000.
- Columns in order:
    - Major
    - Major_category
    - Median
    - ShareWomen

In [7]:
query = '''SELECT Major, Major_category, Median, ShareWomen FROM recent_grads
            WHERE ShareWomen > 0.5 AND Median > 50000'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category,Median,ShareWomen
0,ACTUARIAL SCIENCE,Business,62000,0.535714
1,COMPUTER SCIENCE,Computers & Mathematics,53000,0.578766


The first 20 majors that either:
- Have a Median salary greater than or equal to 10,000, or
- Have less than or equal to 1,000 Unemployed people
- Columns in order:
    - Major
    - Median
    - Unemployed

In [8]:
query = '''SELECT Major, Median, Unemployed FROM recent_grads
            WHERE Median > 10000 OR Unemployed <= 1000
            LIMIT 20'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Median,Unemployed
0,PETROLEUM ENGINEERING,110000,37
1,MINING AND MINERAL ENGINEERING,75000,85
2,METALLURGICAL ENGINEERING,73000,16
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,70000,40
4,CHEMICAL ENGINEERING,65000,1672


SQL query that returns all majors that:
- Fell under the category of Engineering and either
- Had mostly women graduates
- Or had an unemployment rate below 5.1%, which was the rate in August 2015
- Columns in order:
    - Major
    - Major_category
    - ShareWomen
    - Unemployment_rate

In [9]:
query = '''SELECT Major, Major_category, ShareWomen, Unemployment_rate from recent_grads
            WHERE Major_category = 'Engineering' AND (ShareWomen > 0.5 OR Unemployment_rate < 0.051)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,PETROLEUM ENGINEERING,Engineering,0.120564,0.018381
1,METALLURGICAL ENGINEERING,Engineering,0.153037,0.024096
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.107313,0.050125
3,MATERIALS SCIENCE,Engineering,0.31082,0.023043
4,ENGINEERING MECHANICS PHYSICS AND SCIENCE,Engineering,0.183985,0.006334


Query that returns all majors where:
- ShareWomen is greater than 0.3
- And Unemployment_rate is less than .1
- Columns in order:
    - Major,
    - ShareWomen,
    - Unemployment_rate
    - Order the results in descending order by the ShareWomen column.

In [10]:
query = '''SELECT Major, ShareWomen, Unemployment_rate FROM recent_grads
            WHERE ShareWomen > 0.3 AND Unemployment_rate < 0.1
            ORDER BY ShareWomen DESC'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,ShareWomen,Unemployment_rate
0,EARLY CHILDHOOD EDUCATION,0.967998,0.040105
1,MATHEMATICS AND COMPUTER SCIENCE,0.927807,0.0
2,ELEMENTARY EDUCATION,0.923745,0.046586
3,ANIMAL SCIENCES,0.910933,0.050862
4,PHYSIOLOGY,0.906677,0.069163


Query that returns the Engineering or Physical Sciences majors in ascending order of unemployment rates.
   - The results contain the Major_category, Major, and Unemployment_rate columns.

In [11]:
query = '''SELECT Major_category, Major, Unemployment_rate FROM recent_grads
            WHERE Major_category = 'Engineering' OR Major_category = 'Physical Sciences'
            ORDER BY Unemployment_rate ASC'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major_category,Major,Unemployment_rate
0,Engineering,ENGINEERING MECHANICS PHYSICS AND SCIENCE,0.006334
1,Engineering,PETROLEUM ENGINEERING,0.018381
2,Physical Sciences,ASTRONOMY AND ASTROPHYSICS,0.021167
3,Physical Sciences,ATMOSPHERIC SCIENCES AND METEOROLOGY,0.022229
4,Engineering,MATERIALS SCIENCE,0.023043


# Summary Statistics

In the next steps I will be answering the questions:
- How many majors had mostly female students? How many had mostly male students? What proportion of majors had mostly female students?
- Which category of majors had the lowest unemployment rates? Which category of majors had the highest female representation?
- Which majors had the largest spread (difference) between the 25th and 75th percentile starting salaries?

##### Query that returns the number of majors with mostly male students

In [12]:
query = '''SELECT COUNT(Major) FROM recent_grads
            WHERE ShareWomen < 0.5'''
pd.read_sql_query(query, connection)

Unnamed: 0,COUNT(Major)
0,76


##### Query that returns the lowest median salary from the major category Engineering

In [13]:
query = '''SELECT MIN(Median) from recent_grads
            WHERE Major_category = 'Engineering\''''
pd.read_sql_query(query, connection)

Unnamed: 0,MIN(Median)
0,40000


##### Query that computes the sum of the Total column, only the total number of students integer value.

In [14]:
query = '''Select SUM(Total) from recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,SUM(Total)
0,6776015


- What's the lowest median salary?
- What's the highest median salary?
- What's the total number of students?

In [15]:
query = '''SELECT MIN(Median), MAX(Median), SUM(Total) from recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,MIN(Median),MAX(Median),SUM(Total)
0,22000,110000,6776015


##### Query that computes the average of the Total column, the minimum of the Men column, and the maximum of the Women column

In [16]:
query = '''SELECT AVG(Total), MIN(Men), MAX(Women) 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,AVG(Total),MIN(Men),MAX(Women)
0,39167.716763,119,307087


##### AS operator

Query that returns, in the following order:
- the number of rows as Number of Students
- the maximum value of Unemployment_rate as Highest Unemployment Rate

In [17]:
query = '''SELECT COUNT(Total) as "Number of Students", 
            MAX(Unemployment_rate) as "Highest Unemployment Rate"
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,Number of Students,Highest Unemployment Rate
0,173,0.177226


##### Unique values - DISTINCT

##### Count the number of unique values in a column

In [18]:
query = '''SELECT COUNT(DISTINCT(Major_category)) unique_major_categories 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,unique_major_categories
0,16


Query that returns the number of unique values in the Major, Major_category, and Major_code columns. Aliases in the following order:
- For the unique value count of the Major column, use the alias unique_majors.
- For the unique value count of the Major_category column, use the alias unique_major_categories.
- For the unique value count of the Major_code column, use the alias unique_major_codes.

In [19]:
query = '''SELECT COUNT(DISTINCT(Major)) unique_majors,
                  COUNT(DISTINCT(Major_category)) unique_major_categories,
                  COUNT(DISTINCT(Major_code)) unique_major_codes
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,unique_majors,unique_major_categories,unique_major_codes
0,173,16,173


##### Arithmetic operators

##### Which majors had the largest spread (difference) between the 25th and 75th percentile starting salaries?

In [20]:
query = '''SELECT Major, Major_category, P75th - P25th quartile_spread 
            FROM recent_grads
            ORDER BY quartile_spread ASC
            LIMIT 20'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major,Major_category,quartile_spread
0,MILITARY TECHNOLOGIES,Industrial Arts & Consumer Services,0
1,SCHOOL STUDENT COUNSELING,Education,2000
2,LIBRARY SCIENCE,Education,2000
3,COURT REPORTING,Law & Public Policy,4000
4,PHARMACOLOGY,Biology & Life Science,5000
5,EDUCATIONAL ADMINISTRATION AND SUPERVISION,Education,6000
6,COUNSELING PSYCHOLOGY,Psychology & Social Work,6800
7,SPECIAL NEEDS EDUCATION,Education,10000
8,MATHEMATICS TEACHER EDUCATION,Education,10000
9,SOCIAL WORK,Psychology & Social Work,10000
