In [51]:
import pandas as pd
import numpy as np
import sqlite3 as sql

In this excersice I am using pandas to work on SQLite database with statistics of job data depending on the field of study completed.

The dataset I will be working on was released by [American Community Survey](https://www.census.gov/programs-surveys/acs/). The data cunducts surveys and aggregate data of job outcomes of students who graduated college between 2010 and 2012.
Each row in the dataset represents a different major in college and contains information.
- Rank - Rank by median earnings (the dataset is ordered by this column).
- Major_code - Major code.
- Major - Major description.
- Major_category - Category of major.
- Total - Total number of people with major.
- Sample_size - Sample size (unweighted) of full-time.
- Men - Male graduates.
- Women - Female graduates.
- ShareWomen - Women as share of total.
- Employed - Number employed.
- Median - Median salary of full-time, year-round workers.
- Low_wage_jobs - Number in low-wage service jobs.
- Full_time - Number employed 35 hours or more.
- Part_time - Number employed less than 35 hours.

In [52]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Connecting to database

In [53]:
database = "jobs.db"
connection = sql.connect(database)

In [54]:
query = '''SELECT * FROM recent_grads LIMIT 10'''

In [55]:
query

'SELECT * FROM recent_grads LIMIT 10'

### Getting the result from db

In [56]:
recent_grads = pd.read_sql_query(query, connection)
recent_grads.head(3)

Unnamed: 0,index,Rank,Major_code,Major,Major_category,Total,Sample_size,Men,Women,ShareWomen,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,36,2057,282,0.120564,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,7,679,77,0.101852,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,3,725,131,0.153037,648,558,133,340,16,0.024096,73000,50000,105000,456,176,0


# Basic SQL Queries

##### SQL query that returns the majors where females were a minority

In [57]:
query = '''SELECT 
                Major, ShareWomen 
            FROM recent_grads
            WHERE ShareWomen < 0.5'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,ShareWomen
0,PETROLEUM ENGINEERING,0.120564
1,MINING AND MINERAL ENGINEERING,0.101852
2,METALLURGICAL ENGINEERING,0.153037
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,0.107313
4,CHEMICAL ENGINEERING,0.341631


SQL query that returns:
- All majors with majority female and
- All majors had a median salary greater than 50000.
- Columns in order:
    - Major
    - Major_category
    - Median
    - ShareWomen

In [58]:
query = '''SELECT 
                Major, Major_category, Median, ShareWomen 
            FROM recent_grads
            WHERE 
                ShareWomen > 0.5 AND Median > 50000'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category,Median,ShareWomen
0,ACTUARIAL SCIENCE,Business,62000,0.535714
1,COMPUTER SCIENCE,Computers & Mathematics,53000,0.578766


The first 20 majors that either:
- Have a Median salary greater than or equal to 10,000, or
- Have less than or equal to 1,000 Unemployed people
- Columns in order:
    - Major
    - Median
    - Unemployed

In [59]:
query = '''SELECT 
                Major, Median, Unemployed 
            FROM recent_grads
            WHERE 
                Median > 10000 OR Unemployed <= 1000
            LIMIT 20'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Median,Unemployed
0,PETROLEUM ENGINEERING,110000,37
1,MINING AND MINERAL ENGINEERING,75000,85
2,METALLURGICAL ENGINEERING,73000,16
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,70000,40
4,CHEMICAL ENGINEERING,65000,1672


SQL query that returns all majors that:
- Fell under the category of Engineering and either
- Had mostly women graduates
- Or had an unemployment rate below 5.1%, which was the rate in August 2015
- Columns in order:
    - Major
    - Major_category
    - ShareWomen
    - Unemployment_rate

In [60]:
query = '''SELECT 
                Major, Major_category, ShareWomen, Unemployment_rate 
            FROM recent_grads
            WHERE 
                Major_category = 'Engineering' AND (ShareWomen > 0.5 OR Unemployment_rate < 0.051)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category,ShareWomen,Unemployment_rate
0,PETROLEUM ENGINEERING,Engineering,0.120564,0.018381
1,METALLURGICAL ENGINEERING,Engineering,0.153037,0.024096
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.107313,0.050125
3,MATERIALS SCIENCE,Engineering,0.31082,0.023043
4,ENGINEERING MECHANICS PHYSICS AND SCIENCE,Engineering,0.183985,0.006334


Query that returns all majors where:
- ShareWomen is greater than 0.3
- And Unemployment_rate is less than .1
- Columns in order:
    - Major,
    - ShareWomen,
    - Unemployment_rate
    - Order the results in descending order by the ShareWomen column.

In [61]:
query = '''SELECT 
                Major, ShareWomen, Unemployment_rate 
            FROM recent_grads
            WHERE 
                ShareWomen > 0.3 AND Unemployment_rate < 0.1
            ORDER BY ShareWomen DESC'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,ShareWomen,Unemployment_rate
0,EARLY CHILDHOOD EDUCATION,0.967998,0.040105
1,MATHEMATICS AND COMPUTER SCIENCE,0.927807,0.0
2,ELEMENTARY EDUCATION,0.923745,0.046586
3,ANIMAL SCIENCES,0.910933,0.050862
4,PHYSIOLOGY,0.906677,0.069163


Query that returns the Engineering or Physical Sciences majors in ascending order of unemployment rates.
   - The results contain the Major_category, Major, and Unemployment_rate columns.

In [62]:
query = '''SELECT 
                Major_category, Major, Unemployment_rate 
            FROM recent_grads
            WHERE 
                Major_category = 'Engineering' OR Major_category = 'Physical Sciences'
            ORDER BY Unemployment_rate ASC'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major_category,Major,Unemployment_rate
0,Engineering,ENGINEERING MECHANICS PHYSICS AND SCIENCE,0.006334
1,Engineering,PETROLEUM ENGINEERING,0.018381
2,Physical Sciences,ASTRONOMY AND ASTROPHYSICS,0.021167
3,Physical Sciences,ATMOSPHERIC SCIENCES AND METEOROLOGY,0.022229
4,Engineering,MATERIALS SCIENCE,0.023043


# Summary Statistics

In the next steps I will be answering the questions:
- How many majors had mostly female students? How many had mostly male students? What proportion of majors had mostly female students?
- Which category of majors had the lowest unemployment rates? Which category of majors had the highest female representation?
- Which majors had the largest spread (difference) between the 25th and 75th percentile starting salaries?

##### Query that returns the number of majors with mostly male students

In [63]:
query = '''SELECT COUNT(Major) 
            FROM recent_grads
            WHERE ShareWomen < 0.5'''
pd.read_sql_query(query, connection)

Unnamed: 0,COUNT(Major)
0,76


##### Query that returns the lowest median salary from the major category Engineering

In [64]:
query = '''SELECT MIN(Median) 
            FROM recent_grads
            WHERE Major_category = 'Engineering\''''
pd.read_sql_query(query, connection)

Unnamed: 0,MIN(Median)
0,40000


##### Query that computes the sum of the Total column, only the total number of students integer value.

In [65]:
query = '''SELECT SUM(Total) 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,SUM(Total)
0,6776015


- What's the lowest median salary?
- What's the highest median salary?
- What's the total number of students?

In [66]:
query = '''SELECT 
                MIN(Median), MAX(Median), SUM(Total) 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,MIN(Median),MAX(Median),SUM(Total)
0,22000,110000,6776015


##### Query that computes the average of the Total column, the minimum of the Men column, and the maximum of the Women column

In [67]:
query = '''SELECT 
                AVG(Total), MIN(Men), MAX(Women) 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,AVG(Total),MIN(Men),MAX(Women)
0,39167.716763,119,307087


##### AS operator

Query that returns, in the following order:
- the number of rows as Number of Students
- the maximum value of Unemployment_rate as Highest Unemployment Rate

In [68]:
query = '''SELECT 
                COUNT(Total) as "Number of Students", 
            MAX(Unemployment_rate) as "Highest Unemployment Rate"
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,Number of Students,Highest Unemployment Rate
0,173,0.177226


##### Unique values - DISTINCT

##### Count the number of unique values in a column

In [69]:
query = '''SELECT 
                COUNT(DISTINCT(Major_category)) unique_major_categories 
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,unique_major_categories
0,16


Query that returns the number of unique values in the Major, Major_category, and Major_code columns. Aliases in the following order:
- For the unique value count of the Major column, use the alias unique_majors.
- For the unique value count of the Major_category column, use the alias unique_major_categories.
- For the unique value count of the Major_code column, use the alias unique_major_codes.

In [70]:
query = '''SELECT 
                COUNT(DISTINCT(Major)) unique_majors,
                COUNT(DISTINCT(Major_category)) unique_major_categories,
                COUNT(DISTINCT(Major_code)) unique_major_codes
            FROM recent_grads'''
pd.read_sql_query(query, connection)

Unnamed: 0,unique_majors,unique_major_categories,unique_major_codes
0,173,16,173


##### Arithmetic operators

##### Which majors had the largest spread (difference) between the 25th and 75th percentile starting salaries?

In [71]:
query = '''SELECT 
                Major, Major_category, P75th - P25th quartile_spread 
            FROM recent_grads
            ORDER BY quartile_spread ASC
            LIMIT 20'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major,Major_category,quartile_spread
0,MILITARY TECHNOLOGIES,Industrial Arts & Consumer Services,0
1,SCHOOL STUDENT COUNSELING,Education,2000
2,LIBRARY SCIENCE,Education,2000
3,COURT REPORTING,Law & Public Policy,4000
4,PHARMACOLOGY,Biology & Life Science,5000
5,EDUCATIONAL ADMINISTRATION AND SUPERVISION,Education,6000
6,COUNSELING PSYCHOLOGY,Psychology & Social Work,6800
7,SPECIAL NEEDS EDUCATION,Education,10000
8,MATHEMATICS TEACHER EDUCATION,Education,10000
9,SOCIAL WORK,Psychology & Social Work,10000


# Summary statistics using groups

#####  Total number of people employed in each major category

In [72]:
query = '''SELECT 
                Major, SUM(Employed)
            FROM recent_grads
            GROUP BY Major_category'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major,SUM(Employed)
0,MISCELLANEOUS AGRICULTURE,66943
1,DRAMA AND THEATER ARTS,288114
2,ZOOLOGY,302797
3,HOSPITALITY MANAGEMENT,1088742
4,MASS MEDIA,330660
5,COMMUNICATION TECHNOLOGIES,237894
6,LIBRARY SCIENCE,479839
7,MECHANICAL ENGINEERING RELATED TECHNOLOGIES,420372
8,COMMUNICATION DISORDERS SCIENCES AND SERVICES,372147
9,COMPOSITION AND RHETORIC,544118


##### Average number of ShareWomen for the major catagories

In [73]:
query = '''SELECT 
                Major_category, AVG(ShareWomen)
            FROM recent_grads
            GROUP BY Major_category'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major_category,AVG(ShareWomen)
0,Agriculture & Natural Resources,0.617938
1,Arts,0.561851
2,Biology & Life Science,0.584518
3,Business,0.405063
4,Communications & Journalism,0.643835
5,Computers & Mathematics,0.512752
6,Education,0.674986
7,Engineering,0.257158
8,Health,0.616857
9,Humanities & Liberal Arts,0.676193


##### The percentage of graduates who are employed

In [74]:
query = '''SELECT 
                Major_category, AVG(Employed)/AVG(Total) share_employed
            FROM recent_grads
            GROUP BY Major_category'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major_category,share_employed
0,Agriculture & Natural Resources,0.836986
1,Arts,0.806748
2,Biology & Life Science,0.667157
3,Business,0.835966
4,Communications & Journalism,0.842229
5,Computers & Mathematics,0.795611
6,Education,0.85819
7,Engineering,0.781967
8,Health,0.803374
9,Humanities & Liberal Arts,0.762638


##### The percentage of graduates who are employed - Select only rows where share_employed are greater than 0.8

In [75]:
query = '''SELECT 
                Major_category, AVG(Employed)/AVG(Total) share_employed
            FROM recent_grads
            GROUP BY Major_category
            HAVING share_employed > 0.8'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major_category,share_employed
0,Agriculture & Natural Resources,0.836986
1,Arts,0.806748
2,Business,0.835966
3,Communications & Journalism,0.842229
4,Education,0.85819
5,Health,0.803374
6,Industrial Arts & Consumer Services,0.82267
7,Law & Public Policy,0.808399


##### Major categories where the share of graduates with low-wage jobs is greater than 0.1

In [76]:
query = '''SELECT 
                Major_category, AVG(Low_wage_jobs)/AVG(Total) share_low_wage
            FROM recent_grads
            GROUP BY Major_category
            HAVING share_low_wage > 0.1'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major_category,share_low_wage
0,Arts,0.168331
1,Communications & Journalism,0.126324
2,Humanities & Liberal Arts,0.132087
3,Industrial Arts & Consumer Services,0.115713
4,Law & Public Policy,0.115685
5,Psychology & Social Work,0.116934
6,Social Science,0.102233


### ROUND function

In [77]:
query = '''SELECT 
                Major_category, ROUND(ShareWomen, 2) as rounded_share_women
            FROM recent_grads'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major_category,rounded_share_women
0,Engineering,0.12
1,Engineering,0.1
2,Engineering,0.15
3,Engineering,0.11
4,Engineering,0.34


SQL query that returns the following columns of recent_grads (in the same order): 
- ShareWomen rounded to 4 decimal places, 
- Major_category
- Limit the results to 10 rows.

In [78]:
query = '''SELECT 
                ROUND(ShareWomen, 4), Major_category
            FROM recent_grads
            LIMIT 10'''
pd.read_sql_query(query, connection)

Unnamed: 0,"ROUND(ShareWomen, 4)",Major_category
0,0.1206,Engineering
1,0.1019,Engineering
2,0.153,Engineering
3,0.1073,Engineering
4,0.3416,Engineering
5,0.145,Engineering
6,0.5357,Business
7,0.4414,Physical Sciences
8,0.1398,Engineering
9,0.4378,Engineering


##### Majors for share degree jobs below 0.3 

In [79]:
query = '''SELECT 
                Major_category, ROUND(AVG(College_jobs)/AVG(Total), 3) as share_degree_jobs
            FROM recent_grads
            GROUP BY Major_category
            HAVING share_degree_jobs < 0.3'''
pd.read_sql_query(query, connection)

Unnamed: 0,Major_category,share_degree_jobs
0,Agriculture & Natural Resources,0.248
1,Arts,0.265
2,Business,0.114
3,Communications & Journalism,0.22
4,Humanities & Liberal Arts,0.27
5,Industrial Arts & Consumer Services,0.249
6,Law & Public Policy,0.163
7,Social Science,0.215


### Information and types of column

In [80]:
query = '''PRAGMA TABLE_INFO(recent_grads) '''
pd.read_sql_query(query, connection).head()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,Rank,INTEGER,0,,0
2,2,Major_code,INTEGER,0,,0
3,3,Major,TEXT,0,,0
4,4,Major_category,TEXT,0,,0


##### Problem with divide int by int

In [81]:
query = '''SELECT Women/Total
            FROM recent_grads 
            LIMIT 5 '''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Women/Total
0,0
1,0
2,0
3,0
4,0


##### Changing for float to get nicer results

In [82]:
query = '''SELECT 
                CAST(Women as Float)/Cast(Total as Float) 
            FROM recent_grads
            LIMIT 5 '''
pd.read_sql_query(query, connection).head()

Unnamed: 0,CAST(Women as Float)/Cast(Total as Float)
0,0.120564
1,0.101852
2,0.153037
3,0.107313
4,0.341631


SQL query that
- Divides the sum of the Women column by the sum of the Total column, aliased as SW.
- a column of float values.
- Group by Major_category and order by SW.
- Only contains the Major_category and SW columns, in that order.

In [83]:
query = '''SELECT 
                Major_category, CAST(SUM(Women) as float)/CAST(SUM(Total) as float) SW
            FROM recent_grads 
            GROUP BY Major_category
            ORDER BY SW '''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major_category,SW
0,Law & Public Policy,0.030585
1,Business,0.084743
2,Industrial Arts & Consumer Services,0.160249
3,Computers & Mathematics,0.209356
4,Engineering,0.219596


# Subqueries

Query that returns the majors that are below the average for Unemployment_rate.
- only contain the Major and Unemployment_rate columns
- sorted in ascending order by Unemployment_rate

In [84]:
query = '''SELECT 
                Major, Unemployment_rate 
            FROM recent_grads
            WHERE 
                Unemployment_rate < (SELECT AVG(Unemployment_rate) from recent_grads)
            ORDER BY Unemployment_rate ASC'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Unemployment_rate
0,MATHEMATICS AND COMPUTER SCIENCE,0.0
1,BOTANY,0.0
2,SOIL SCIENCE,0.0
3,EDUCATIONAL ADMINISTRATION AND SUPERVISION,0.0
4,ENGINEERING MECHANICS PHYSICS AND SCIENCE,0.006334


##### Finding the proportion - outputs with all rows

In [85]:
query = '''SELECT 
                COUNT(*), (select COUNT(*) from recent_grads)
            FROM recent_grads
            WHERE 
                ShareWomen > (select AVG(ShareWomen) from recent_grads)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,COUNT(*),(select COUNT(*) from recent_grads)
0,91,173


SQL statement that computes the proportion (as a float value) of rows that contain above average values for the ShareWomen.
<br>The results should only return the proportion, aliased as proportion_abv_avg.

In [86]:
query = '''SELECT 
                CAST(COUNT(*) as float)/CAST((select COUNT(*) from recent_grads) as float) proportion_abv_avg
            FROM recent_grads
            WHERE 
                ShareWomen > (select AVG(ShareWomen) from recent_grads)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,proportion_abv_avg
0,0.526012


##### Query returns the rows where Major_category equals either Business or Engineering

In [87]:
query = '''SELECT 
                Major, Major_category
            FROM recent_grads
            WHERE 
                Major_category IN('Business', 'Engineering')'''
pd.read_sql_query(query, connection).tail()

Unnamed: 0,Major,Major_category
37,MECHANICAL ENGINEERING RELATED TECHNOLOGIES,Engineering
38,BUSINESS MANAGEMENT AND ADMINISTRATION,Business
39,MARKETING AND MARKETING RESEARCH,Business
40,HUMAN RESOURCES AND PERSONNEL MANAGEMENT,Business
41,HOSPITALITY MANAGEMENT,Business


Query that returns the Major and Major_category columns for the rows
- Major_category is one of the 5 highest group level sums for the Total column

In [88]:
query = '''SELECT 
                Major, Major_category FROM recent_grads
            WHERE 
                Major_category IN (select Major_category from recent_grads
                                    group by Major_category
                                    order by SUM(Total) DESC
                                    limit 5)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category
0,PETROLEUM ENGINEERING,Engineering
1,MINING AND MINERAL ENGINEERING,Engineering
2,METALLURGICAL ENGINEERING,Engineering
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering
4,CHEMICAL ENGINEERING,Engineering


##### Ratio of the Sample_size column to the Total column

query that returns the average ratio (Sample_size/Total)) for all of the majors.
- cast both columns to the float type.
- alias avg_ratio for the average ratio.

In [89]:
query = '''SELECT 
                AVG(cast(Sample_size as float)/cast(Total as float)) avg_ratio
            FROM recent_grads'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,avg_ratio
0,0.009086


##### Rows that exceed the average that was calculater above

In [90]:
query = '''SELECT 
                Major, Major_category, cast(Sample_size as float)/cast(Total as float) ratio
            FROM recent_grads
            WHERE 
                ratio > (select AVG(cast(Sample_size as float)/cast(Total as float)) from recent_grads)'''
pd.read_sql_query(query, connection).head()

Unnamed: 0,Major,Major_category,ratio
0,PETROLEUM ENGINEERING,Engineering,0.015391
1,MINING AND MINERAL ENGINEERING,Engineering,0.009259
2,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,0.012719
3,ACTUARIAL SCIENCE,Business,0.013503
4,MECHANICAL ENGINEERING,Engineering,0.01128
