In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import scipy.stats as st
from scipy import stats 
from scipy.stats import linregress


# Output File (CSV)
# NOTE(review): no cell visible in this notebook reads this path — the CSV
# export further down writes to a separate hard-coded file name. Confirm
# whether this constant is still needed.
output_data_file = "output_data/salaries.csv"



In [2]:
# Data USA API query, split by query-string parameter so each filter is readable.
# Adjacent string literals are concatenated by Python into one URL.
url = (
    "https://datausa.io/api/data"
    "?Geography=16000US4260000"
    "&measure=Average%20Wage,Average%20Wage%20Appx%20MOE,"
    "Total%20Population,Total%20Population%20MOE%20Appx,Record%20Count"
    "&drilldowns=Gender"
    "&Employment%20Time%20Status=1"
    "&Detailed%20Occupation=291141,1191XX,533030,537062,412031"
    "&Record%20Count%3E=5"
)

# The query needs no further parameters, so the request URL is the base URL as-is.
query_url = url
query_url

'https://datausa.io/api/data?Geography=16000US4260000&measure=Average%20Wage,Average%20Wage%20Appx%20MOE,Total%20Population,Total%20Population%20MOE%20Appx,Record%20Count&drilldowns=Gender&Employment%20Time%20Status=1&Detailed%20Occupation=291141,1191XX,533030,537062,412031&Record%20Count%3E=5'

In [3]:
# Fetch the wage data and decode the JSON body.
# - timeout: without one, requests waits indefinitely if the API never answers,
#   which would hang the whole notebook run.
# - raise_for_status(): turns an HTTP error (4xx/5xx) into an immediate, clearly
#   labelled exception instead of a confusing failure in a later cell.
api_reply = requests.get(query_url, timeout=30)
api_reply.raise_for_status()
response = api_reply.json()
response

{'data': [{'ID Gender': 1,
   'Gender': 'Male',
   'ID Year': 2019,
   'Year': '2019',
   'ID Employment Time Status': 1,
   'Employment Time Status': 'Full-time',
   'ID Detailed Occupation': '291141',
   'Detailed Occupation': 'Registered nurses',
   'Average Wage': 70580.63570306363,
   'Average Wage Appx MOE': 7787.381405961702,
   'Total Population': 15276,
   'Total Population MOE Appx': 3029.5955491433274,
   'Record Count': 141,
   'Geography': 'Pennsylvania',
   'ID Geography': '04000US42'},
  {'ID Gender': 1,
   'Gender': 'Male',
   'ID Year': 2019,
   'Year': '2019',
   'ID Employment Time Status': 1,
   'Employment Time Status': 'Full-time',
   'ID Detailed Occupation': '1191XX',
   'Detailed Occupation': 'Other managers',
   'Average Wage': 115279.15485869144,
   'Average Wage Appx MOE': 8044.56391838343,
   'Total Population': 85239,
   'Total Population MOE Appx': 7103.747978839758,
   'Record Count': 866,
   'Geography': 'Pennsylvania',
   'ID Geography': '04000US42'},


In [4]:
# Keep only the list of records stored under the payload's 'data' key;
# each record is a dict describing one gender/occupation combination.
data = response['data']
data

[{'ID Gender': 1,
  'Gender': 'Male',
  'ID Year': 2019,
  'Year': '2019',
  'ID Employment Time Status': 1,
  'Employment Time Status': 'Full-time',
  'ID Detailed Occupation': '291141',
  'Detailed Occupation': 'Registered nurses',
  'Average Wage': 70580.63570306363,
  'Average Wage Appx MOE': 7787.381405961702,
  'Total Population': 15276,
  'Total Population MOE Appx': 3029.5955491433274,
  'Record Count': 141,
  'Geography': 'Pennsylvania',
  'ID Geography': '04000US42'},
 {'ID Gender': 1,
  'Gender': 'Male',
  'ID Year': 2019,
  'Year': '2019',
  'ID Employment Time Status': 1,
  'Employment Time Status': 'Full-time',
  'ID Detailed Occupation': '1191XX',
  'Detailed Occupation': 'Other managers',
  'Average Wage': 115279.15485869144,
  'Average Wage Appx MOE': 8044.56391838343,
  'Total Population': 85239,
  'Total Population MOE Appx': 7103.747978839758,
  'Record Count': 866,
  'Geography': 'Pennsylvania',
  'ID Geography': '04000US42'},
 {'ID Gender': 1,
  'Gender': 'Male',


In [5]:
# Pull the fields of interest out of every record in `data`.
# Each list is parallel to `data`: position i holds the value for record i.
ID_Geography = [record['ID Geography'] for record in data]
Gender = [record['Gender'] for record in data]
Time_Status = [record['Employment Time Status'] for record in data]
Detailed_Occupation = [record['Detailed Occupation'] for record in data]
ID_Year = [record['ID Year'] for record in data]
Avg_Wage = [record['Average Wage'] for record in data]
Total_Population = [record['Total Population'] for record in data]


In [6]:
# Assemble the per-record lists into one table, one column per field.
# The dict's key order determines the DataFrame's column order.
data_summary = {
    "Gender": Gender,
    "ID_Geography": ID_Geography,
    "Full_or_Part_Time": Time_Status,
    "Title": Detailed_Occupation,
    "Year": ID_Year,
    "Avg Salary": Avg_Wage,
    "Total_Population": Total_Population,
}
data_summary_df = pd.DataFrame(data_summary)
data_summary_df


Unnamed: 0,Gender,ID_Geography,Full_or_Part_Time,Title,Year,Avg Salary,Total_Population
0,Male,04000US42,Full-time,Registered nurses,2019,70580.635703,15276
1,Male,04000US42,Full-time,Other managers,2019,115279.154859,85239
2,Male,04000US42,Full-time,Driver/sales workers & truck drivers,2019,50504.629076,118008
3,Male,04000US42,Full-time,"Laborers & freight, stock, & material movers, ...",2019,36499.649822,80362
4,Male,04000US42,Full-time,Retail salespersons,2019,53713.738282,38595
5,Female,04000US42,Full-time,Registered nurses,2019,69670.553261,112896
6,Female,04000US42,Full-time,Other managers,2019,87194.498228,51635
7,Female,04000US42,Full-time,Driver/sales workers & truck drivers,2019,30100.724163,5438
8,Female,04000US42,Full-time,"Laborers & freight, stock, & material movers, ...",2019,29083.28952,21812
9,Female,04000US42,Full-time,Retail salespersons,2019,34998.541089,23279


In [7]:
# Summary statistics for the average-salary column.
# Selecting with a list of column names keeps the result a DataFrame, so the
# aggregation yields a table with one row per statistic.
salary_column = data_summary_df[["Avg Salary"]]
summary_stats = ['count', 'mean', 'std', 'min', 'max']
data_stats_df = salary_column.agg(summary_stats)
data_stats_df

Unnamed: 0,Avg Salary
count,52.0
mean,50081.585171
std,21297.838823
min,25718.498799
max,115279.154859


In [8]:
# Write the summary table to a CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the file.
data_summary_df.to_csv(
    "salary_summary_df.csv",
    index=False,
    encoding="utf-8",
)

In [9]:
from sqlalchemy import create_engine, text

# Open (or create) the SQLite database file in the working directory.
# echo=True makes SQLAlchemy log every statement it sends to the database.
engine = create_engine('sqlite:///philly.sqlite', echo=True)
sqlite_connection = engine.connect()

# Rebuild the "query_url" table from scratch with "index" as its primary key.
# IF EXISTS lets this cell run against a fresh database file, where a plain
# DROP TABLE would raise because there is no table to drop yet.
# The statements are wrapped in text() because passing plain SQL strings to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
# NOTE(review): "Avg Salary" is declared INTEGER although the API returns
# fractional wages — consider REAL; confirm nothing relies on the declared type.
sqlite_connection.execute(text('DROP TABLE IF EXISTS "query_url"'))
sqlite_connection.execute(text('CREATE TABLE "query_url" ( "index" INTEGER PRIMARY KEY, "ID_Geography" TEXT, "Gender" TEXT, "Full_or_Part_Time" TEXT, "Title" TEXT, "Year" INTEGER, "Avg Salary" INTEGER, "Total_Population" INTEGER)'))


2021-04-30 19:59:37,510 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-04-30 19:59:37,513 INFO sqlalchemy.engine.base.Engine ()
2021-04-30 19:59:37,514 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-04-30 19:59:37,515 INFO sqlalchemy.engine.base.Engine ()
2021-04-30 19:59:37,516 INFO sqlalchemy.engine.base.Engine DROP TABLE "query_url"
2021-04-30 19:59:37,517 INFO sqlalchemy.engine.base.Engine ()
2021-04-30 19:59:37,525 INFO sqlalchemy.engine.base.Engine COMMIT
2021-04-30 19:59:37,526 INFO sqlalchemy.engine.base.Engine CREATE TABLE "query_url" ( "index" INTEGER PRIMARY KEY, "ID_Geography" TEXT, "Gender" TEXT, "Full_or_Part_Time" TEXT, "Title" TEXT, "Year" INTEGER, "Avg Salary" INTEGER, "Total_Population" INTEGER)
2021-04-30 19:59:37,527 INFO sqlalchemy.engine.base.Engine ()
2021-04-30 19:59:37,533 INFO sqlalchemy.engine.base.Engine COMMIT


<sqlalchemy.engine.result.ResultProxy at 0x238db27dfd0>

In [27]:
# Insert the summary rows into the "query_url" table over the open connection.
# if_exists='append' adds rows to the table when it already exists, and
# index=True writes the DataFrame's row index as a column alongside the data.
sqlite_table = "query_url"
data_summary_df.to_sql(
    name=sqlite_table,
    con=sqlite_connection,
    if_exists='append',
    index=True,
)

2021-04-28 18:52:55,974 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("query_url")
2021-04-28 18:52:55,976 INFO sqlalchemy.engine.base.Engine ()
2021-04-28 18:52:56,085 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("query_url")
2021-04-28 18:52:56,086 INFO sqlalchemy.engine.base.Engine ()
2021-04-28 18:52:56,088 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE query_url (
	"index" BIGINT, 
	"Gender" TEXT, 
	"ID_Geography" TEXT, 
	"Full_or_Part_Time" TEXT, 
	"Title" TEXT, 
	"Year" BIGINT, 
	"Avg Salary" FLOAT, 
	"Total_Population" BIGINT
)


2021-04-28 18:52:56,088 INFO sqlalchemy.engine.base.Engine ()
2021-04-28 18:52:56,104 INFO sqlalchemy.engine.base.Engine COMMIT
2021-04-28 18:52:56,105 INFO sqlalchemy.engine.base.Engine CREATE INDEX ix_query_url_index ON query_url ("index")
2021-04-28 18:52:56,106 INFO sqlalchemy.engine.base.Engine ()
2021-04-28 18:52:56,109 INFO sqlalchemy.engine.base.Engine COMMIT
2021-04-28 18:52:56,129 INFO sqlalchemy.engine.base.Engine 

In [28]:
# Close the database connection now that the rows have been written.
sqlite_connection.close()