In [1]:
# Import required libraries
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy as db

In [2]:
# Load the data from the .env file for the project
# Download and install python-dotenv library; use "pip install python-dotenv"
# This file has the values for connecting to the Postgres DB server.
# This method prevents any hardcoded id and password in the code.

from dotenv import load_dotenv
load_dotenv(".env")

# Get the Postgres related details
db_driver = os.environ.get('DB_DRIVER')
postgres_server = os.environ.get('POSTGRES_SERVER')
postgres_server_port = os.environ.get('POSTGRES_SERVER_PORT')
postgres_db = os.environ.get('POSTGRES_DB')
postgres_user = os.environ.get('POSTGRES_USER')
postgres_pwd = os.environ.get('POSTGRES_PWD')

# Fail fast with a clear message: os.environ.get() returns None for an unset
# variable, which would otherwise be formatted into the connection string as
# the literal text "None" and only fail later with a confusing error.
_missing_settings = [
    name
    for name, value in {
        'DB_DRIVER': db_driver,
        'POSTGRES_SERVER': postgres_server,
        'POSTGRES_SERVER_PORT': postgres_server_port,
        'POSTGRES_DB': postgres_db,
        'POSTGRES_USER': postgres_user,
        'POSTGRES_PWD': postgres_pwd,
    }.items()
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        'Missing required setting(s) in the environment / .env file: '
        + ', '.join(_missing_settings)
    )

In [3]:
# Create a connection to the Postgres database engine
# connection_string = 'postgresql://<user>:<pwd>@<host>:<port>/<database>'
# NOTE(review): a recorded run of this cell failed with
# "No module named 'psycopg2'" -- the DBAPI driver package must be installed in
# the kernel environment (presumably `pip install psycopg2-binary`; confirm).

# Percent-encode the credentials so that reserved URL characters in them
# (e.g. '@', ':', '/') cannot corrupt the connection string. The values in
# .env are therefore expected to be the raw, unencoded user name and password.
quoted_user = quote(postgres_user, safe='')
quoted_pwd = quote(postgres_pwd, safe='')

connection_string = f'{db_driver}://{quoted_user}:{quoted_pwd}@{postgres_server}:{postgres_server_port}/{postgres_db}'
engine = db.create_engine(connection_string)
# The connection is kept open because the following cells run queries on it.
connection = engine.connect()

ModuleNotFoundError: No module named 'psycopg2'

In [None]:
# Execute the query against the Postgres DB using the "connection"
# SQL query to get the salary of all the employees from the employee_db in Postgres.
# The LEFT JOIN keeps every employee row even if it has no matching salary row.
query = """
    select e.emp_no as "Employee Number"
          ,e.last_name as "Last Name"
          ,e.first_name as "First Name"
          ,e.sex as "Gender"
          ,s.salary as "Salary"
      from public.employees as e
           left join public.salaries as s on (e.emp_no = s.emp_no)
  order by e.emp_no;
"""

# Wrap the SQL in db.text(): passing a plain string to Connection.execute()
# is deprecated in SQLAlchemy 1.4 and no longer accepted in 2.0, while a
# text() construct is accepted by both.
ResultProxy = connection.execute(db.text(query))
ResultSet = ResultProxy.fetchall()
# Display first 5 records
ResultSet[:5]

In [None]:
# Create a Pandas DataFrame to analyze the salary data.
# The column labels are passed to the constructor (instead of being assigned
# afterwards) so that an empty result set still yields a correctly labelled,
# empty DataFrame rather than raising a length-mismatch error.
salary_df = pd.DataFrame(
    ResultSet,
    columns=['Employee Number', 'Last Name', 'First Name', 'Gender', 'Salary'],
)

In [None]:
# Preview the first rows of the salary DataFrame to sanity-check the load.
salary_df.head()

In [None]:
# # Create a histogram to show the Employee count by Salary ranges as defined by Bins
# salary_list = salary_df['Salary']

# bins = np.linspace(salary_list.min(), salary_list.max(), 10)
# labels = ('>40K', '>50K', '>60K','>70K','>80K', '>90K','>100k', '>110K','>120K')

# #groups = pd.cut(salary_list, bins=bins, labels=labels)
# groups = salary_df.groupby(pd.cut(salary_df.Salary, bins))

# plt.figure(figsize=(10,5))
# #plt.hist(result.ravel(), bins=np.linspace(np.min(result), np.max(result), num=num_bins)) #<-- Change here.  Note the use of ravel.
# plt.hist( salary_list, bins=bins, color='#0504aa', alpha=0.7,rwidth=0.95)

# plt.title('Employee Salary Distribution')
# plt.xlabel('Salary')
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.legend()
# plt.show()

In [None]:
# Plot the histogram for the Employee Salary and save it as a PNG file.

salary_list = salary_df['Salary']
# Ten evenly spaced edges from the lowest to the highest salary -> nine equal-width bins.
bins = np.linspace(salary_list.min(), salary_list.max(), 10)

plt.figure(figsize=[14,6])
plt.title('Employee Salary Distribution', fontsize=15)
plt.xlabel('Salary', fontsize=15)
plt.ylabel('Frequency', fontsize=15)

# plt.hist returns the per-bin counts, the bin edges and the drawn bar patches.
n, bins, patches = plt.hist(x=salary_list, bins=bins, color='#0504aa', alpha=0.7, rwidth=1.0)

plt.grid(axis='y', alpha=0.75)
plt.xticks(fontsize=15, rotation=90)
plt.yticks(fontsize=15)
plt.legend(['Salary'], loc='best')

# Save BEFORE plt.show(): once the figure has been shown it may already be
# closed, in which case savefig() would write a blank image.
# Create the target directory first so savefig() does not fail when it is missing.
os.makedirs("./output", exist_ok=True)
plt.savefig("./output/fig01-employee-salary.png", bbox_inches='tight')

plt.show()
