# Database System Development Plan
GROUP 3

##### This is our plan to create and populate our database system based on the normalization plan stated in Checkpoint 3. 

In [1]:
# Importing necessary packages

import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Loading CSV data into a new dataframe

df = pd.read_csv('salary_data.csv')

In [4]:
# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:123@localhost/checkpoint4'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()


## Creation of Database Tables Based on Normalization Plan

In [5]:
# We are now ready to create database tables based on the normalization plan that were developed earlier

# Pass the SQL statements that create all tables
stmt = """
create table employee(
	employee_id serial,
	base_salary numeric(10,2),
	bonus_amount numeric(10,2),
	stock_value numeric(10,2),
    totalyearlycompensation numeric(10,2),
	timestamp timestamp NOT NULL,
	primary key (employee_id)
);

create table company(
	company_id serial,
	company varchar(100) NOT NULL,
	primary key (company_id)
);

create table location(
	location_id serial,
	location_city varchar(100),
	location_state varchar(100),
    location_add1 varchar(100),
    location_add2 varchar(100),
	primary key (location_id)
);

create table employee_company(
	employee_id int,
	company_id int,
	location_id int,
	foreign key(employee_id) references employee,
	foreign key(company_id) references company,
	foreign key(location_id) references location,
	primary key (employee_id, company_id, location_id)
);

create table specialization(
	tag_id serial,
	tag varchar(200),
	primary key (tag_id)
);

create table employee_specialization(
	employee_id int,
	tag_id int,
	foreign key(employee_id) references employee,
	foreign key(tag_id) references specialization,
	primary key(employee_id)
);

create table education(
	education_id serial,
	education varchar(50),
	primary key(education_id)
);

create table employee_qualification(
	employee_id int,
	education_id int,
	years_at_company int NOT NULL,
	years_experience int NOT NULL,
	foreign key(employee_id) references employee,
	foreign key(education_id) references education,
	primary key(employee_id)
);

create table gender(
	gender varchar(10),
	check (gender 
			in ('Male','Female', 'Other','Unknown')),
	primary key(gender));
    
create table race(
	race varchar(50) primary key);
    
create table employee_demographics(
	employee_id int primary key,
	gender varchar(10),
	race varchar(50),
	foreign key(employee_id) references employee,
	foreign key(gender) references gender,
	foreign key(race) references race);

create table level(
	level_id serial, 
	level varchar(100),
	primary key (level_id)
);

create table employee_level(
	employee_id int primary key,
	level_id int,
	foreign key(employee_id) references employee,
	foreign key(level_id) references level);

create table title(
	title_id serial primary key,
	title varchar(100) NOT NULL);
    
create table employee_title(
	employee_id int primary key,
	title_id int NOT NULL,
	foreign key(employee_id) references employee,
	foreign key(title_id) references title);
    
create table other(
	employee_id int primary key,
	other_details varchar(1000),
	foreign key(employee_id) references employee);

    """

connection.execute(stmt)

<sqlalchemy.engine.result.ResultProxy at 0x23e957837c0>

## Extract, Transform and Load

In [6]:
# loading employee database table - since the employee records are unique without duplication, we can add
# a column with incrementing integer numbers for the primary key of employee id

In [7]:
df.insert(0, 'employee_id', range(1, 1 + len(df)))

In [8]:
employee_df=df[['employee_id','basesalary','bonus','stockgrantvalue','timestamp','totalyearlycompensation']]

In [9]:
employee_df=employee_df.rename(columns={'basesalary':'base_salary','stockgrantvalue':'stock_value','bonus':'bonus_amount'})

In [10]:
employee_df.to_sql(name='employee', con=engine, if_exists='append', index=False)

In [11]:
# loading company database table - in the original csv data since there are repeating companies, we need to extract
# the unique company names, add a column of incrementing integer numbers and then map these numbers back 
# to the main dataframe

In [12]:
temp_company_df = pd.DataFrame(df.company.unique(), columns=['company'])
temp_company_df['company']=temp_company_df['company'].fillna("Unknown")
temp_company_df.insert(0, 'company_id', range(1, 1 + len(temp_company_df)))

In [13]:
temp_company_df.to_sql(name='company', con=engine, if_exists='append', index=False)

In [14]:
# adding a new column to the main dataframe, for the primary key which is company_id 
# this involves using temp_company_df to create a list mapping company_id using for loops
# and then inserting this list to the main dataframe as a new column

In [15]:
df['company']=df['company'].fillna("Unknown")

In [16]:
company_id_list = [temp_company_df.company_id[temp_company_df.company == i].values[0] for i in df.company]

In [17]:
df.insert(4, 'company_id', company_id_list)

In [18]:
# loading location database table - same as the company name attribute, since location has repetitive records
# in the main dataframe, we need to first extract the unique location information to database, and then use
# for-loop to add a column of incrementing integer numbers for each location_id to create a list, which is
# used to map these numbers back to the main dataframe 

In [19]:
df_cs=df['location'].str.split(',', expand=True)

In [20]:
df_cs.columns=['location_city','location_state','location_add1','location_add2']

In [21]:
df_cs=df_cs.drop_duplicates()

In [22]:
df_cs['location_state'] = df_cs['location_state'].str.strip()

In [23]:
df_cs['location_add1'] = df_cs['location_add1'].str.strip()

In [24]:
df_cs['location_add2'] = df_cs['location_add2'].str.strip()

In [25]:
df_cs.insert(0, 'location_id', range(1, 1 + len(df_cs)))

In [26]:
df_cs.to_sql(name='location', con=engine, if_exists='append', index=False)

In [27]:
# mapping location_id

In [28]:
df_cs=df['location']

In [29]:
df_cs=df_cs.drop_duplicates()

In [30]:
df_cs.columns=['location']

In [31]:
df_cs=pd.DataFrame(df_cs)

In [32]:
df_cs.insert(0, 'location_id', range(1, 1 + len(df_cs)))

In [33]:
df = pd.merge(df, df_cs, left_on=['location'], right_on = ['location'])

In [34]:
df=df.sort_values(by=['employee_id'])

In [35]:
# loading employee_company table to the database, but since we have already created employee_id and company_id,
# we don't need to add additional columns to the main dataframe

In [36]:
employee_company_df=df[['employee_id','company_id','location_id']]

In [37]:
employee_company_df.to_sql(name='employee_company', con=engine, if_exists='append', index=False)

In [38]:
# loading specialization table to the database - since some tag attributes are duplicated, as what we did earlier,
# we need to first extract the unique tag information to database, add a column of incrementing integer numbers
# for each unique tag_id to create a list, which is then used to map these numbers back to the main dataframe

In [39]:
specialization_df = pd.DataFrame(df.tag.unique(), columns=['tag'])
specialization_df['tag']=specialization_df['tag'].fillna("Unknown")
specialization_df.insert(0, 'tag_id', range(1, 1 + len(specialization_df)))

In [40]:
specialization_df.to_sql(name='specialization', con=engine, if_exists='append', index=False)

In [41]:
# mapping specialization or tag_id

In [42]:
df['tag']=df['tag'].fillna("Unknown")

In [43]:
tag_id_list = [specialization_df.tag_id[specialization_df.tag == i].values[0] for i in df.tag]

In [44]:
df.insert(1, 'tag_id', tag_id_list)

In [45]:
# loading employee_specialization table to database, but since we have already created employee_id and tag_id in
# the main dataframe, we just need to extract the necessary attributes, which are employee_id and tag_id, in
# creating and loading the table to database without making changes to the main dataframe

In [46]:
employee_specialization_df=df[['employee_id','tag_id']]
employee_specialization_df.to_sql(name='employee_specialization', con=engine, if_exists='append', index=False)

In [47]:
# loading education table to the database by extracting unique education level information, and then adding 
# education_id as the primary key to create a list, which is then used to map the numbers back to the main dataframe

In [48]:
education_df = pd.DataFrame(df.Education.unique(), columns=['Education'])
education_df['Education']=education_df['Education'].fillna("Unknown")
education_df.insert(0, 'education_id', range(1, 1 + len(education_df)))


In [49]:
education_df=education_df.rename(columns={'Education':'education'})

In [50]:
education_df.to_sql(name='education', con=engine, if_exists='append', index=False)

In [51]:
# mapping education_id

In [52]:
df['Education']=df['Education'].fillna("Unknown")
education_id_list = [education_df.education_id[education_df.education == i].values[0] for i in df.Education]
df.insert(1, 'education_id', education_id_list)

In [53]:
# loading employee_qualification table to database with existing employee_id and education_id that were previously
# created, without having to add addtional columns to the main dataframe

In [54]:
employee_qualification_df=df[['employee_id','education_id','yearsofexperience','yearsatcompany']]
employee_qualification_df=employee_qualification_df.rename(columns={'yearsofexperience':'years_experience','yearsatcompany':'years_at_company'})
employee_qualification_df.to_sql(name='employee_qualification', con=engine, if_exists='append', index=False)

In [55]:
# loaidng gender table to the database

In [56]:
df['gender']=df['gender'].replace(['Title: Senior Software Engineer'],'Unknown')
df['gender']=df['gender'].fillna("Unknown")

In [57]:
gender_df = pd.DataFrame(df.gender.unique(), columns=['gender'])

In [58]:
gender_df.to_sql(name='gender', con=engine, if_exists='append', index=False)

In [59]:
# loading race table to the database

In [60]:
df['Race']=df['Race'].fillna("Unknown")

In [61]:
race_df = pd.DataFrame(df.Race.unique(), columns=['Race'])

In [62]:
race_df=race_df.rename(columns={'Race':'race'})
race_df.to_sql(name='race', con=engine, if_exists='append', index=False)

In [63]:
# loading employee_demographics table to the database 

In [64]:
employee_demographics=df[['employee_id','gender','Race']]

In [65]:
employee_demographics=employee_demographics.rename(columns={'Race':'race'})
employee_demographics.to_sql(name='employee_demographics', con=engine, if_exists='append', index=False)

In [66]:
# loading level table to the databse by extracting unique job level information from the level attribute
# add a column of incrementing integer numbers to represent the primary key, level_id
# create a list based on the unique level_id and then map the numbers back to the main dataframe

In [67]:
level_df = pd.DataFrame(df.level.unique(), columns=['level'])
level_df['level']=level_df['level'].fillna("Unknown")
level_df.insert(0, 'level_id', range(1, 1 + len(level_df)))

In [68]:
level_df.to_sql(name='level', con=engine, if_exists='append', index=False)

In [69]:
# mapping level_id

In [70]:
df['level']=df['level'].fillna("Unknown")
level_id_list = [level_df.level_id[level_df.level == i].values[0] for i in df.level]
df.insert(1, 'level_id', level_id_list)

In [71]:
# loading employee_level table to database using employee_id and level_id that was just created

In [72]:
employee_level=df[['employee_id','level_id']]

In [73]:
employee_level.to_sql(name='employee_level', con=engine, if_exists='append', index=False)

In [74]:
# loading title table to the databse by extracting unique job title information from the title attribute
# add a column of incrementing integer numbers to represent the primary key, title_id
# create a list based on the unique title_id and then map the numbers back to the main dataframe

In [75]:
title_df = pd.DataFrame(df.title.unique(), columns=['title'])
title_df['title']=title_df['title'].fillna("Unknown")
title_df.insert(0, 'title_id', range(1, 1 + len(title_df)))

In [76]:
title_df.to_sql(name='title', con=engine, if_exists='append', index=False)

In [77]:
# mapping title_id

In [78]:
df['title']=df['title'].fillna("Unknown")
title_id_list = [title_df.title_id[title_df.title == i].values[0] for i in df.title]
df.insert(1, 'title_id', title_id_list)

In [79]:
# loading employee_title table to databse with employee_id and title_id that was just created

In [80]:
employee_title=df[['employee_id','title_id']]

In [81]:
employee_title.to_sql(name='employee_title', con=engine, if_exists='append', index=False)

In [82]:
# loading other table to databse using extracted information from the "otheretails" attribute

In [83]:
other=df[['employee_id','otherdetails']]

In [84]:
other=other.rename(columns={'otherdetails':'other_details'})

In [85]:
other.to_sql(name='other', con=engine, if_exists='append', index=False)

In [88]:
# Pass the SQL statement to filter data
stmt = """

CREATE USER mytest WITH
	LOGIN
	SUPERUSER
	CREATEDB
	CREATEROLE
	INHERIT
	NOREPLICATION
	CONNECTION LIMIT -1
	VALID UNTIL '2025-04-03T11:50:38+05:30' 
	PASSWORD '123456';

"""

# Execute the statement and get the results
connection.execute(stmt)



ProgrammingError: (psycopg2.errors.DuplicateObject) role "mytest" already exists

[SQL: 

CREATE USER mytest WITH
	LOGIN
	SUPERUSER
	CREATEDB
	CREATEROLE
	INHERIT
	NOREPLICATION
	CONNECTION LIMIT -1
	VALID UNTIL '2025-04-03T11:50:38+05:30' 
	PASSWORD '123456';

]
(Background on this error at: http://sqlalche.me/e/13/f405)

## Spot Checks & Validation

In [86]:
# Below is our analysis plan, which aims to first look at average salary information by different variables,
# such as job title in particular, which is of greatest interest of our clients.
# We then plan on taking a deep dive into one specific job title to see how salary varies by other variables 
# such as company, location, education, experience, etc., to provide a 360 view of salary information for our
# clients depending on a specific job title.

In [None]:
#1 Average salary by company

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,company from employee,employee_company,company
where employee.employee_id = employee_company.employee_id and company.company_id = employee_company.company_id
group by company order by Avg_salary DESC

"""

# Execute the statement and get the results
results = connection.execute(stmt).fetchall()

# Extract column names
column_names = results[0].keys()

# Store results in a new dataframe
temp_df = pd.DataFrame(results, columns=column_names)

# Show results
temp_df

In [None]:
#2 Average salary by job title

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,title from employee,employee_title,title
where employee.employee_id = employee_title.employee_id and title.title_id = employee_title.title_id
group by title order by Avg_salary DESC


"""

# Execute the statement and get the results
results1 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results1[0].keys()

# Store results in a new dataframe
temp_df1 = pd.DataFrame(results1, columns=column_names)

# Show results
temp_df1

In [None]:
#3 Average salary by city

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,location_city from employee,employee_company,location
where employee.employee_id = employee_company.employee_id and location.location_id = employee_company.location_id
group by location_city order by Avg_salary DESC limit 20


"""

# Execute the statement and get the results
results2 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results2[0].keys()

# Store results in a new dataframe
temp_df2 = pd.DataFrame(results2, columns=column_names)

# Show results
temp_df2

In [None]:
#4 Average salary by state

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,location_state from employee,employee_company,location
where employee.employee_id = employee_company.employee_id and location.location_id = employee_company.location_id
group by location_state order by Avg_salary DESC limit 20



"""

# Execute the statement and get the results
results3 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results3[0].keys()

# Store results in a new dataframe
temp_df3 = pd.DataFrame(results3, columns=column_names)

# Show results
temp_df3

In [None]:
#5 Level with the highest salary

In [None]:
# Pass the SQL statement to filter data
stmt = """

select max(totalyearlycompensation)as max_salary,company,level from employee,employee_level,level,company,employee_company
where employee.employee_id = employee_level.employee_id and level.level_id = employee_level.level_id and
employee.employee_id=employee_company.employee_id and company.company_id = employee_company.company_id
group by level,company order by max_salary DESC 


"""

# Execute the statement and get the results
results4 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results4[0].keys()

# Store results in a new dataframe
temp_df4 = pd.DataFrame(results4, columns=column_names)

# Show results
temp_df4

In [None]:
# In the following analyses we will focus on exploring data scientist salary by different variables
#6 Data Scientist salary by the company and average working experience

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,avg(years_experience)as Avg_exp,company from 
employee,employee_company,company,employee_title,title,employee_qualification
where employee.employee_id = employee_company.employee_id and company.company_id = employee_company.company_id 
and employee.employee_id = employee_qualification.employee_id
and title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist'  
group by company order by Avg_salary DESC


"""

# Execute the statement and get the results
results5 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results5[0].keys()

# Store results in a new dataframe
temp_df5 = pd.DataFrame(results5, columns=column_names)

# Show results
temp_df5

In [None]:
#7 Data scientist salary by working experience

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,years_experience from employee,employee_title,title,employee_qualification
where employee.employee_id = employee_qualification.employee_id and employee.employee_id=employee_title.employee_id
and title.title_id = employee_title.title_id and title='Data Scientist' 
group by years_experience order by Avg_salary DESC



"""

# Execute the statement and get the results
results6 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results6[0].keys()

# Store results in a new dataframe
temp_df6 = pd.DataFrame(results6, columns=column_names)

# Show results
temp_df6

In [None]:
#8 data scientist salary by tag or specialization

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,tag from employee,employee_title,title,employee_specialization,specialization
where employee.employee_id = employee_specialization.employee_id and employee_specialization.tag_id=specialization.tag_id
and title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist'  
group by tag order by Avg_salary DESC



"""

# Execute the statement and get the results
results7 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results7[0].keys()

# Store results in a new dataframe
temp_df7 = pd.DataFrame(results7, columns=column_names)

# Show results
temp_df7

In [None]:
#9 Data Scientist salary Man VS women

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,gender from employee,employee_title,title,employee_demographics
where title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist' 
and employee_demographics.employee_id=employee.employee_id
group by gender order by Avg_salary DESC



"""

# Execute the statement and get the results
results8 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results8[0].keys()

# Store results in a new dataframe
temp_df8 = pd.DataFrame(results8, columns=column_names)

# Show results
temp_df8

In [None]:
#10 Data Scientist salary race

In [None]:
# Pass the SQL statement to filter data
stmt = """

select avg(totalyearlycompensation)as Avg_salary,race from employee,employee_title,title,employee_demographics
where title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist' 
and employee_demographics.employee_id=employee.employee_id
group by race order by Avg_salary DESC



"""

# Execute the statement and get the results
results9 = connection.execute(stmt).fetchall()

# Extract column names
column_names = results9[0].keys()

# Store results in a new dataframe
temp_df9 = pd.DataFrame(results9, columns=column_names)

# Show results
temp_df9