In [2]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

In [2]:
# Read the salary CSV (path relative to the notebook's working directory) into `df`,
# the working DataFrame that the later cells slice, mutate and load into PostgreSQL
df = pd.read_csv('Levels_Fyi_Salary_Data.csv')

In [3]:
# Read the password and database name from the environment instead of pasting
# them into the notebook (the previous placeholder literal was also a syntax
# error because of its nested single quotes). Export PGPASSWORD and PGDATABASE
# before starting the kernel; a missing variable fails fast with a KeyError.
db_password = os.environ['PGPASSWORD']
db_name = os.environ['PGDATABASE']

# Pass the connection string to a variable, conn_url.
# quote_plus escapes characters such as '@' or '/' that would break URL parsing.
conn_url = f'postgresql://postgres:{quote_plus(db_password)}@localhost/{db_name}'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection (reused by the analysis queries further down)
connection = engine.connect()


In [4]:
# DDL for every table of the salary schema. `employee` is the parent table; the
# employee_* tables and `other` reference it through employee_id, and the lookup
# tables (company, location, specialization, education, gender, race, level,
# title) are referenced from those link tables.
stmt = """
create table employee(
	employee_id serial,
	base_salary numeric(10,2),
	bonus_amount numeric(10,2),
	stock_value numeric(10,2),
    totalyearlycompensation numeric(10,2),
	timestamp timestamp NOT NULL,
	primary key (employee_id)
);

create table company(
	company_id serial,
	company varchar(100) NOT NULL,
	primary key (company_id)
);

create table location(
	location_id serial,
	location_city varchar(100),
	location_state varchar(100),
    location_add1 varchar(100),
    location_add2 varchar(100),
	primary key (location_id)
);

create table employee_company(
	employee_id int,
	company_id int,
	location_id int,
	foreign key(employee_id) references employee,
	foreign key(company_id) references company,
	foreign key(location_id) references location,
	primary key (employee_id, company_id, location_id)
);

create table specialization(
	tag_id serial,
	tag varchar(200),
	primary key (tag_id)
);

create table employee_specialization(
	employee_id int,
	tag_id int,
	foreign key(employee_id) references employee,
	foreign key(tag_id) references specialization,
	primary key(employee_id)
);

create table education(
	education_id serial,
	education varchar(50),
	primary key(education_id)
);

create table employee_qualification(
	employee_id int,
	education_id int,
	years_at_company int NOT NULL,
	years_experience int NOT NULL,
	foreign key(employee_id) references employee,
	foreign key(education_id) references education,
	primary key(employee_id)
);

create table gender(
	gender varchar(10),
	check (gender 
			in ('Male','Female', 'Other','Unknown')),
	primary key(gender));
    
create table race(
	race varchar(50) primary key);
    
create table employee_demographics(
	employee_id int primary key,
	gender varchar(10),
	race varchar(50),
	foreign key(employee_id) references employee,
	foreign key(gender) references gender,
	foreign key(race) references race);

create table level(
	level_id serial, 
	level varchar(100),
	primary key (level_id)
);

create table employee_level(
	employee_id int primary key,
	level_id int,
	foreign key(employee_id) references employee,
	foreign key(level_id) references level);

create table title(
	title_id serial primary key,
	title varchar(100) NOT NULL);
    
create table employee_title(
	employee_id int primary key,
	title_id int NOT NULL,
	foreign key(employee_id) references employee,
	foreign key(title_id) references title);
    
create table other(
	employee_id int primary key,
	other_details varchar(1000),
	foreign key(employee_id) references employee);

    """

# Run the DDL in an explicit transaction that commits when the block exits.
# Passing a raw string to Connection.execute relies on SQLAlchemy 1.x legacy
# autocommit and is rejected by 2.x; text() + engine.begin() works on both, and
# the committed tables are visible to the to_sql() calls that use `engine`.
with engine.begin() as ddl_connection:
    ddl_connection.execute(text(stmt))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x103e3ba00>

In [None]:
#load employee

In [5]:
# Give every CSV row a 1-based surrogate key; all link tables below reuse it
df.insert(0, 'employee_id', range(1, len(df) + 1))

# CSV column -> `employee` table column (dict order is the column order written out)
employee_column_map = {
    'employee_id': 'employee_id',
    'basesalary': 'base_salary',
    'bonus': 'bonus_amount',
    'stockgrantvalue': 'stock_value',
    'timestamp': 'timestamp',
    'totalyearlycompensation': 'totalyearlycompensation',
}
employee_df = df[list(employee_column_map)].rename(columns=employee_column_map)

# Append the rows to the existing `employee` table
employee_df.to_sql(name='employee', con=engine, if_exists='append', index=False)

In [9]:
#load company

In [10]:
# Build the company lookup table: one row per distinct company name with a
# 1-based company_id. Missing names are filled BEFORE de-duplicating so that a
# literal "Unknown" in the data and a NaN cannot produce two "Unknown" rows.
company_names = df['company'].fillna("Unknown").unique()
temp_company_df = pd.DataFrame({'company': company_names})
temp_company_df.insert(0, 'company_id', range(1, 1 + len(temp_company_df)))

# Append the rows to the existing `company` table
temp_company_df.to_sql(name='company', con=engine, if_exists='append', index=False)

In [12]:
#map company id

In [13]:
# Use the same "Unknown" placeholder for missing names as the company table does
df['company'] = df['company'].fillna("Unknown")

# Map company name -> company_id with a single vectorised lookup instead of
# scanning temp_company_df once per row. drop_duplicates keeps the first id for
# a repeated name, matching the previous `.values[0]` behaviour.
company_id_by_name = (
    temp_company_df.drop_duplicates(subset='company')
    .set_index('company')['company_id']
)
company_ids = df['company'].map(company_id_by_name)
assert company_ids.notna().all(), "every company in df must exist in temp_company_df"

# Add company_id to the main dataframe (positional values, no index alignment)
df.insert(4, 'company_id', company_ids.to_numpy())

In [16]:
#load location

In [17]:
# Split the comma-separated location string into its four parts
location_columns = ['location_city', 'location_state', 'location_add1', 'location_add2']
location_parts = df['location'].str.split(',', expand=True)
location_parts.columns = location_columns

# Keep one row per distinct location, in first-occurrence order
df_cs = location_parts.drop_duplicates()

# Parts after the first carry the space that followed the comma; trim it
for part_name in location_columns[1:]:
    df_cs[part_name] = df_cs[part_name].str.strip()

# 1-based surrogate key, then append to the existing `location` table
df_cs.insert(0, 'location_id', range(1, len(df_cs) + 1))
df_cs.to_sql(name='location', con=engine, if_exists='append', index=False)

In [26]:
#map location id

In [27]:
# One row per distinct raw location string, numbered in first-occurrence order.
# NOTE(review): these ids are generated independently of the ids written to the
# `location` table; they agree only because both cells de-duplicate the same
# column in the same order. Verify if either cell changes.
# (The former `df_cs.columns = ['location']` was dropped: on a Series it only
# sets a stray attribute, and the column name already comes from the Series name.)
df_cs = df['location'].drop_duplicates().to_frame()
df_cs.insert(0, 'location_id', range(1, 1 + len(df_cs)))

# Attach location_id to every employee row, then restore employee order
df = pd.merge(df, df_cs, on='location')
df = df.sort_values(by=['employee_id'])

In [34]:
#employee_company

In [35]:
# Link table: one (employee, company, location) triple per employee row
employee_company_columns = ['employee_id', 'company_id', 'location_id']
employee_company_df = df.loc[:, employee_company_columns]

# Append the rows to the existing `employee_company` table
employee_company_df.to_sql(name='employee_company', con=engine, if_exists='append', index=False)

In [37]:
# specialization

In [38]:
# Build the specialization lookup table: one row per distinct tag with a 1-based
# tag_id. Missing tags are filled BEFORE de-duplicating so that a literal
# "Unknown" tag and a NaN cannot produce two "Unknown" rows.
tag_names = df['tag'].fillna("Unknown").unique()
specialization_df = pd.DataFrame({'tag': tag_names})
specialization_df.insert(0, 'tag_id', range(1, 1 + len(specialization_df)))

# Append the rows to the existing `specialization` table
specialization_df.to_sql(name='specialization', con=engine, if_exists='append', index=False)

In [40]:
#map specialization

In [41]:
# Use the same "Unknown" placeholder for missing tags as the specialization table does
df['tag'] = df['tag'].fillna("Unknown")

# Map tag -> tag_id with a single vectorised lookup instead of scanning
# specialization_df once per row; the first id wins for a repeated tag,
# matching the previous `.values[0]` behaviour.
tag_id_by_name = (
    specialization_df.drop_duplicates(subset='tag')
    .set_index('tag')['tag_id']
)
tag_ids = df['tag'].map(tag_id_by_name)
assert tag_ids.notna().all(), "every tag in df must exist in specialization_df"
df.insert(1, 'tag_id', tag_ids.to_numpy())

# employee_specialization: one (employee_id, tag_id) row per employee
employee_specialization_df = df[['employee_id', 'tag_id']]
employee_specialization_df.to_sql(name='employee_specialization', con=engine, if_exists='append', index=False)

In [46]:
#education

In [47]:
# Build the education lookup table: one row per distinct Education value with a
# 1-based education_id, stored under the lower-case column name the SQL table
# uses. Missing values are filled BEFORE de-duplicating so that a literal
# "Unknown" and a NaN cannot produce two "Unknown" rows.
education_names = df['Education'].fillna("Unknown").unique()
education_df = pd.DataFrame({'education': education_names})
education_df.insert(0, 'education_id', range(1, 1 + len(education_df)))

# Append the rows to the existing `education` table
education_df.to_sql(name='education', con=engine, if_exists='append', index=False)

In [51]:
#map education

In [52]:
# Use the same "Unknown" placeholder for missing values as the education table does
df['Education'] = df['Education'].fillna("Unknown")

# Map Education -> education_id with a single vectorised lookup instead of
# scanning education_df once per row; the first id wins for a repeated value,
# matching the previous `.values[0]` behaviour.
education_id_by_name = (
    education_df.drop_duplicates(subset='education')
    .set_index('education')['education_id']
)
education_ids = df['Education'].map(education_id_by_name)
assert education_ids.notna().all(), "every Education value in df must exist in education_df"
df.insert(1, 'education_id', education_ids.to_numpy())

# employee_qualification: education plus the two tenure columns, renamed to the SQL column names
employee_qualification_df = df[['employee_id', 'education_id', 'yearsofexperience', 'yearsatcompany']]
employee_qualification_df = employee_qualification_df.rename(
    columns={'yearsofexperience': 'years_experience', 'yearsatcompany': 'years_at_company'}
)
employee_qualification_df.to_sql(name='employee_qualification', con=engine, if_exists='append', index=False)

In [55]:
#gender

In [56]:
# Normalise the gender column: the stray 'Title: Senior Software Engineer'
# value and missing entries both become 'Unknown'
gender_clean = df['gender'].replace(['Title: Senior Software Engineer'], 'Unknown')
df['gender'] = gender_clean.fillna("Unknown")

# Lookup table holding each distinct gender value once
gender_df = pd.DataFrame({'gender': df['gender'].unique()})

# Append the rows to the existing `gender` table
gender_df.to_sql(name='gender', con=engine, if_exists='append', index=False)

In [59]:
#race

In [60]:
# Missing race entries become 'Unknown'
df['Race'] = df['Race'].fillna("Unknown")

# Lookup table holding each distinct race once, under the SQL column name `race`
race_df = pd.DataFrame({'race': df['Race'].unique()})

# Append the rows to the existing `race` table
race_df.to_sql(name='race', con=engine, if_exists='append', index=False)

In [63]:
#employee_demographics

In [64]:
# One demographics row per employee; `Race` is renamed to the SQL column name `race`
employee_demographics = (
    df[['employee_id', 'gender', 'Race']]
    .rename(columns={'Race': 'race'})
)

# Append the rows to the existing `employee_demographics` table
employee_demographics.to_sql(name='employee_demographics', con=engine, if_exists='append', index=False)

In [66]:
#level

In [67]:
# Build the level lookup table: one row per distinct level with a 1-based
# level_id. Missing levels are filled BEFORE de-duplicating so that a literal
# "Unknown" level and a NaN cannot produce two "Unknown" rows.
level_names = df['level'].fillna("Unknown").unique()
level_df = pd.DataFrame({'level': level_names})
level_df.insert(0, 'level_id', range(1, 1 + len(level_df)))

# Append the rows to the existing `level` table
level_df.to_sql(name='level', con=engine, if_exists='append', index=False)

In [69]:
#map level

In [70]:
# Use the same "Unknown" placeholder for missing levels as the level table does
df['level'] = df['level'].fillna("Unknown")

# Map level -> level_id with a single vectorised lookup instead of scanning
# level_df once per row; the first id wins for a repeated level, matching the
# previous `.values[0]` behaviour.
level_id_by_name = (
    level_df.drop_duplicates(subset='level')
    .set_index('level')['level_id']
)
level_ids = df['level'].map(level_id_by_name)
assert level_ids.notna().all(), "every level in df must exist in level_df"
df.insert(1, 'level_id', level_ids.to_numpy())

# employee_level: one (employee_id, level_id) row per employee
employee_level = df[['employee_id', 'level_id']]
employee_level.to_sql(name='employee_level', con=engine, if_exists='append', index=False)

In [74]:
#title

In [75]:
# Build the title lookup table: one row per distinct job title with a 1-based
# title_id. Missing titles are filled BEFORE de-duplicating so that a literal
# "Unknown" title and a NaN cannot produce two "Unknown" rows.
title_names = df['title'].fillna("Unknown").unique()
title_df = pd.DataFrame({'title': title_names})
title_df.insert(0, 'title_id', range(1, 1 + len(title_df)))

# Append the rows to the existing `title` table
title_df.to_sql(name='title', con=engine, if_exists='append', index=False)

In [77]:
#map title

In [78]:
# Use the same "Unknown" placeholder for missing titles as the title table does
df['title'] = df['title'].fillna("Unknown")

# Map title -> title_id with a single vectorised lookup instead of scanning
# title_df once per row; the first id wins for a repeated title, matching the
# previous `.values[0]` behaviour.
title_id_by_name = (
    title_df.drop_duplicates(subset='title')
    .set_index('title')['title_id']
)
title_ids = df['title'].map(title_id_by_name)
assert title_ids.notna().all(), "every title in df must exist in title_df"
df.insert(1, 'title_id', title_ids.to_numpy())

# employee_title: one (employee_id, title_id) row per employee
employee_title = df[['employee_id', 'title_id']]
employee_title.to_sql(name='employee_title', con=engine, if_exists='append', index=False)

In [82]:
#other

In [83]:
# Free-text details per employee; `otherdetails` is renamed to the SQL column name
other = (
    df[['employee_id', 'otherdetails']]
    .rename(columns={'otherdetails': 'other_details'})
)

# Append the rows to the existing `other` table
other.to_sql(name='other', con=engine, if_exists='append', index=False)

In [None]:
#1 Average salary by company

In [10]:
# Average total yearly compensation per company, highest first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,company from employee,employee_company,company
where employee.employee_id = employee_company.employee_id and company.company_id = employee_company.company_id
group by company order by Avg_salary DESC

"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results = result.fetchall()

# Store results in a new dataframe
temp_df = pd.DataFrame(results, columns=column_names)

# Show results
temp_df

Unnamed: 0,avg_salary,company
0,1483000.000000000000,Coupa software
1,700000.000000000000,Cloudkitchens
2,680000.000000000000,amplitude
3,593500.000000000000,Doordash
4,539000.000000000000,synaptics
...,...,...
1627,15000.0000000000000000,verkada
1628,14000.0000000000000000,Nice
1629,13000.0000000000000000,TATA Consultancy Services
1630,12600.000000000000,Bukalapak


In [None]:
#2 Average salary by job title

In [9]:
# Average total yearly compensation per job title, highest first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,title from employee,employee_title,title
where employee.employee_id = employee_title.employee_id and title.title_id = employee_title.title_id
group by title order by Avg_salary DESC


"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results1[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results1 = result.fetchall()

# Store results in a new dataframe
temp_df1 = pd.DataFrame(results1, columns=column_names)

# Show results
temp_df1

Unnamed: 0,avg_salary,title
0,354635.75231157185,Software Engineering Manager
1,257813.39610528568,Product Manager
2,237099.92758870384,Technical Program Manager
3,214273.31887201732,Sales
4,213655.0,Hardware Engineer
5,212736.38720829727,Solution Architect
6,207637.2031662269,Product Designer
7,205403.9922873566,Software Engineer
8,203656.71062839407,Data Scientist
9,198971.83098591547,Marketing


In [None]:
#3 Average salary by city

In [8]:
# Top 20 cities by average total yearly compensation
stmt = """

select avg(totalyearlycompensation)as Avg_salary,location_city from employee,employee_company,location
where employee.employee_id = employee_company.employee_id and location.location_id = employee_company.location_id
group by location_city order by Avg_salary DESC limit 20


"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results2[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results2 = result.fetchall()

# Store results in a new dataframe
temp_df2 = pd.DataFrame(results2, columns=column_names)

# Show results
temp_df2

Unnamed: 0,avg_salary,location_city
0,650000.0,Aspen
1,605000.0,Chapel Hill
2,480000.0,Highland Park
3,479185.8407079646,Los Gatos
4,444000.0,Wimborne Minster
5,436000.0,Los Altos
6,390000.0,Nazareth Illit
7,388000.0,Sammamish
8,380000.0,Ellicott City
9,380000.0,Menomonie


In [None]:
#4 Average salary by state

In [11]:
# Top 20 states by average total yearly compensation
stmt = """

select avg(totalyearlycompensation)as Avg_salary,location_state from employee,employee_company,location
where employee.employee_id = employee_company.employee_id and location.location_id = employee_company.location_id
group by location_state order by Avg_salary DESC limit 20



"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results3[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results3 = result.fetchall()

# Store results in a new dataframe
temp_df3 = pd.DataFrame(results3, columns=column_names)

# Show results
temp_df3

Unnamed: 0,avg_salary,location_state
0,390000.0,HZ
1,360000.0,LB
2,271510.7562215212,CA
3,249226.51933701657,ZH
4,245072.3256717384,WA
5,227358.48356309647,NY
6,213750.0,GE
7,212000.0,BS
8,194000.0,HI
9,192501.42450142448,MA


In [None]:
#5 Level with the highest salary

In [12]:
# Maximum total yearly compensation for each (company, level) pair, highest first
stmt = """

select max(totalyearlycompensation)as max_salary,company,level from employee,employee_level,level,company,employee_company
where employee.employee_id = employee_level.employee_id and level.level_id = employee_level.level_id and
employee.employee_id=employee_company.employee_id and company.company_id = employee_company.company_id
group by level,company order by max_salary DESC 


"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results4[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results4 = result.fetchall()

# Store results in a new dataframe
temp_df4 = pd.DataFrame(results4, columns=column_names)

# Show results
temp_df4

Unnamed: 0,max_salary,company,level
0,4980000,Facebook,E9
1,4950000,Microsoft,80
2,4500000,Google,L10
3,2500000,Snap,L8
4,2372000,Facebook,D1
...,...,...,...
13543,10000,Bank of America Merrill Lynch,H7
13544,10000,Infosys,Fresher
13545,10000,Andela,2
13546,10000,Virtusa,Entry


In [None]:
#6 Data Scientist average salary and average years of experience, by company

In [13]:
# Data Scientists only: average total yearly compensation and average years of
# experience per company, highest salary first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,avg(years_experience)as Avg_exp,company from 
employee,employee_company,company,employee_title,title,employee_qualification
where employee.employee_id = employee_company.employee_id and company.company_id = employee_company.company_id 
and employee.employee_id = employee_qualification.employee_id
and title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist'  
group by company order by Avg_salary DESC


"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results5[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results5 = result.fetchall()

# Store results in a new dataframe
temp_df5 = pd.DataFrame(results5, columns=column_names)

# Show results
temp_df5

Unnamed: 0,avg_salary,avg_exp,company
0,556000.000000000000,3.0000000000000000,LivePerson
1,494000.000000000000,8.5000000000000000,Roblox
2,484100.000000000000,5.9000000000000000,Netflix
3,477000.000000000000,4.0000000000000000,Slack
4,421200.000000000000,7.1000000000000000,Instacart
...,...,...,...
462,25000.000000000000,5.0000000000000000,GE
463,25000.000000000000,6.0000000000000000,Sony
464,23000.000000000000,8.0000000000000000,GlobalLogic
465,16000.0000000000000000,1.00000000000000000000,ZS Associates


In [None]:
#7 Data scientist salary by working experience

In [14]:
# Data Scientists only: average total yearly compensation per years of
# experience, highest salary first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,years_experience from employee,employee_title,title,employee_qualification
where employee.employee_id = employee_qualification.employee_id and employee.employee_id=employee_title.employee_id
and title.title_id = employee_title.title_id and title='Data Scientist' 
group by years_experience order by Avg_salary DESC



"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results6[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results6 = result.fetchall()

# Store results in a new dataframe
temp_df6 = pd.DataFrame(results6, columns=column_names)

# Show results
temp_df6

Unnamed: 0,avg_salary,years_experience
0,477500.0,22
1,380000.0,30
2,337880.9523809524,12
3,313000.0,21
4,303166.6666666666,17
5,300888.8888888889,16
6,300000.0,19
7,289619.0476190476,20
8,283756.75675675675,11
9,276111.1111111111,15


In [None]:
#8 data scientist salary by tag

In [15]:
# Data Scientists only: average total yearly compensation per specialization
# tag, highest salary first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,tag from employee,employee_title,title,employee_specialization,specialization
where employee.employee_id = employee_specialization.employee_id and employee_specialization.tag_id=specialization.tag_id
and title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist'  
group by tag order by Avg_salary DESC



"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results7[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results7 = result.fetchall()

# Store results in a new dataframe
temp_df7 = pd.DataFrame(results7, columns=column_names)

# Show results
temp_df7

Unnamed: 0,avg_salary,tag
0,757000.000000000000,"ML, AI"
1,700000.000000000000,Product Data Science
2,630000.000000000000,Content
3,592000.000000000000,ML/Analytics
4,578000.000000000000,ml
...,...,...
332,29000.000000000000,SAP
333,29000.000000000000,Recommender Systems
334,22500.000000000000,Python
335,20000.000000000000,BixBy


In [None]:
#9 Data Scientist average salary by gender (men vs. women)

In [16]:
# Data Scientists only: average total yearly compensation per gender, highest first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,gender from employee,employee_title,title,employee_demographics
where title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist' 
and employee_demographics.employee_id=employee.employee_id
group by gender order by Avg_salary DESC



"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results8[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results8 = result.fetchall()

# Store results in a new dataframe
temp_df8 = pd.DataFrame(results8, columns=column_names)

# Show results
temp_df8

Unnamed: 0,avg_salary,gender
0,268181.8181818182,Other
1,210326.968973747,Unknown
2,202107.59027266028,Male
3,190063.71191135733,Female


In [None]:
#10 Data Scientist average salary by race

In [17]:
# Data Scientists only: average total yearly compensation per race, highest first
stmt = """

select avg(totalyearlycompensation)as Avg_salary,race from employee,employee_title,title,employee_demographics
where title.title_id = employee_title.title_id and employee.employee_id=employee_title.employee_id and title='Data Scientist' 
and employee_demographics.employee_id=employee.employee_id
group by race order by Avg_salary DESC



"""

# Execute through text() (accepted by SQLAlchemy 1.4 and 2.x) and take the
# column names from the result itself, so an empty result set no longer raises
# IndexError on results9[0]
result = connection.execute(text(stmt))
column_names = list(result.keys())
results9 = result.fetchall()

# Store results in a new dataframe
temp_df9 = pd.DataFrame(results9, columns=column_names)

# Show results
temp_df9

Unnamed: 0,avg_salary,race
0,210321.42857142855,Two Or More
1,209388.5918003565,Unknown
2,200350.15772870663,White
3,188524.75247524751,Asian
4,184593.75,Hispanic
5,162615.3846153846,Black
