In [None]:
# pip install sqlalchemy
# pip install psycopg2

In [26]:
# Import python database driver
import psycopg2

# Import config file for password and username (not commited to Github)
from config import username, password

# Import create_engine and Session to interact with PostGres using Alchemy
from sqlalchemy import create_engine
# from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker


# Import Base and features to create tables using ALchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Date, Float, ForeignKey, ForeignKeyConstraint
from datetime import datetime

from sqlalchemy.orm import relationship

# Others
import pandas as pd

# Inspect csv files to be inserted into database

In [27]:
main_table_df = pd.read_csv("Transformed Data/AllJobMarket-Transformed.csv")
main_table_df.head()

Unnamed: 0,job_id,job_title_id,country_id,job_title,company_name,location_id
0,p_ecae2dcad8f17d8b,1,2,Data & Systems Analyst,Protein Industries Canada,161
1,pj_12dccdfbb8ef0da5,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,209
2,pj_7837ad55c28258ea,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,69
3,p_05719d87a0059bf7,1,2,Data and Reporting Analyst,Nunavut Government,106
4,p_bf4bd5f13d04a674,1,2,Specialist-Data Visualization,Canadian Red Cross,72


In [28]:
location_df = pd.read_csv("Transformed Data/Location-coordinates.csv")
location_df.head()

Unnamed: 0,location_id,country_id,city,state,lat,lng
0,161,2,Regina,SK,50.44876,-104.61731
1,209,2,Vaudreuil-Dorion,QC,45.397151,-74.025458
2,69,2,Calgary,AB,51.053423,-114.062589
3,106,2,Iqaluit,NU,63.74944,-68.521857
4,72,2,Canada,Canada,61.066692,-107.991707


In [29]:
country_df = pd.read_csv("Transformed Data/country.csv")
country_df.head()

Unnamed: 0,country_id,country_name
0,1,Singapore
1,2,Canada
2,3,United States
3,4,Australia


In [30]:
job_df = pd.read_csv("Transformed Data/job_title.csv")
job_df.head()

Unnamed: 0,job_title_id,job_title
0,1,Data Analyst
1,2,Data Scientist
2,3,Data Engineer
3,4,Machine Learning


In [31]:
mentalhealth_df = pd.read_csv("Transformed Data/MentalHealth-Transformed.csv")
mentalhealth_df.head()

Unnamed: 0,sample_id,timestamp,age,gender,country_id,state,self_employed,family_history,treatment,work_interfere,...,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical
0,0,2014-08-27 11:29:31,37,Female,3,IL,,No,Yes,Often,...,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes
1,1,2014-08-27 11:29:37,44,Male,3,IN,,No,No,Rarely,...,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know
2,2,2014-08-27 11:29:44,32,Male,2,,,No,No,Rarely,...,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No
3,4,2014-08-27 11:30:22,31,Male,3,TX,,No,No,Never,...,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know
4,5,2014-08-27 11:31:22,33,Male,3,TN,,Yes,No,Sometimes,...,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know


In [32]:
instituition_df = pd.read_csv("Transformed Data/University-Transformed.csv")
instituition_df.rename(columns = {"Unnamed: 0": "institition_id"}, inplace=True)
instituition_df.head()

Unnamed: 0,institition_id,institution,world_rank,country_id,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,0,Harvard University,1,3,1,1,1,1,1,1,1,1.0,2,100.0,2014
1,1,Stanford University,2,3,2,11,2,4,5,3,3,4.0,6,99.09,2014
2,2,Massachusetts Institute of Technology,3,3,3,3,11,2,15,2,2,2.0,1,98.69,2014
3,3,Columbia University,6,3,4,13,8,9,14,13,9,13.0,4,97.41,2014
4,4,"University of California, Berkeley",7,3,5,4,22,6,7,4,3,7.0,28,92.84,2014


# Connect to Database and Create Tables

In [33]:
# Create engine as an interface to PostgreSQL database named Project_2_ETL
db_local = f'postgresql://{username}:{password}@localhost:5432/Project_2_ETL'

# An ElephantSQL link was also imported for cloud database access
# db_link = "postgres://irxrnnfx:8E4uvAlptBYdblhx20hS5t_elOsRbOxm@suleiman.db.elephantsql.com:5432/irxrnnfx"
db = create_engine(db_local)

In [34]:
# Declarative_base use used to create classes and tables relative to that base
Base = declarative_base()

In [35]:
# Define mapped classes on the base, and create tables

class Country(Base):  
    __tablename__ = 'country'
    
    country_id = Column(Integer, primary_key=True)
    country_name = Column(String)
    
class Location(Base):  
    __tablename__ = 'location'
    
    location_id = Column(Integer, primary_key=True)
    city = Column(String)
    state = Column(String)
    country_id = Column(Integer, ForeignKey("country.country_id"))
    lat = Column(Float)
    lng = Column(Float)
    
class Main(Base):  
    __tablename__ = 'maintable'
    
    job_id = Column(String, primary_key=True)
    job_title_id = Column(Integer, ForeignKey("job.job_title_id"))
    country_id = Column (Integer, ForeignKey("country.country_id"))
    job_title = Column(String)
    company_name = Column(String)
    location_id = Column (Integer, ForeignKey("location.location_id"))
      
class Job(Base):  
    __tablename__ = 'job'
    
    job_title_id = Column(Integer, primary_key=True)
    job_title = Column(String)
    
class Instituition(Base):  
    __tablename__ = 'instituition'
    
    institition_id = Column(Integer, primary_key=True)
    world_rank = Column(Integer)
    institution = Column(String)
    country_id = Column(Integer, ForeignKey("country.country_id"))
    national_rank = Column(Integer)
    quality_of_education = Column(Integer)
    alumni_employment = Column(Integer)
    quality_of_faculty = Column(Integer)
    publications = Column(Integer)
    influence = Column(Integer)
    citations = Column(Integer)
    broad_impact = Column(Integer)
    patents = Column(Integer)
    score = Column(Float)
    year = Column(Integer)

class Mentalhealth(Base):  
    __tablename__ = 'mentalhealth'   
    
    sample_id = Column(Integer, primary_key=True)
    timestamp = Column(Date)
    age = Column(Integer)
    gender = Column(String)
    country_id = Column(Integer, ForeignKey("country.country_id"))
    state = Column(String)
    self_employed = Column(String)
    family_history = Column(String)
    treatment = Column(String)
    work_interfere = Column(String)
    no_employees = Column(String)
    remote_work = Column(String)
    tech_company = Column(String)
    benefits = Column(String)
    care_options = Column(String)
    wellness_program = Column(String)
    seek_help = Column(String)
    anonymity = Column(String)
    leave = Column(String)
    mental_health_consequence = Column(String)
    phys_health_consequence = Column(String)
    coworkers = Column(String)
    supervisor = Column(String)
    mental_health_interview = Column(String)
    phys_health_interview = Column(String)
    mental_vs_physical = Column(String) 


In [36]:
# Create session to talk to the database (instead of direct connect and execute)
# A session is a workspace for your objects, at this point we hasnt opened any connections yet until we commit all changes and close the session
Session = sessionmaker(db)  
session = Session()

In [37]:
# The Table is a member of a larger collection called MetaData
# MetaData has the ability to emit a limited set of schema generation commands to database (ex: Create Table statement)
Base.metadata.create_all(db)

In [38]:
job_df.to_sql('job', con=db, index=False, if_exists='append', method='multi')

In [39]:
country_df.to_sql('country', con=db, index=False, if_exists='append',method='multi')

In [40]:
location_df.to_sql('location', con=db, index=False, if_exists='append', method='multi')

In [41]:
# Push dataframes to tables created on database
main_table_df.to_sql('maintable', con=db, index=False, if_exists='append', method='multi')

In [42]:
mentalhealth_df.to_sql('mentalhealth', con=db, index=False, if_exists='append', method='multi')

In [43]:
instituition_df.to_sql('instituition', con=db, index=False, if_exists='append', method='multi')

In [44]:
# Commit and close session
session.commit()
session.close()

In [45]:
# Check if values are commited successfully
db.execute("SELECT * FROM maintable LIMIT 50").fetchall()

[('p_ecae2dcad8f17d8b', 1, 2, 'Data & Systems Analyst', 'Protein Industries Canada', 161),
 ('pj_12dccdfbb8ef0da5', 1, 2, 'Junior Data Analyst - LOCAL | MTL', 'BDP CALL CENTER', 209),
 ('pj_7837ad55c28258ea', 1, 2, 'Pipeline Inline-Inspection Data Analyst (ILI Level 2-3)', 'Onstream Pipeline Inspection Services Inc.', 69),
 ('p_05719d87a0059bf7', 1, 2, 'Data and Reporting Analyst', 'Nunavut Government', 106),
 ('p_bf4bd5f13d04a674', 1, 2, 'Specialist-Data Visualization', 'Canadian Red Cross', 72),
 ('p_13a59e490ff74b5b', 1, 2, 'Irrigation Data Analyst', 'Government of Alberta', 121),
 ('p_342507b44891b778', 1, 2, 'Content Researcher & Analyst', 'Upfeat Media Inc.', 162),
 ('p_badd1ceb510f77f0', 1, 2, 'Data Analyst', 'Paradigm Consulting Group', 161),
 ('p_ee7f4d3dae51aa09', 1, 2, 'Enterprise DBA or Data Analyst', 'Finesse-Tech', 63),
 ('pj_e65431d29d17c9f0', 1, 2, 'Business Analyst', 'LENDCARE', 152),
 ('pj_341545c1296400eb', 1, 2, 'Junior Business Analyst, Inventory Control (Excel Exp

In [46]:
db.execute("SELECT * FROM location LIMIT 50").fetchall()

[(161, 'Regina', 'SK', 2, 50.44876, -104.61731),
 (209, 'Vaudreuil-Dorion', 'QC', 2, 45.3971509, -74.0254577),
 (69, 'Calgary', 'AB', 2, 51.0534234, -114.0625892),
 (106, 'Iqaluit', 'NU', 2, 63.74944, -68.521857),
 (72, 'Canada', 'Canada', 2, 61.0666922, -107.9917071),
 (121, 'Lethbridge', 'AB', 2, 49.694285, -112.851562),
 (162, 'Remote', 'Remote', 2, 0.0, 0.0),
 (63, 'British Columbia', 'British Columbia', 2, 49.2048182, -122.9061329),
 (152, 'Pickering', 'ON', 2, 43.835765, -79.090576),
 (205, 'Toronto', 'ON', 2, 43.6534817, -79.3839347),
 (195, 'Surrey', 'BC', 2, 49.1913033, -122.8491439),
 (43, '100 Mile House', 'BC', 2, 51.6427866, -121.2956943),
 (119, 'Laval', 'QC', 2, 45.5757802, -73.7530656),
 (174, 'Saskatoon', 'SK', 2, 52.131802, -106.660767),
 (126, 'Markham', 'ON', 2, 43.8563707, -79.3376825),
 (144, 'Ontario', 'Ontario', 2, 44.6600802, -76.4905411),
 (159, 'Quebec City', 'QC', 2, 46.8259601, -71.2352226),
 (55, 'Bedford', 'QC', 2, 45.4745788, -73.6242104),
 (132, 'Missis

In [47]:
db.execute("SELECT * FROM country").fetchall()

[(1, 'Singapore'), (2, 'Canada'), (3, 'United States'), (4, 'Australia')]

In [23]:
db.execute("SELECT * FROM job").fetchall()

[(1, 'Data Analyst'),
 (2, 'Data Scientist'),
 (3, 'Data Engineer'),
 (4, 'Machine Learning')]

In [24]:
db.execute("SELECT * FROM mentalhealth LIMIT 50").fetchall()

[(0, datetime.date(2014, 8, 27), 37, 'Female', 3, 'IL', None, 'No', 'Yes', 'Often', '6-25', 'No', 'Yes', 'Yes', 'Not sure', 'No', 'Yes', 'Yes', 'Somewhat easy', 'No', 'No', 'Some of them', 'Yes', 'No', 'Maybe', 'Yes'),
 (1, datetime.date(2014, 8, 27), 44, 'Male', 3, 'IN', None, 'No', 'No', 'Rarely', 'More than 1000', 'No', 'No', "Don't know", 'No', "Don't know", "Don't know", "Don't know", "Don't know", 'Maybe', 'No', 'No', 'No', 'No', 'No', "Don't know"),
 (2, datetime.date(2014, 8, 27), 32, 'Male', 2, None, None, 'No', 'No', 'Rarely', '6-25', 'No', 'Yes', 'No', 'No', 'No', 'No', "Don't know", 'Somewhat difficult', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No'),
 (4, datetime.date(2014, 8, 27), 31, 'Male', 3, 'TX', None, 'No', 'No', 'Never', '100-500', 'Yes', 'Yes', 'Yes', 'No', "Don't know", "Don't know", "Don't know", "Don't know", 'No', 'No', 'Some of them', 'Yes', 'Yes', 'Yes', "Don't know"),
 (5, datetime.date(2014, 8, 27), 33, 'Male', 3, 'TN', None, 'Yes', 'No', 'Sometimes', '6-2

In [25]:
db.execute("SELECT * FROM instituition LIMIT 50").fetchall()

[(0, 1, 'Harvard University', 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 100.0, 2014),
 (1, 2, 'Stanford University', 3, 2, 11, 2, 4, 5, 3, 3, 4, 6, 99.09, 2014),
 (2, 3, 'Massachusetts Institute of Technology', 3, 3, 3, 11, 2, 15, 2, 2, 2, 1, 98.69, 2014),
 (3, 6, 'Columbia University', 3, 4, 13, 8, 9, 14, 13, 9, 13, 4, 97.41, 2014),
 (4, 7, 'University of California, Berkeley', 3, 5, 4, 22, 6, 7, 4, 3, 7, 28, 92.84, 2014),
 (5, 8, 'University of Chicago', 3, 6, 10, 14, 8, 17, 19, 10, 18, 149, 92.03, 2014),
 (6, 9, 'Princeton University', 3, 7, 5, 16, 3, 70, 25, 19, 41, 204, 88.56, 2014),
 (7, 10, 'Yale University', 3, 8, 9, 25, 11, 18, 7, 32, 19, 45, 88.11, 2014),
 (8, 11, 'Cornell University', 3, 9, 12, 18, 19, 23, 15, 23, 23, 12, 85.8, 2014),
 (9, 12, 'California Institute of Technology', 3, 10, 6, 303, 7, 48, 6, 16, 24, 9, 85.5, 2014),
 (10, 14, 'University of Pennsylvania', 3, 11, 21, 4, 26, 8, 17, 13, 10, 28, 79.3, 2014),
 (11, 15, 'University of California, Los Angeles', 3, 12, 27, 27, 12, 