In [50]:
import ast
import json
import os

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [51]:
# Connection settings are read from the standard PG* environment variables so
# that real credentials never have to be committed with the notebook. The
# fallbacks are the previous hard-coded values and are only suitable for a
# throwaway local development database.
db_params = {
    "database": os.environ.get("PGDATABASE", "job_market"),
    "user": os.environ.get("PGUSER", "admin"),
    "password": os.environ.get("PGPASSWORD", "root"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5432"),
}

conn = psycopg2.connect(**db_params)
# Autocommit: every statement is committed as soon as it is executed.
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()

In [52]:
def get_or_create_company(cur, company_data):
    """Return the id of a company, inserting the company first if it is unknown.

    Args:
        cur: Open database cursor exposing ``execute`` / ``fetchone`` and
            accepting ``%s`` placeholders.
        company_data: Either a dict, or the string form of a Python dict
            literal (e.g. ``"{'name': 'Acme', 'location': 'Paris'}"``).
            ``'name'`` is required; ``'location'`` and ``'sector'`` are
            optional and stored as NULL when absent.

    Returns:
        The ``companyId`` of the existing or newly inserted row.

    Raises:
        ValueError, SyntaxError: ``company_data`` is a string that is not a
            valid Python literal.
        TypeError: the parsed value is not a dict.
        KeyError: the dict has no ``'name'`` entry.
    """
    if isinstance(company_data, dict):
        company = company_data
    else:
        # The string comes from scraped CSV content, so it must never be
        # executed: literal_eval only accepts plain literals, unlike eval().
        company = ast.literal_eval(company_data)
    if not isinstance(company, dict):
        raise TypeError(
            f"company_data must describe a dict, got {type(company).__name__}"
        )

    cur.execute("SELECT companyId FROM Companies WHERE companyName = %s;", (company['name'],))
    result = cur.fetchone()
    if result:
        return result[0]

    # Unknown company: create it. 'information' is stored as an empty string.
    cur.execute("INSERT INTO Companies (companyName, location, sector, information) VALUES (%s, %s, %s, %s) RETURNING companyId;",
                (company['name'], company.get('location'), company.get('sector'), ''))
    return cur.fetchone()[0]

def get_or_create_source(cur, source_name):
    """Fetch the sourceId matching ``source_name``, creating the row on a miss.

    Args:
        cur: Open database cursor exposing ``execute`` / ``fetchone``.
        source_name: Value matched against (and, if needed, stored in)
            ``Sources.sourceName``.

    Returns:
        The ``sourceId`` of the row that was found or just inserted.
    """
    name_param = (source_name,)

    cur.execute("SELECT sourceId FROM Sources WHERE sourceName = %s;", name_param)
    source_row = cur.fetchone()

    if not source_row:
        # Nothing stored under that name yet: insert it and read the new id back.
        cur.execute("INSERT INTO Sources (sourceName) VALUES (%s) RETURNING sourceId;", name_param)
        source_row = cur.fetchone()

    return source_row[0]

In [53]:

# Load the scraped offers from both sources and stack them into one frame.
job_offer_adzuna = pd.read_csv('../web_scrapping_wttj/output/job_offers_adzuna.csv')
job_offer_wttj = pd.read_csv('../web_scrapping_wttj/output/job_offers_wttj.csv')

df = pd.concat([job_offer_adzuna, job_offer_wttj], ignore_index=True)
# An offer without company data cannot be linked to a Companies row: drop it.
df = df[~df.company.isna()].reset_index(drop=True)

# starting_date is parsed strictly (day, full month name, year): a value in
# any other format raises here instead of being silently discarded.
df = df.assign(
    starting_date=pd.to_datetime(df.starting_date, format='%d %B %Y')
)
df['starting_date'] = df['starting_date'].dt.date

# publication_date is parsed leniently: unparseable values become NaT.
df['publication_date'] = pd.to_datetime(df['publication_date'], errors='coerce', format='%Y-%m-%d')

df = df.where(pd.notnull(df), None)

# Depending on the pandas version, the line above leaves NaN/NaT in place in
# float and datetime columns because those dtypes cannot hold None. Build an
# object-typed copy for the inserts so every missing value really is None and
# therefore reaches PostgreSQL as NULL rather than as a NaN value.
offers_to_insert = df.astype(object).where(pd.notnull(df), None)

try:
    for _, row in offers_to_insert.iterrows():
        company_id = get_or_create_company(cur, row['company'])
        source_id = get_or_create_source(cur, row['source'])

        job_offer_data = (
            row['title'],
            company_id,
            row['salary'],
            row['remote_type'],
            row['contract_type'],
            row['starting_date'] if not pd.isnull(row['starting_date']) else None,
            row['location'],
            row['required_experience'],
            row['education'],
            row['description'],
            row['profil_experience'],
            row['publication_date'].date() if not pd.isnull(row['publication_date']) else None,
            row['url_direct_offer'],
            source_id
        )
        cur.execute("INSERT INTO JobOffers (title, companyId, salary, remoteType, contractType, startingDate, location, requiredExp, education, descriptions, profilExp, publicationDate, jobLink, sourceId) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", job_offer_data)

    # A no-op when the connection is in autocommit mode; kept so the load is
    # still committed if the connection is ever opened without autocommit.
    conn.commit()
finally:
    # Release the cursor and connection even when an insert fails part-way.
    cur.close()
    conn.close()

In [6]:
# for Skills table