In [26]:
import ast
import json
import os

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [45]:
# Connection settings for the PostgreSQL database. Each value can be overridden
# through the standard libpq environment variables so credentials do not have
# to live in the notebook; the fallbacks are the local development values.
db_params = {
    "database": os.environ.get("PGDATABASE", "job_market"),
    "user": os.environ.get("PGUSER", "admin"),
    "password": os.environ.get("PGPASSWORD", "root"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5432"),
}

conn = psycopg2.connect(**db_params)
# Autocommit: each statement is committed as soon as it executes, so there is
# no surrounding transaction to roll back if a later statement fails.
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()

In [50]:
def get_or_create_company(cur, company_data):
    """Return the ``companyId`` for a company, inserting the row if it is missing.

    Args:
        cur: A DB-API cursor (``execute`` / ``fetchone``) on the target database.
        company_data: Either a dict, or the textual Python-literal form of a
            dict (e.g. ``"{'name': 'Acme', 'sector': 'Tech'}"``). The ``name``
            key is required; ``location`` and ``sector`` are optional.

    Returns:
        The ``companyId`` of the existing or newly inserted ``Companies`` row.

    Raises:
        ValueError: If ``company_data`` is not a valid literal or does not
            describe a dict.
        SyntaxError: If ``company_data`` is a string that is not valid Python
            literal syntax.
        KeyError: If the parsed dict has no ``name`` key.
    """
    if isinstance(company_data, dict):
        company = company_data
    else:
        # SECURITY: this value comes from a scraped CSV. The previous eval()
        # would execute any expression embedded in the file; literal_eval only
        # accepts plain literals (dict, list, str, numbers, None, ...).
        company = ast.literal_eval(company_data)

    if not isinstance(company, dict):
        raise ValueError(
            "company_data must describe a dict, got %s" % type(company).__name__
        )

    cur.execute("SELECT companyId FROM Companies WHERE companyName = %s;", (company['name'],))
    result = cur.fetchone()
    if result:
        return result[0]

    # Not found: create the company. ``information`` is stored as an empty string.
    cur.execute(
        "INSERT INTO Companies (companyName, location, sector, information) VALUES (%s, %s, %s, %s) RETURNING companyId;",
        (company['name'], company.get('location'), company.get('sector'), ''),
    )
    return cur.fetchone()[0]

def get_or_create_source(cur, source_name):
    """Look up a source by name and return its ``sourceId``, creating it if absent.

    Args:
        cur: A DB-API cursor (``execute`` / ``fetchone``) on the target database.
        source_name: Value matched against, or inserted into, ``Source.sourceName``.

    Returns:
        The ``sourceId`` of the matching or newly inserted ``Source`` row.
    """
    name_param = (source_name,)

    cur.execute("SELECT sourceId FROM Source WHERE sourceName = %s;", name_param)
    existing = cur.fetchone()
    if existing:
        return existing[0]

    # No matching row: insert one and hand back the generated id.
    cur.execute("INSERT INTO Source (sourceName) VALUES (%s) RETURNING sourceId;", name_param)
    inserted = cur.fetchone()
    return inserted[0]

In [56]:

csv_file_path = '../web_scrapping_wttj/output/job_offers_wttj.csv'

# Load the scraped offers and drop rows without company data: every offer is
# linked to a company row, which is resolved from that column below.
df = pd.read_csv(csv_file_path)
df = df[~df.company.isna()].reset_index(drop=True)

# Missing or unparseable dates become NaT (errors='coerce') instead of raising.
for date_column in ('starting_date', 'publication_date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce', format='%Y-%m-%d')

# Replace every missing value (NaN / NaT) with None so it is stored as SQL NULL.
# The cast to object must come first: on numeric or datetime columns recent
# pandas versions coerce None straight back to NaN / NaT, which made the plain
# df.where(pd.notnull(df), None) ineffective for those columns.
df = df.astype(object).where(df.notna(), None)

insert_job_offer_sql = "INSERT INTO job_offers (title, companyId, salary, remoteType, contractType, startingDate, location, require_exp, education, descriptions, profil_exp, publicationDate, jobLink, sourceId) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"

try:
    for _, row in df.iterrows():
        company_id = get_or_create_company(cur, row['company'])
        source_id = get_or_create_source(cur, row['source'])

        # Order must match the column list in insert_job_offer_sql.
        job_offer_data = (
            row['title'],
            company_id,
            row['salary'],
            row['remote_type'],
            row['contract_type'],
            row['starting_date'].date() if not pd.isnull(row['starting_date']) else None,
            row['location'],
            row['require_experience'],
            row['education'],
            row['description'],
            row['profil_experience'],
            row['publication_date'].date() if not pd.isnull(row['publication_date']) else None,
            row['url_direct_offer'],
            source_id
        )
        cur.execute(insert_job_offer_sql, job_offer_data)

    # Effectively a no-op when the connection is in autocommit mode; kept so
    # the load still persists if autocommit is ever turned off.
    conn.commit()
finally:
    # Release the cursor and connection even when a row fails to insert.
    cur.close()
    conn.close()