In [1]:
import json 
import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [2]:
db_params = {
    "database": "job_market",
    "user": "admin",
    "password": "root",
    "host": "localhost",
    "port": "5432"
}

conn = psycopg2.connect(**db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()

In [3]:
def get_or_create_company(cur, company_data):
    company = eval(company_data)
    cur.execute("SELECT companyId FROM Companies WHERE companyName = %s;", (company['name'],))
    result = cur.fetchone()
    if result:
        return result[0]
    else:
        cur.execute("INSERT INTO Companies (companyName, location, sector, information) VALUES (%s, %s, %s, %s) RETURNING companyId;",
                    (company['name'], company.get('location'), company.get('sector'), ''))
        return cur.fetchone()[0]

def get_or_create_source(cur, source_name):
    cur.execute("SELECT sourceId FROM Sources WHERE sourceName = %s;", (source_name,))
    result = cur.fetchone()
    if result:
        return result[0]
    else:
        cur.execute("INSERT INTO Sources (sourceName) VALUES (%s) RETURNING sourceId;", (source_name,))
        return cur.fetchone()[0]

In [5]:

job_offer_adzuna = pd.read_csv('../output/job_offers_adzuna.csv')
job_offer_wttj = pd.read_csv('../output/job_offers_wttj.csv')

df = pd.concat([job_offer_adzuna, job_offer_wttj], ignore_index=True)
df = df[~df.company.isna()].reset_index(drop=True)

df = df.assign(
	starting_date=pd.to_datetime(df.starting_date, format='%d %B %Y')
)
df['starting_date'] = df['starting_date'].dt.date

df['publication_date'] = pd.to_datetime(df['publication_date'], errors='coerce', format='%Y-%m-%d')

df = df.where(pd.notnull(df), None)

for _, row in df.iterrows():
    company_id = get_or_create_company(cur, row['company'])
    source_id = get_or_create_source(cur, row['source'])

    job_offer_data = (
        row['title'],
        company_id,
        row['salary'],
        row['remote_type'],
        row['contract_type'],
        row['starting_date'] if not pd.isnull(row['starting_date']) else None,
        row['location'],
        row['required_experience'],
        row['education'],
        row['description'],
        row['profil_experience'],
        row['publication_date'].date() if not pd.isnull(row['publication_date']) else None,
        row['url_direct_offer'],
        source_id
    )
    cur.execute("INSERT INTO JobOffers (title, companyId, salary, remoteType, contractType, startingDate, location, requiredExp, education, descriptions, profilExp, publicationDate, jobLink, sourceId) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);", job_offer_data)



ValueError: time data "05 février 2024" doesn't match format "%d %B %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [21]:
# for the Skills table

skills_list = ['API', 'AWS', 'Airflow', 'Bash', 'Cassandra', 'Docker', 'ElasticSearch', 'FastAPI', 'Flask', 'Flask', 'Flume', 'GCP', 'Git', 'Go', 'Hadoop', 'Hbase', 'Hive', 'Java', 'Java Spark', 'Julia', 'Kafka', 'Kotlin', 'Kubernetes', 'Matlib', 'Microsoft Azure', 'MongoDb', 'Neo4j', 'NoSQL', 'Perl', 'PySpark', 'Python', 'R', 'Redshift', 'SQL', 'Scala', 'SckitLearn', 'Sklearn', 'Snowflake', 'Spark', 'Spark Structured Streaming', 'Terradata']
#skills_dict = dict.fromkeys(skills_list,0)
#print(skills_dict)


for _, row in df.iterrows():
    description = (row['description'])
    skill_data = []
    for skill in skills_list:
        if skill in description:
            skill_data.append(skill)
    cur.execute("INSERT INTO Skills (skills) VALUES (%s);", skill_data)



from collections import Counter

#print(type(str(df["description"])))

word_freq = Counter(str(df["description"]).split()).most_common()
print(len(word_freq))
print(type(word_freq))


df["description"].to_csv('tmp_descriptions.csv', index=False)

#for _, row in df.iterrows():


{'API': 0, 'AWS': 0, 'Airflow': 0, 'Bash': 0, 'Cassandra': 0, 'Docker': 0, 'ElasticSearch': 0, 'FastAPI': 0, 'Flask': 0, 'Flume': 0, 'GCP': 0, 'Git': 0, 'Go': 0, 'Hadoop': 0, 'Hbase': 0, 'Hive': 0, 'Java': 0, 'Java Spark': 0, 'Julia': 0, 'Kafka': 0, 'Kotlin': 0, 'Kubernetes': 0, 'Matlib': 0, 'Microsoft Azure': 0, 'MongoDb': 0, 'Neo4j': 0, 'NoSQL': 0, 'Perl': 0, 'PySpark': 0, 'Python': 0, 'R': 0, 'Redshift': 0, 'SQL': 0, 'Scala': 0, 'SckitLearn': 0, 'Sklearn': 0, 'Snowflake': 0, 'Spark': 0, 'Spark Structured Streaming': 0, 'Terradata': 0}
71
<class 'list'>


In [20]:
list_to_sort = ["SQL",
"Bash",
"Git",
"NoSQL",
"Redshift",
"Terradata",
"Cassandra",
"Spark",
"Hadoop",
"Kafka",
"Hbase",
"Hive",
"Microsoft Azure",
"AWS",
"GCP",
"Python",
"Java",
"Java Spark",
"PySpark",
"Go",
"Scala",
"Julia",
"Perl",
"MongoDb",
"ElasticSearch",
"Flume",
"Docker",
"API",
"Flask",
"Airflow",
"Kubernetes",
"SckitLearn",
"Kotlin",
"Spark Structured Streaming",
"R",
"Neo4j",
"Sklearn",
"Matlib",
"FastAPI",
"Flask",
"Snowflake"]
print(sorted(list_to_sort))

['API', 'AWS', 'Airflow', 'Bash', 'Cassandra', 'Docker', 'ElasticSearch', 'FastAPI', 'Flask', 'Flask', 'Flume', 'GCP', 'Git', 'Go', 'Hadoop', 'Hbase', 'Hive', 'Java', 'Java Spark', 'Julia', 'Kafka', 'Kotlin', 'Kubernetes', 'Matlib', 'Microsoft Azure', 'MongoDb', 'Neo4j', 'NoSQL', 'Perl', 'PySpark', 'Python', 'R', 'Redshift', 'SQL', 'Scala', 'SckitLearn', 'Sklearn', 'Snowflake', 'Spark', 'Spark Structured Streaming', 'Terradata']


In [39]:
#couper la connexion

conn.commit()
cur.close()
conn.close()