In [109]:
import datetime
import os
import re

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

# Step 1: Extract (Web Scraping)

In [123]:
# GulfTalent search-results page for the keyword "data".
# NOTE(review): everything after '#' is a URL fragment; presumably only the part
# before it is sent to the server, so the fragment filters may have no effect — confirm.
URL = (
    'https://www.gulftalent.com/jobs/search'
    '?pos_ref=data&frmPositionCountry='
    '#!?category=&industry=&seniority=&country=&city='
    '&employment_type=&has_external_application=&keyword=data'
)

In [124]:
# Download the search-results page. URL is the upper-case constant defined in
# the previous cell; a timeout keeps the cell from hanging on a stalled connection.
r = requests.get(URL, timeout=30)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, 'html.parser')

# Each field is collected independently by CSS class; the lists must end up the
# same length, otherwise the DataFrame constructor below raises ValueError.

# Job
job_title = [element.text.strip() for element in soup.find_all(class_='title')]

# Location
location = [element.text for element in soup.find_all(class_="location")]

# Date
date = [element.text.strip() for element in soup.find_all(class_="date pull-right")]

# Company
company_name = [element.text for element in soup.find_all(class_="company-name")]

# Link (site-relative href of every job card)
link = [a['href'] for a in soup.find_all(class_='ga-job-impression ga-job-click job-results-item section')]

# NOTE: the key 'comany_name' (sic) is kept as-is because the ArabJob model
# later in this notebook declares a column with the same spelling.
df = pd.DataFrame({
    'job_title': job_title,
    'location': location,
    'date': date,
    'comany_name': company_name,
    'job_link': link,
})

df.head()

Unnamed: 0,job_title,location,date,comany_name,job_link
0,Data Analyst / Specialist,Dubai,9 Aug 2023,Ultimate HR Solutions,/mobile/uae/jobs/data-analyst-specialist-382848
1,Data Strategy Consultant,UAE,22 Sep 2023,Core Consultants,/mobile/uae/jobs/data-strategy-consultant-388588
2,Data Manager - Consulting Technology,Riyadh,29 Oct 2023,PricewaterhouseCoopers,/mobile/saudi-arabia/jobs/data-manager-consult...
3,Data Engineering Manager,UAE,3 Aug 2023,Michael Page,/mobile/uae/jobs/data-engineering-manager-382247
4,Data Science Manager,Riyadh,25 Oct 2023,Deloitte & Touche (M.E.),/mobile/saudi-arabia/jobs/data-science-manager...


# Step 2: Transform (Data Transformation)

In [128]:
# Convert the scraped date column (text such as '9 Aug 2023') into datetime values.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Scheme and host that the site-relative job links are joined onto.
BASE_URL = "https://www.gulftalent.com"

In [None]:
# Turn the site-relative links into absolute URLs (vectorized, no per-row lambda).
# Links that already start with BASE_URL are left untouched, so re-running this
# cell does not prefix them a second time.
already_absolute = df['job_link'].str.startswith(BASE_URL, na=False)
df['job_link'] = df['job_link'].where(already_absolute, BASE_URL + df['job_link'])

In [137]:
# Drop the leading '/mobile' path segment from each link.
# The pattern is anchored to the start of the path (optionally preceded by the
# scheme and host), so a later '/mobile...' inside the job slug is never touched.
# regex=True is passed explicitly because the default changed between pandas versions.
df['job_link'] = df['job_link'].str.replace(
    r'^((?:https?://[^/]+)?)/mobile(?=/|$)',
    r'\1',
    regex=True,
)

In [139]:
# Print the column dtypes and non-null counts of the transformed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   job_title    25 non-null     object        
 1   location     25 non-null     object        
 2   date         25 non-null     datetime64[ns]
 3   comany_name  25 non-null     object        
 4   job_link     25 non-null     object        
dtypes: datetime64[ns](1), object(4)
memory usage: 1.1+ KB


# Step 3: Load


In [148]:
from sqlalchemy import create_engine, Column, Integer, String, text, Date
from sqlalchemy.orm import Session, declarative_base

### Create new database

In [146]:
# Core approach for creating the database.
# The connection URL for the maintenance database can be overridden through the
# environment so that real credentials do not have to live in the notebook.
ADMIN_DB_URL = os.environ.get(
    "ARAB_JOBS_ADMIN_DB_URL",
    "postgresql://andisheh:12345@localhost:5432/postgres",  # FIXME: local-dev fallback only
)
TARGET_DB_NAME = "arab_job_search"

# PostgreSQL refuses to run CREATE DATABASE inside a transaction block, so the
# engine is put in AUTOCOMMIT mode rather than sending a manual COMMIT first.
engine = create_engine(ADMIN_DB_URL, isolation_level="AUTOCOMMIT")

# Create connection
with engine.connect() as connection:
    # Check the catalog first so that re-running this cell is harmless.
    database_exists = connection.execute(
        text("SELECT 1 FROM pg_database WHERE datname = :name"),
        {"name": TARGET_DB_NAME},
    ).scalar() is not None
    if not database_exists:
        # Identifiers cannot be bound as parameters; TARGET_DB_NAME is a fixed
        # constant defined above, not user input.
        connection.execute(text(f"CREATE DATABASE {TARGET_DB_NAME}"))

### Create Table with SQLAlchemy (ORM)

In [153]:
# Connect to the new database.
# The URL can be overridden through the environment so that real credentials do
# not have to live in the notebook.
JOBS_DB_URL = os.environ.get(
    "ARAB_JOBS_DB_URL",
    "postgresql://andisheh:12345@localhost:5432/arab_job_search",  # FIXME: local-dev fallback only
)
engine = create_engine(JOBS_DB_URL)

# Session reused by the insert cell further down.
session = Session(engine)

Base = declarative_base()


class ArabJob(Base):
    """ORM model for one scraped job posting, stored in the ``arabjobsearch`` table."""

    __tablename__ = 'arabjobsearch'
    id = Column(Integer, primary_key=True)
    job_title = Column(String)
    location = Column(String)
    date = Column(Date)
    # Spelling (sic) matches the DataFrame column built in the scraping step, so
    # rows can be unpacked straight into the constructor with ArabJob(**row).
    comany_name = Column(String)
    job_link = Column(String)


# Emit CREATE TABLE for the model. This goes through the engine, not the
# session, so no session.commit() is needed afterwards.
Base.metadata.create_all(engine)


In [None]:
# Show the name of the table backing the ArabJob model.
ArabJob.__tablename__

## Insert data into the arabjobsearch table

In [175]:
# Preview the first row as a plain dict, the shape later passed to ArabJob(**row).
first_row_records = df.to_dict(orient='records')
first_row_records[0]

{'job_title': 'Data Analyst / Specialist',
 'location': 'Dubai',
 'date': Timestamp('2023-08-09 00:00:00'),
 'comany_name': 'Ultimate HR Solutions',
 'job_link': 'https://www.gulftalent.com/uae/jobs/data-analyst-specialist-382848'}

In [181]:
# Build one ArabJob instance per DataFrame row; each record dict is unpacked
# into keyword arguments, so its keys must match the model's column names.
job_records = df.to_dict(orient='records')
list_of_job_row = list(map(lambda record: ArabJob(**record), job_records))

In [182]:
# Display the ORM objects built in the previous cell (one ArabJob per row).
list_of_job_row

[<__main__.ArabJob at 0x7fc1502f3d30>,
 <__main__.ArabJob at 0x7fc1502f34c0>,
 <__main__.ArabJob at 0x7fc1502f3c70>,
 <__main__.ArabJob at 0x7fc1502f3670>,
 <__main__.ArabJob at 0x7fc1502f3280>,
 <__main__.ArabJob at 0x7fc1502f3940>,
 <__main__.ArabJob at 0x7fc1502f3df0>,
 <__main__.ArabJob at 0x7fc1502f3460>,
 <__main__.ArabJob at 0x7fc1502f3d90>,
 <__main__.ArabJob at 0x7fc1502f3a30>,
 <__main__.ArabJob at 0x7fc1502f3a60>,
 <__main__.ArabJob at 0x7fc1502f30a0>,
 <__main__.ArabJob at 0x7fc1504b3e20>,
 <__main__.ArabJob at 0x7fc1504b3a90>,
 <__main__.ArabJob at 0x7fc1504b3d00>,
 <__main__.ArabJob at 0x7fc1504b3130>,
 <__main__.ArabJob at 0x7fc1504b3a60>,
 <__main__.ArabJob at 0x7fc1504b3790>,
 <__main__.ArabJob at 0x7fc1504b38e0>,
 <__main__.ArabJob at 0x7fc1504b3940>,
 <__main__.ArabJob at 0x7fc1504b3cd0>,
 <__main__.ArabJob at 0x7fc1504b3df0>,
 <__main__.ArabJob at 0x7fc1504b3c40>,
 <__main__.ArabJob at 0x7fc1504b3d30>,
 <__main__.ArabJob at 0x7fc1504b3ca0>]

In [183]:
# Stage every ArabJob row and write them in a single transaction.
# On failure the session is rolled back so it stays usable for a retry, and the
# original error is re-raised so the problem is still visible.
try:
    session.add_all(list_of_job_row)
    session.commit()
except Exception:
    session.rollback()
    raise

In [166]:
def add(a, b, *args, **kwargs):
    """Demonstrate how call arguments are bound; nothing is actually summed.

    Returns the 4-tuple ``(a, b, args, kwargs)``, where ``args`` is the tuple of
    extra positional arguments and ``kwargs`` the dict of extra keyword arguments.
    """
    bound_arguments = (a, b, args, kwargs)
    return bound_arguments

In [169]:
# Extra positional values are collected into args, the keyword into kwargs.
add(5, 7, 3, 6, 10, f=9)

(5, 7, (3, 6, 10), {'f': 9})

In [170]:
# Keyword arguments for add(), keyed by its parameter names.
dict_ = dict(a=4, b=10)

In [171]:
# ** unpacks the dict into keyword arguments: 'a' and 'b' bind to add's named parameters.
add(**dict_)

(4, 10, (), {})