Commit

version prod
bluexm authored and bluexm committed May 6, 2020
1 parent ca94205 commit 658173b
Showing 2 changed files with 32 additions and 21 deletions.
10 changes: 10 additions & 0 deletions scrape_indeed_20200506.csv
@@ -0,0 +1,10 @@
1588738077.095023;2020-05-06 12:07:57;SmartBlkTrade;Research Data Scientist / Big Data Engineer, AI Dept;https://www.indeed.hk/rc/clk?jk=34710819bd9e8c4c&fccid=afefea0edad5ffb5&vjs=3;https://hk.indeed.com/viewjob?jk=34710819bd9e8c4c&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;China Citic Bank International;Data Scientist;https://www.indeed.hk/rc/clk?jk=8d8e2762a3327031&fccid=94431f42f43dd27b&vjs=3;https://hk.indeed.com/viewjob?jk=8d8e2762a3327031&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;FNA;Data Scientist;https://www.indeed.hk/rc/clk?jk=2e548a239066d4dc&fccid=be673821c44914d0&vjs=3;https://hk.indeed.com/viewjob?jk=2e548a239066d4dc&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Manulife;Data Integrator;https://www.indeed.hk/rc/clk?jk=46d4e327c2e0c8d3&fccid=935bdcc8ea6ea513&vjs=3;https://hk.indeed.com/viewjob?jk=46d4e327c2e0c8d3&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;First Choice (Hong Kong) Medical Laboratory Limite...;Research Scientist;https://www.indeed.hk/rc/clk?jk=cb8beea7270632cf&fccid=b4ada5c8d4effcd4&vjs=3;https://hk.indeed.com/viewjob?jk=cb8beea7270632cf&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Neo Derm Ltd.;Data Scientist;https://www.indeed.hk/rc/clk?jk=44f68840b8c548b3&fccid=acdd7980231ab028&vjs=3;https://hk.indeed.com/viewjob?jk=44f68840b8c548b3&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Emerging Viral Diagnostics (HK) Limited;Research Scientist;https://www.indeed.hk/rc/clk?jk=3b27c37ba2caf7be&fccid=90e425d44574e282&vjs=3;https://hk.indeed.com/viewjob?jk=3b27c37ba2caf7be&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Bank of China (Hong Kong);AML Data Scientist;https://www.indeed.hk/rc/clk?jk=98e507e836860882&fccid=82a924ef8a1e6217&vjs=3;https://hk.indeed.com/viewjob?jk=98e507e836860882&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Match Talent;Data Scientist (Machine Learning);https://www.indeed.hk/rc/clk?jk=d282f2ec4b75875d&fccid=4da850c42dac5cb0&vjs=3;https://hk.indeed.com/viewjob?jk=d282f2ec4b75875d&from=serp&vjs=3;NA;NA;NA;NA;NA
1588738077.095023;2020-05-06 12:07:57;Hai Kang Life Corporation Ltd;Research Scientist;https://www.indeed.hk/rc/clk?jk=6554196a4124549d&fccid=1f7c4dc4b94d4995&vjs=3;https://hk.indeed.com/viewjob?jk=6554196a4124549d&from=serp&vjs=3;NA;NA;NA;NA;NA
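
A note on the file format: the semicolon-separated rows have no header line; the columns appear to line up with the DB_TITLES list defined in scraper.py below. A minimal sketch for loading the file for inspection (assuming pandas, and assuming the column names really do match DB_TITLES):

import pandas as pd

# column names copied from DB_TITLES in scraper.py (assumption: the CSV carries no header row)
cols = ["timestamp", "scrping_dt", "ad_cie_indeed", "ad_jobtitle_indeed", "search_ad_url",
        "ad_url", "ad_jobdate", "ad_jobtitle", "ad_jobcie", "ad_jobdes", "ad_email"]
df = pd.read_csv("scrape_indeed_20200506.csv", sep=";", names=cols, na_values=["NA"])
print(df[["ad_cie_indeed", "ad_jobtitle_indeed"]])
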
43 changes: 22 additions & 21 deletions scraper.py
@@ -5,18 +5,19 @@
import datetime
import pandas as pd
#import Levenshtein as levs
- #import scraperwiki as ws
+ import scraperwiki as ws
import pdb

## user params
- NBPAGESMAX = 1 # number of pages for search results
+ NBPAGESMAX = 10 # number of pages for search results
RECORD_EXCEL = False # only not previously recorded ads are stored in the excel file
- RECORD_CSV = True # all search results are stores in the CSV
+ RECORD_CSV = False # all search results are stores in the CSV
RECORD_DB = True # record in DB with wikiscraper (for morph.io)
- USE_SCRAPERWIKI = False # if recorddb = True then use scraperwiki or SQLlite directly
+ RECORD_ALWAYSWRITE = True #always write record even if already recorded
+ USE_SCRAPERWIKI = True # if recorddb = True then use scraperwiki
DB_FILE = "data.sqlite"
DB_TITLES = ["timestamp","scrping_dt","ad_cie_indeed","ad_jobtitle_indeed","search_ad_url","ad_url","ad_jobdate", \
"ad_jobtitle","ad_jobcie","ad_jobdes","ad_email"]
"ad_jobtitle","ad_jobcie","ad_jobdes","ad_email"]
URL ='https://www.indeed.hk/jobs?q=Data+Scientist&start='

if RECORD_DB and not USE_SCRAPERWIKI:
@@ -43,18 +43,16 @@ def dict_value(tuple):
excelDBfilename = 'scraping_indeed.xlsx'
df = pd.read_excel(excelDBfilename, 'Sheet1', index_col=None, na_values=['NA'])

- if RECORD_DB and USE_SCRAPERWIKI:
- # TODO : add connection to scraper wiki to read database
- pass
- else: # record in lolcal database
+ if RECORD_DB and not USE_SCRAPERWIKI:
#pdb.set_trace()
- try:
- dfdb = pd.read_sql("select * from indeed_ads", CONNEXION)
- dfdb = dfdb[1:]
- except:
- dfdb = pd.DataFrame(columns=DB_TITLES)
- print("database empty ; creating table")

+ dfdb = pd.DataFrame(columns=DB_TITLES)
+ # or read column titles from database
+ # try:
+ # dfdb = pd.read_sql("select * from indeed_ads", CONNEXION)
+ # dfdb = dfdb[1:]
+ # except:
+ # print("database empty ; creating table")

## single ad page scrapers
def parse_workinginhongkong(pdata):
adtree = bs4.BeautifulSoup(pdata, 'html.parser')
@@ -107,7 +106,7 @@ def parse_classywheeler(pdata):

## NOT FINISHED !!!
def parse_whub(pdata):
- pdb.set_trace()
+ #pdb.set_trace()
adtree = bs4.BeautifulSoup(pdata, 'html.parser')
root = adtree.find("h1", itemprop="title")
if root==None:
@@ -168,7 +167,7 @@ def parse_efinancialcareers(pdata):
## iterates on all search results
for c in content:
#exectime.timestamp()
- rowres=[0, exectime.strftime("%Y-%m-%d %H:%M:%S")]
+ rowres=[exectime.timestamp(), exectime.strftime("%Y-%m-%d %H:%M:%S")]
#rowres.append(c.find_all("span",class_="company")[0].get_text())
#records company's listed on search
#pdb.set_trace()
@@ -230,7 +229,7 @@ def parse_efinancialcareers(pdata):
print("write csv")
writer.writerow(rowres)
#res.append(rowres)
- if dorecord or True:
+ if dorecord or RECORD_ALWAYSWRITE:
# in dataframe for excel
if RECORD_EXCEL:
print("record to Excel")
@@ -251,11 +250,13 @@ def parse_efinancialcareers(pdata):

print("end of scraping ---------------------------------------------------------")

- if RECORD_EXCEL:
- df.to_excel(excelDBfilename, sheet_name='Sheet1', index=False)
if RECORD_DB and not USE_SCRAPERWIKI:
#pdb.set_trace()
dfdb.to_sql('indeed_ads',CONNEXION,if_exists='append', index=False)
CONNEXION.close()

+ if RECORD_EXCEL:
+ df.to_excel(excelDBfilename, sheet_name='Sheet1', index=False)

if RECORD_CSV:
csvfile.close()
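
Since this commit flips USE_SCRAPERWIKI to True, records are meant to go through the scraperwiki library for morph.io, but the actual save call falls outside the hunks shown above. A minimal sketch of how one row might be written with scraperwiki (keys follow DB_TITLES; the example values come from the first CSV record above; keying on ad_url and relying on the default "data" table in data.sqlite are assumptions, not code from this commit):

import scraperwiki

# example row built from the first CSV record above; keys follow DB_TITLES
row = {
    "timestamp": 1588738077.095023,
    "scrping_dt": "2020-05-06 12:07:57",
    "ad_cie_indeed": "SmartBlkTrade",
    "ad_jobtitle_indeed": "Research Data Scientist / Big Data Engineer, AI Dept",
    "ad_url": "https://hk.indeed.com/viewjob?jk=34710819bd9e8c4c&from=serp&vjs=3",
}

# upsert into data.sqlite, keyed on the ad URL so re-runs do not duplicate rows
scraperwiki.sqlite.save(unique_keys=["ad_url"], data=row)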
