In [45]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time
import re
from multiprocessing import Pool
from functools import partial
from datetime import datetime, timedelta
import os.path
import os

from pymongo import MongoClient 
import pandas as pd 
import json
from pymongo import errors 
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from pymongo.errors import BulkWriteError
from tempfile import NamedTemporaryFile
import string
import random

class IndeedMongodbDao:
    def __init__(self):
        self.conn = MongoClient() 
        self.db = self.conn.Indeed
        self.collection = self.db.data
        
    def _valid_url_format(self,url):
        val = URLValidator()
        try:
            val(url)
        except ValidationError as e:
            raise Exception('bad format for url {}'.format(ur))
    
    def insert_data_bulk(self,data):
        try:
            self.collection.insert_many(data)
        except BulkWriteError as bwe:
            print(bwe.details)
            print(bwe.details['writeErrors'])
            raise
    
    def insert_data(self, url, title, name, address, publication_date,salaire, description, localisation):
        
        try:
            if url == "":
                raise Exception('url cannot be empty {}'.format(ur))

            self._valid_url_format(url)

            if title == "":
                raise Exception('title cannot be empty {}'.format(title))

            if name == "":
                raise Exception('the name of company cannot be be empty {}'.format(title))

            if description == "":
                raise Exception('description of company cannot be be empty {}'.format(title))

            line_to_insert = {
                                "url": url,
                                "titre":title,
                                "nom_entreprise":name,
                                "adresse":address,
                                "date_de_publication":publication_date,
                                "salaire":salaire,
                                "description":description,
                                "localisation":localisation
                             }

            # Insert Data 
            result = self.collection.insert_one(line_to_insert) 
        except Exception as e:
            print(e)
    
    def get_all_data(self):
        data = self.collection.find({})
        return data
    
    def description_exist(self, description):
        return self.collection.find({"description" : description}).count() > 0
        
    def url_exist(self, url):
        return self.collection.find({"url" : url}).count() > 0
    
    def get_all_dupliate(self):
        return self.collection.aggregate([{"$group" : { "_id": "$url", "count": { "$sum": 1 } }}, {"$match": {"_id" :{ "$ne" : None } , "count" : {"$gt": 1} } }, {"$project": {"u" : "$_id", "_id" : 0}}])

class KeyWordsProvider:
    
    def get_langages(self):
        return ['python', 'r','vba', 'mysql','excel','asp.net','nosql','sql','linux','mongodb',
            'mariadb','java','javascript','php','html','css','sas','c#','ruby','swift','objective-c',
            'vb.net','kotlin','scala','bash','powershell','shell','front end','back end','soap']
    
    def get_tools(self):
        return ['tableau', 'powerbi','symfony', 'jquery','angular','react','react native','node js','git','github',
            'visual studio','django','flask','api rest','laravel', 'hololens', 'docker', 'jira', 'scrum', 'kanban', 
               'azure','aws', 'teamcity', 'jenkins']
    
    def get_others(self):
        return ['cisco','intel','android','ios']

class IndeedItemParser:
    def __init__(self):
        self.driverPath = "C:\\Users\\User\\Documents\\selenium\\driver\\chromedriver.exe"
        
    def _get_title(self, driver):
        try:
            title = driver.find_element_by_xpath("//*[@class='jobsearch-DesktopStickyContainer']//h3")
        except Exception as e:
            print(e)
        return title.text
    
    def _get_name(self, driver):
        
        try:    
            name = driver.find_element_by_xpath("//*[contains(@class,'jobsearch-InlineCompanyRating')]//div[1]")
            return name.text
        except Exception as e:
            print(e)
        
    
    def _get_address(self,driver):
         try:   
            address = driver.find_element_by_xpath("//*[contains(@class,'jobsearch-InlineCompanyRating')]//div[3]")
            if address.text == "-":
                address = driver.find_element_by_xpath("//*[contains(@class,'jobsearch-InlineCompanyRating')]//div[4]")
            return address.text
         except Exception as e:
            address = driver.find_element_by_xpath("//span[@class='jobsearch-JobMetadataHeader-iconLabel'][1]")
            return address.text
    
    def _get_salaire(self,driver, description):
        return np.nan # à compléter dans la partie pre-processing
            
    
    def _get_description(self,driver):
        try:
            #jobDescriptionText
            e_description = driver.find_element_by_id("jobDescriptionText")
            return e_description.text
        except Exception as e:
            print(e)
    
    def _get_date(self,driver,url,name):
       # print(url)
        try:
            date_str = driver.find_element_by_xpath("//*[@class='jobsearch-JobMetadataFooter']")
            date_str_full = date_str.text
            date_tbl = date_str_full.split(" ")
            count_str = date_tbl[4]
            label = date_tbl[5]
            
            if name in date_str_full:
                date_str_full = date_str_full.replace(name, "")
                date_tbl = date_str_full.split(" ")
                count_str = date_tbl[5]
                label = date_tbl[6]
                
            if count_str == "a" :
                count_str = date_tbl[5]
                label = date_tbl[6]
                
            #print("date_str", date_tbl)
            date = datetime.now()
            
            if count_str == "30+":
                return date - timedelta(days=30)
            
            count = int(count_str)
            if "jour" in label:
                date = date - timedelta(days=count)
            elif "heur" in label:
                date = date - timedelta(hours=count)
            return date;
        except Exception as e:
            print(e)
    
    
    def parse(self,url):
        driver = webdriver.Chrome(self.driverPath)
        driver.get(url)
        driver.maximize_window()
        
        source = driver.page_source
        title = self._get_title(driver)
        name = self._get_name(driver)
        address = self._get_address(driver)
        date = self._get_date(driver, url,name)
        description = self._get_description(driver)
        salaire = self._get_salaire(driver,description)
        
        driver.quit()
        
        return title, name, address, date, salaire, description, source


class IndeedPaser:
    def __init__(self):
        self.website = "https://www.indeed.fr"
        self.driverPath = "C:\\Users\\User\\Documents\\selenium\\driver\\chromedriver.exe"
        self.dao = IndeedMongodbDao()
        
        self.jobs = ["développeur", "data scientist", "data analyst", "business intelligence"]
        self.locations = ["Lyon", "Toulouse", "Nantes", "Bordeaux","Paris"]
        self.indeed_item_parser = IndeedItemParser()
        self.keyWordsProvider = KeyWordsProvider()
    
    def _get_pages_counts(self,driver):
        searchCountPages = None
        try:
            searchCountPages_elt = driver.find_element_by_id("searchCountPages")
            searchCountPages = searchCountPages_elt.text.split()
        except Exception as e: 
            print(e)
        
        if searchCountPages != None:    
            if len(searchCountPages) == 6:
                searchCountPages = int("{0}{1}".format(searchCountPages[3],searchCountPages[4])) 
            else :
                searchCountPages = searchCountPages[3]  
            result = (int(searchCountPages) // 18)

            if result == 1:
                result = 2
            return result
    
    def _get_subs_collections(self,items, nbr=5):
        result = []
        sub = []
        for index, item in enumerate(items):
            sub.append(item)
            if (index > 0) & (index % nbr) == 0:
                result.append(sub)
                sub = []
        return result
    
    def _local_parse_page(self, item_link,localisation):
        try:
            if self.dao.url_exist(item_link) == True:
                print("aready parsed, skip", item_link)
            else:
                title, name, address, date,salaire, description, source = self.indeed_item_parser.parse(item_link)
                if self.dao.description_exist(description) == True:
                    print("doublon, skip.")
                    return
                                
                self.dao.insert_data(item_link,title,name,address,date,salaire,description,localisation)
                print("saved : {0}",item_link)   
                
                self.create_local_file(item_link, source)
        except Exception as e:
            print(e)
    
    def randomString(self, stringLength=10):
        letters = string.ascii_lowercase
        return ''.join(random.choice(letters) for i in range(stringLength))
                    
    def create_local_file(self, item_link, source):
        file = open("pages/{0}.html".format(self.randomString()), "w")
        source = item_link + "----------------" + source
        file.write(source)
        file.close()
                    
    def parse(self):
        browser = webdriver.Chrome(self.driverPath)
        browser.get(self.website)
        browser.maximize_window()
        
        for job in self.jobs:
            jobs_filter_list = [job]
            
            if job == "développeur":
                all_competences = self.keyWordsProvider.get_langages() + self.keyWordsProvider.get_tools()
                jobs_filter_list = ["{0} {1}".format(job, item) for item in all_competences]
            
            random.shuffle(jobs_filter_list)
            for job_key_word in jobs_filter_list:
                for location in self.locations:

                    query = "https://www.indeed.fr/jobs?q={0}&l={1}".format(job_key_word, location)
                    browser.get(query)
                    
                    pages_count = self._get_pages_counts(browser)
                    if pages_count == None:
                        print("No data on {0}, skip".format(query))
                        continue
                    
                    for page_index in random.sample(range(0, pages_count), pages_count):
                        full_query = "{0}&start={1}".format(query,page_index)

                        browser.get(full_query)

                        items = browser.find_elements_by_xpath("//*[contains(@class,'clickcard')]//*[contains(@class,'jobtitle')]")
                        items = [item.get_attribute("href") for item in  items]
                        
                        for index_i, link in enumerate(items):
                            self._local_parse_page(link, location)

In [38]:
test = IndeedPaser()
test.parse()



doublon, skip.
doublon, skip.
doublon, skip.
doublon, skip.
doublon, skip.
doublon, skip.
saved : {0} https://www.indeed.fr/company/Ciorane/jobs/D%C3%A9veloppeur-Python-a598ae23dfe33dc5?fccid=54f101fb37310c9a&vjs=3
saved : {0} https://www.indeed.fr/company/Verduron/jobs/D%C3%A9veloppeur-Backend-Python-a8a29f6b23eeed7e?fccid=0d02f1f8135a708a&vjs=3
saved : {0} https://www.indeed.fr/rc/clk?jk=708e6d247993a630&fccid=10b79275effd1c67&vjs=3
saved : {0} https://www.indeed.fr/company/Verduron/jobs/D%C3%A9veloppeur-Python-Golang-b2fdce19906b1e11?fccid=0d02f1f8135a708a&vjs=3
saved : {0} https://www.indeed.fr/company/COGICEO/jobs/Stage-D%C3%A9veloppeur-Python-b607eaec220c0fdb?fccid=2187cf56586b8d13&vjs=3
saved : {0} https://www.indeed.fr/company/OCTOPUS-IT/jobs/D%C3%A9veloppeur-Python-Junior-Dipl%C3%B4m%C3%A9-Logiciel-Open-Source-c40fbad4cbf04651?fccid=7358eb8948faec3a&vjs=3
saved : {0} https://www.indeed.fr/rc/clk?jk=d5ce3d3cbffed937&fccid=c56ae1a209cde5cf&vjs=3
aready parsed, skip https://www.i

KeyboardInterrupt: 

In [7]:
dataset = pd.read_csv("archiv_to_mongo/indeed.part.2.csv")
dataset

Unnamed: 0,URL,Titre,Nom entreprise,Adresse,Date de publication,salaire,description,localisation
0,https://www.indeed.fr/rc/clk?jk=ae69ae0adabbb3...,Développeur logiciel système embarqué - H/F,Labsoft,Toulouse (31),2019-09-10 08:16:58.738000,,CONTEXTE : Labsoft recrute un(e) Développeur l...,Toulouse
1,https://www.indeed.fr/rc/clk?jk=e3fc82878e0491...,Développeur Sénior Java/ J2E h/f,Vidal Associates,Toulouse (31),2019-10-09 08:17:11.596268,,"VIDAL ASSOCIATES Consulting & Search, référenc...",Toulouse
2,https://www.indeed.fr/rc/clk?jk=9d3a85e0205ca0...,Expert performance - infrastructure,MGI Consultants,Toulouse (31),2019-09-18 08:17:38.446060,,"Chez notre client grand compte, basé à Toulous...",Toulouse
3,https://www.indeed.fr/rc/clk?jk=cdc48a4dcf7c15...,DevOps / SRE,cenareo,Labège (31),2019-09-20 08:17:53.806715,,Startup de l’IoT Valley labellisée Pass French...,Toulouse
4,https://www.indeed.fr/rc/clk?jk=5b7f648563c94f...,Service Reliability Engineer (SRE) F/H,Sigfox,Toulouse (31),2019-09-19 08:18:07.469667,,L’equipe SRE est responsable de la disponibili...,Toulouse
5,https://www.indeed.fr/rc/clk?jk=d625e0f2a0ab61...,DEVELOPPEUR(SE) BACKEND JAVA/WEB (H/F),Wiseed,Toulouse (31),2019-09-10 08:18:21.925446,,WiSEED est un des acteurs les plus innovants e...,Toulouse
6,https://www.indeed.fr/rc/clk?jk=0eaed1137edb2e...,Développeur Java / Jee/ Spring f/h,Vidal Associates,Toulouse (31),2019-09-10 08:18:34.879537,,"VIDAL ASSOCIATES Consulting & Search, référenc...",Toulouse
7,https://www.indeed.fr/rc/clk?jk=34d5c533cea70c...,Développeur Sénior Java EE et/ou Spring,Vidal Associates,Toulouse (31),2019-09-10 08:18:47.826426,,"VIDAL ASSOCIATES Consulting & Search, référenc...",Toulouse
8,https://www.indeed.fr/rc/clk?jk=350e1d69247dfc...,Ingénieur QA Automation (H/F),JFrog,Toulouse (31),2019-09-10 08:19:00.550257,,"JFrog est une startup en pleine croissance, éd...",Toulouse
9,https://www.indeed.fr/rc/clk?jk=5d667e321cb2ee...,Stagiaire développeur.e web Fullstack,Place2Swap,Issy-les-Moulineaux (92),2019-10-04 08:19:41.530213,,Mission : Participation à l’évolution de la pl...,Paris


In [47]:
c =IndeedMongodbDao()
result = c.get_all_dupliate()
for item in result:
    print(item)