In [1]:
import re
import time
from pprint import pprint
from urllib.parse import urljoin

import requests
from lxml import html

In [2]:
class Parser:
    """Fetch a URL once and keep both the raw response and a parsed HTML tree.

    Attributes set by ``__init__``:
        main_link: the URL that was requested.
        headers:   the request headers that were sent (a browser-like User-agent).
        response:  the ``requests`` response object returned by the GET.
        root:      the result of ``lxml.html.fromstring`` on the response text.
    """

    def __init__(self, url, timeout=30):
        """Download ``url`` and parse the body as HTML.

        Args:
            url: address to fetch with a GET request.
            timeout: seconds passed to ``requests.get``. Without a timeout a
                stalled server would block the notebook forever.

        Raises:
            Whatever ``requests.get`` raises on connection problems or timeout,
            and whatever ``html.fromstring`` raises on an unparsable body.
        """
        self.main_link = url
        self.headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
        self.response = requests.get(url, headers=self.headers, timeout=timeout)
        self.root = html.fromstring(self.response.text)

In [3]:
class DeepParser(Parser):
    """Extract a structured company record from an already-fetched company page.

    Works on ``self.root`` (the parsed lxml tree) and ``self.main_link`` set by
    ``Parser.__init__``. Every ``collect_*_item`` method returns plain
    dict/list data; ``collect`` bundles them into one payload.
    """

    # Rows of the detail tables are "label cell | value cell"; this selects the
    # text of the second cell of the row whose first cell equals the label.
    _ROW_VALUE_XPATH = '//table[@id="{table}"]/tr/td[text()="{label}"]/../td[2]/text()'

    def _row_values(self, table_id, label):
        """Return the list of value texts for ``label`` in table ``table_id``."""
        return self.root.xpath(self._ROW_VALUE_XPATH.format(table=table_id, label=label))

    @staticmethod
    def _first(values):
        """Return the first element of ``values`` or None when it is empty."""
        return values[0] if values else None

    @staticmethod
    def _names_from_profile_url(url):
        """Guess (firstname, lastname) from a leader profile URL.

        Presumably the URL ends in ``<first>.<LAST>.<id>.html`` — verify against
        real links. Returns ``(None, None)`` when the URL has fewer than four
        dot-separated parts instead of raising IndexError.
        """
        parts = url.split('.')
        if len(parts) < 4:
            return None, None
        firstname = parts[-4].split('/')[-1]
        lastname = parts[-3].replace('_', '-')
        return firstname, lastname

    def collect_company_item(self):
        """Return the company's identity fields as a dict.

        Missing fields are None, except ``description`` which is an empty
        string when the synthesis block is absent.
        """
        name = self.root.xpath('//h1[@class="nomSociete"]/text()')

        office_raw = self.root.xpath('//table[@id="etab2"]/tr[5]/td[2]/text()')
        if office_raw:
            registration_office = office_raw[0].replace('\t', '').replace('\n', ' ').strip()
        else:
            registration_office = None

        # Collapse every whitespace run (newlines included) to a single space.
        # Newlines must not be deleted beforehand: that would glue together
        # words that are separated only by a line break.
        description_raw = ''.join(self.root.xpath('//div[@id="synthese"]//text()'))
        description = re.sub(r'\s+', ' ', description_raw).strip()

        return {
            "name": name[0].strip() if name else None,
            "registration_office": registration_office,
            "registration_country": self._first(self._row_values('etab2complete', 'Pays')),
            "registration_type": self._first(self._row_values('rensjur', 'Forme juridique')),
            "registration_number": self._first(self._row_values('rensjurcomplete', 'RCS')),
            # NOTE(review): this is filled from the "Code greffe" row — confirm
            # that is the intended source for a sales tax number.
            "sales_tax_number": self._first(self._row_values('rensjurcomplete', 'Code greffe')),
            "description": description,
        }

    def collect_employees_item(self):
        """Return the raw list of "Tranche d'effectif salarié" values (may be empty)."""
        return self._row_values('etab2complete', "Tranche d'effectif salarié")

    def collect_adresses_item(self):
        """Return a one-element list with the establishment address.

        Country codes are filled in only when the country name is exactly
        'France'; otherwise they are None.
        """
        name = self._first(self._row_values('etab2complete', 'Pays'))
        is_france = name == 'France'

        return [{
            "country": {
                "name": name,
                "iso_code": 250 if is_france else None,
                "code3": "FRA" if is_france else None,
                "code": "FR" if is_france else None,
            },
            "city": self._first(self._row_values('etab2complete', 'Ville')),
            "street": self._first(self._row_values('etab2complete', 'Adresse')),
            "zip": self._first(self._row_values('etab2complete', 'Code postal')),
        }]

    def collect_phones_item(self):
        """Return a placeholder phone entry."""
        return [{
            "phone": None,  # this data is not provided in source
        }]

    def collect_websites_item(self):
        """Return a placeholder website entry."""
        return [{
            "website": None,  # this data is not provided in source
        }]

    def collect_emails_item(self):
        """Return a placeholder email entry."""
        return [{
            "email": None,  # this data is not provided in source
        }]

    def collect_ceos_item(self):
        """Fetch every linked leader profile and return one dict per leader.

        Each distinct link found in the "Table leader" table is downloaded with
        ``Parser``. Results follow the order of the links in the page.
        """
        hrefs = self.root.xpath('//table[@class="Table leader"]//@href')
        # Resolve relative links against the company page, then de-duplicate
        # while keeping document order (a set would make the order vary
        # between runs).
        profile_urls = dict.fromkeys(urljoin(self.main_link, href) for href in hrefs)

        # Read from the company page, not the profile, so it is the same for
        # every leader; look it up once instead of once per profile.
        type_of_ceo = self._first(self.root.xpath('//h5[text()="Mandataires de type : "]/b/text()'))

        ceos = []
        for url in profile_urls:
            profile = Parser(url)
            name = profile.root.xpath('//h1[@id="identite_deno"]/text()')
            if name:
                # Upper-case words are treated as the last name, the rest as
                # the first name.
                words = name[0].split()
                firstname = ' '.join(w for w in words if not w.isupper())
                lastname = ' '.join(w for w in words if w.isupper())
                birthday = self._first(
                    profile.root.xpath('//div[@class="CompanyIdentity__adress"]/p[1]/text()')
                )
            else:
                # No heading on the profile page: fall back to the URL itself.
                firstname, lastname = self._names_from_profile_url(url)
                birthday = None

            ceos.append({
                "firstname": firstname,
                "lastname": lastname,
                "city": None,  # this data is not provided in source
                "birthday": birthday,
                "type": type_of_ceo,
            })

        return ceos

    def collect_branches_item(self):
        """Return a one-element list describing the establishment's activity."""
        return [{
            "name": self._first(self._row_values('etab2complete', "Nature de l'établissement")),
            "description": None,  # this data is not provided in source
            "code": self._first(self._row_values('etab2complete', 'Code ape (NAF)')),
            "category": self._first(self._row_values('etab2complete', 'Libellé du code APE')),
        }]

    def collect(self):
        """Run every collector and return the combined payload dict."""
        return {
            "company": self.collect_company_item(),
            "employees": self.collect_employees_item(),
            "addresses": self.collect_adresses_item(),
            "phones": self.collect_phones_item(),
            "websites": self.collect_websites_item(),
            "emails": self.collect_emails_item(),
            "ceos": self.collect_ceos_item(),
            "branches": self.collect_branches_item()
        }
    

In [4]:
# Fetch the list of companies to process from the task endpoint.
url = 'http://213.136.89.232:3002/frogs'
parser = Parser(url)
# Fail loudly on an HTTP error instead of decoding and iterating an error body.
parser.response.raise_for_status()
task = parser.response.json()

In [5]:
# Scrape each requested company and submit the result; stop at the first
# submission that is not acknowledged with HTTP 201.
for item in task:
    # Page slug: lower-cased, hyphenated company name + registration number.
    # str() keeps this working if the number arrives as a JSON integer.
    search_tail = item['company'].lower().replace(' ', '-') + '-' + str(item['registration_number']) + '.html'
    deep = DeepParser('http://www.societe.com/societe/' + search_tail)
    data = deep.collect()
    # A timeout keeps a stalled server from hanging the notebook.
    response = requests.post('http://213.136.89.232:3002/company', json=data, timeout=30)
    if response.status_code != 201:
        print(response.status_code)
        try:
            pprint(response.json())
        except ValueError:
            # Error body was not JSON; show it raw rather than masking the
            # real failure with a decode error.
            print(response.text)
        break
else:
    # Reached only when the loop finished without a break.
    print('Success!')

Success!
