In [3]:
import os
#from orderedset import OrderedSet
from bs4 import BeautifulSoup
from indeed_mongodb_dao import IndeedMongodbDao
#from pymongo import MongoClient 
from pymongo import errors 
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
class LocalIndeedPaser:
    def __init__(self):
        self.dao = IndeedMongodbDao() 
        self.pages_path = r'C:\Users\Junior\Documents\Projects_Simplon\Projet_ML_gr3-master\scrapping\pages'
        self.data = self.dao.get_all_data()
        self.dataset = pd.DataFrame()
        self.salary_pattern = "[[S|s]alaire?[\s+]?:?[\s+]?(.*)e?[\s+]?\/(an|mois)|((.*)?[\s+]?par?[\s+]?(an|ans|mois|jour|heure))"
        self.keyWordsProvider = KeyWordsProvider()
    
    def _get_salary(self,select_result):
        salary = ""
        for item in select_result:
            if "€" in item.text:
                outer_salary = re.compile(self.salary_pattern)
                m_salary = outer_salary.search(item.text)
                if m_salary is not None:
                    salary = m_salary.group(0)
                    break
        return salary
    
    def save_file(self):
        self.dataset.to_csv("indeed.pre_processing.csv",index=False)
    
    def _get_binnary_list_data(self, input_list):
        data = []
        for i in range(len(self.dataset)):
            inside_data = []
            for ele in input_list:
                pattern = re.compile(r"[\s/\(\),]"+ele+r"[\s/\(\),]")
                value = pattern.search(self.dataset['description'][i].lower().replace('\n',' ').replace('\r',' '))
                if value:
                    inside_data.append(1)
                else:
                    inside_data.append(0)
            data.append(inside_data)
        
        return data
    
    def _set_quantitative_features(self, pattern, col_indice,label_col,func_callback = None):
        result = []
        for index, row in self.dataset.iterrows():
            re_pattern = re.compile(pattern)
            value = re_pattern.search(row['description'].lower().replace('\n',' ').replace('\r',' '))
            if value:
                if func_callback is not None:
                    result.append(func_callback(value.group(0)))
                else:
                    result.append(value.group(0))
            else:
                result.append(None)
        
        if (label_col not in self.dataset.columns):
            self.dataset.insert(col_indice, label_col,result,True)
        else:
            self.dataset[label_col] = pd.DataFrame(result)
        
        dummies = self.dataset[label_col].str.get_dummies() 
        
        #self._fusion_with_dataset(dummies)
        
    
    def parse_education_level(self):
        reg_pattern = "([(b|B)\w+]ac\s*\+\s*[1-8])|ingénieur|master\s*(1|2)|(D|d)iplôme\s*supérieur"
        self._set_quantitative_features(reg_pattern,7,"niveau_etude", self._education_level_callback)
    
    def _education_level_callback(self, value):
        bac_pattern = "bac\s*\+\s*[1-8]"
        result = re.compile(bac_pattern).search(value)
        response = ""
        if result:
            response = re.findall('(\d+)',value)
            if response is not None:
                return "bac + " + response[0] 
        
        master_pattern = "master\s*(1|2)"
        result = re.compile(master_pattern).search(value)
        if result:
            response = re.findall('(\d+)',value)
            if response is not None:
                return "master " + response[0] 
            
        return value
    
    def set_type_de_cursus(self):
        #j'ai desactivé le pattern "|([(m|M)\w+]aster?\s?\w{3,25})" master car ça renvoie "master dans" ou "master data"
        reg_pattern = '([(é|E)\w+]cole [(i|I)\w+]ngénieur?)|([(a|A)\w+]utodidacte?)|([(g|G)\w+]rande[s]? [(é|E)\w+]cole[s]?)|([(é|E)\w+]cole[s]? de [(c|C)\w+]ommerce[s]?)|([(i|I)\w+]ngénieur [(i|I)\w+]nformatique?)'
        self._set_quantitative_features(reg_pattern,8,"type_de_cursus", self._type_cursus_callback)
    
    def _type_cursus_callback(self, value):
        ge_pattern = "grandes?\s*(é|e)coles?"
        result = re.compile(ge_pattern).search(value)
        if result:
            return "grande école"
        
        ec_pattern = "([(é|E)\w+]cole[s]? de [(c|C)\w+]ommerce[s]?)"
        result = re.compile(ec_pattern).search(value)
        if result:
            return "école de commerce"
        
        return value
        
    def set_type_de_contrat(self):
        #j'ai desactivé le pattern "[(c|C)\w+]ontrat?:?\s\w{3,25}|"  car ça renovie "contrat logue", "contrat avec", etc
        reg_pattern = '(cdi|cdd|stage|alternance|alternant|cdic|freelance)|3\s*mois\s*renouvelable\s*'
        self._set_quantitative_features(reg_pattern,9,"type_de_contrat", self._type_de_contrat_callback)
    
    def _type_de_contrat_callback(self,value):
        bac_pattern = "alternance|alternant"
        result = re.compile(bac_pattern).search(value)
        if result:
            return "alternance"
        return value
    
    def set_grande_categorie(self):
        reg_pattern = 'développeur?\s*(web|mobile|data|front\s*end|back\s*end|desktop|full stack\s*(developer))'
        self._set_quantitative_features(reg_pattern,10,"grande_categorie",self._grande_categorie_callback)
        
    def _grande_categorie_callback(self, value):
        bac_pattern = "front\s*end|back\s*end"
        result = re.compile(bac_pattern).search(value)
        if result:
            if 'front' in value:
                return "front-end"
            else:
                return "back-end"
            
        return value
    
    #def _fusion_with_dataset(self, df):
        #diff_cols = list(OrderedSet(self.dataset.columns) - OrderedSet(df.columns))
        #self.dataset = self.dataset[diff_cols]
        #self.dataset = pd.concat([self.dataset,df], axis=1)
    
    def parse_langage(self):
        languages = self.keyWordsProvider.get_langages()
        
        data = self._get_binnary_list_data(languages)
        
        language_dict = pd.DataFrame(data, columns=languages)
        self.language_df = pd.DataFrame.from_dict(language_dict)

        #self._fusion_with_dataset(self.language_df)
    
    def parse_tools(self):
        tools = self.keyWordsProvider.get_tools()
        
        data = self._get_binnary_list_data(tools)
        tools_dict = pd.DataFrame(data, columns=tools)
        self.tools_df = pd.DataFrame.from_dict(tools_dict)
        #self._fusion_with_dataset(self.tools_df)
    
    def set_salary_man(self):
        salaire_moyen = []
        for i in range(len(self.dataset)):
            try:
                salaire_liste = re.findall('(\d+),?',normalize('NFKD',df['salaire'][i]).replace(' ',''))
                mois = re.search('mois',df['salaire'][i])
                if mois:
                    if len(salaire_liste) > 1:
                        moy = 12 * (int(salaire_liste[0]) + int(salaire_liste[1])) / 2
                        salaire_moyen.append(moy)
                    else:
                        salaire_moyen.append(int(salaire_liste[0]) * 12)
                else:
                    if len(salaire_liste) > 1:
                        moy = (int(salaire_liste[0]) + int(salaire_liste[1])) / 2
                        if moy < 100:
                            moy *= 1000
                        salaire_moyen.append(moy)
                    else:
                        if int(salaire_liste[0]) < 100:
                            salaire_moyen.append(int(salaire_liste[0]) * 1000)
                        else:
                            salaire_moyen.append(int(salaire_liste[0]))

            except:
                salaire_moyen.append(None)
                continue
        
        label_col = "salaire_moyen"
        
        if (label_col not in self.dataset.columns):
            self.dataset.insert(6, label_col,salaire_moyen,True)
        else:
            self.dataset[label_col] = pd.DataFrame(salaire_moyen)
            self.dataset[label_col]
    
    

SyntaxError: invalid syntax (indeed_mongodb_dao.py, line 74)

In [4]:
#dao = IndeedMongodbDao() 
#dao

In [18]:
localIndeedPaser = LocalIndeedPaser()
#localIndeedPaser.dataset["description"].dropna(how='all').isna().sum()
#localIndeedPaser.dataset = localIndeedPaser.dataset.dropna(subset=['description'])
#localIndeedPaser.dataset["description"].isna().sum()

NameError: name 'pd' is not defined

In [None]:
localIndeedPaser.parse_salary_from_local_files()
localIndeedPaser.set_salary_man()

In [None]:
localIndeedPaser.save_file()