In [1]:
import pandas as pd
import spacy
import string
import joblib

In [2]:
# Load the spaCy pipeline used below for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_lg")

# Extend spaCy's default stop-word set with extra noise tokens
# (connectors, dashes, empty strings and newline runs).
nlp.Defaults.stop_words |= {"or","per","like",'-','_','',
                            '–','[]','\n','\n\n','\n\n ','i.e.'}

# Stop words handed to the preprocessing step. Read them from the pipeline
# defaults that were just extended, so the custom entries are guaranteed to
# be included without relying on `spacy.lang.en` having been imported
# implicitly or on the two names aliasing the same set object.
stopWords = nlp.Defaults.stop_words


In [3]:
# Maximum number of characters spaCy is allowed to process in a single text.
_MAX_TEXT_LENGTH = 2030000

# Characters on which a lemma is broken up into separate tokens.
_SPLIT_CHARS = ("\n", ":", "$")

# Individual punctuation characters, for per-character membership tests.
_PUNCTUATION = frozenset(string.punctuation)


def _lemmatize_word(doc):
    """Return the lemma of every token, e.g. "did" becomes "do".

    input: an iterable of tokens exposing a ``lemma_`` attribute
    output: a list of lemma strings
    """
    return [token.lemma_ for token in doc]


def _check_character_in_words(tokens):
    """Split tokens that contain one of ``_SPLIT_CHARS``.

    input: a list of string tokens
    output: a new list; tokens without special characters keep their order,
            the pieces of the split tokens follow them
    """
    kept = []
    pieces = []
    for token in tokens:
        if any(char in token for char in _SPLIT_CHARS):
            # Replace every special character before splitting, so a token
            # holding several of them is handled in one pass.
            for char in _SPLIT_CHARS:
                token = token.replace(char, " ")
            pieces.extend(token.split())
        else:
            kept.append(token)
    # Building new lists (instead of removing from / extending the list that
    # is being iterated) guarantees that no token is skipped.
    return kept + pieces


def _remove_punct(tokens):
    """Drop tokens made only of punctuation characters (or empty tokens).

    input: a list of string tokens
    output: a list of tokens without the punctuation-only ones
    """
    # Test character by character: `token in string.punctuation` would be a
    # substring test and would let tokens such as "****" or "..." through.
    return [
        token for token in tokens
        if not all(char in _PUNCTUATION for char in token)
    ]


def _remove_stopword(tokens, stop_words):
    """Lower-case the tokens and drop those found in ``stop_words``.

    input: a list of string tokens and a collection of stop words
    output: a list of lower-cased tokens without stop words
    """
    lowered = (token.lower() for token in tokens)
    return [token for token in lowered if token not in stop_words]


def _remove_duplicates(tokens):
    """Keep the first occurrence of each token and join them with spaces.

    input: a list of string tokens
    output: a single string
    """
    # dict preserves insertion order, giving an order-preserving
    # de-duplication without a quadratic list scan.
    return " ".join(dict.fromkeys(tokens))


def text_preprocess(text):
    """Turn a raw text into a cleaned, space-separated string of tokens.

    Steps: tokenise with ``nlp``, lemmatise, split tokens on the characters
    in ``_SPLIT_CHARS``, drop punctuation-only tokens, lower-case and drop
    the stop words in ``stopWords``, then remove repeated tokens.

    input: text (string)
    output: string of unique, lower-cased tokens joined by single spaces

    NOTE(review): tokens that earlier versions skipped by accident (the token
    following a split token, multi-character punctuation) are now cleaned
    too. If a saved model was fitted on the earlier output, confirm whether
    it needs re-fitting.
    """
    # Raise spaCy's per-text character limit so long documents are accepted.
    nlp.max_length = _MAX_TEXT_LENGTH

    doc = nlp(text)
    lemmas = _lemmatize_word(doc)
    split_tokens = _check_character_in_words(lemmas)
    without_punct = _remove_punct(split_tokens)
    without_stopwords = _remove_stopword(without_punct, stopWords)
    return _remove_duplicates(without_stopwords)
    
    

In [4]:
def model(email, model_path="model.joblib"):
    """
    Preprocess a text, load the saved model and predict the job title.

    input:
        email: raw text (string) to classify
        model_path: file the model is loaded from with joblib
                    (defaults to "model.joblib")
    output: the value returned by the model's ``predict`` for this one text
    """
    processed_email = text_preprocess(email)
    # NOTE: joblib files are pickle-based and can execute code when loaded;
    # only point model_path at files from a trusted source.
    classifier = joblib.load(model_path)
    # predict is called with a one-element list holding the cleaned text.
    job_title = classifier.predict([processed_email])
    return job_title

In [5]:
# Sample job advert (a "Stress Engineer" posting) passed to `model` further down.
test = "Stress Engineer Glasgow Salary **** to **** We re currently looking for talented engineers to join our growing Glasgow team at a variety of levels. The roles are ideally suited to high calibre engineering graduates with any level of appropriate experience, so that we can give you the opportunity to use your technical skills to provide high quality input to our aerospace projects, spanning both aerostructures and aeroengines. In return, you can expect good career opportunities and the chance for advancement and personal and professional development, support while you gain Chartership and some opportunities to possibly travel or work in other offices, in or outside of the UK. The Requirements You will need to have a good engineering degree that includes structural analysis (such as aeronautical, mechanical, automotive, civil) with some experience in a professional engineering environment relevant to (but not limited to) the aerospace sector. You will need to demonstrate experience in at least one or more of the following areas: Structural/stress analysis Composite stress analysis (any industry) Linear and nonlinear finite element analysis Fatigue and damage tolerance Structural dynamics Thermal analysis Aerostructures experience You will also be expected to demonstrate the following qualities: A strong desire to progress quickly to a position of leadership Professional approach Strong communication skills, written and verbal Commercial awareness Team working, being comfortable working in international teams and self managing PLEASE NOTE SECURITY CLEARANCE IS REQUIRED FOR THIS ROLE Stress Engineer Glasgow Salary **** to ****"

In [6]:
# Smoke test: run the prediction on a short sentence that is not a job advert.
a = "This is a string"
model(a)

array(['other'], dtype=object)

In [7]:
# Predict the job title for the sample advert stored in `test`.
model(test)

array(['other'], dtype=object)