In [1]:
"""
Import statements
"""

import pandas as pd
import itertools
import ast
import re

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
""" 
Normalizing functions 
"""

# Single Porter stemmer instance shared by stem_word below.
porter = PorterStemmer()
# English stopwords from NLTK, held in a set for the `in stoplist` membership
# tests used throughout the notebook.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded
# beforehand (nltk.download('stopwords')) - confirm for fresh environments.
stoplist = set(stopwords.words('english'))

def norm_sent(sentence):
    """Normalize a sentence into a list of stemmed, stopword-free tokens.

    The sentence is lower-cased and split on whitespace. Every token is then
    passed through ``norm_word`` (stemming plus removal of every character
    that is not a letter, a digit or ``%``).

    Args:
        sentence: The text to normalize (``str``).

    Returns:
        list[str]: The normalized tokens in their original order. Stopwords
        and tokens that normalize to the empty string are left out.
    """
    clean = []
    # str.split() without arguments already discards surrounding whitespace,
    # so the tokens need no extra strip().
    for word in sentence.lower().split():
        # Test the unstemmed token first: stemming can rewrite a stopword into
        # a form that is no longer in the stoplist. This also matches
        # preprocess_df, which filters stopwords before stemming.
        if word in stoplist:
            continue
        clean_word = norm_word(word)
        # Tokens consisting only of punctuation normalize to "" and carry no
        # information, so they are dropped together with stopwords.
        if clean_word and clean_word not in stoplist:
            clean.append(clean_word)
    return clean


# Use this normalizing function for information-extraction purposes
def norm_sentence(sent):
    """Lower-case a sentence and split it into whitespace-separated tokens.

    No stemming, punctuation stripping or stopword removal is applied.

    Args:
        sent: The text to tokenize, or ``None``.

    Returns:
        list[str]: The lower-cased tokens. An empty list is returned when
        ``sent`` is ``None`` so that callers can iterate over the result
        without a separate ``None`` check.
    """
    # Identity check: `!= None` goes through __ne__ and can misbehave for
    # objects that overload comparison.
    if sent is None:
        return []
    return sent.lower().split()


def stem_word(wrd):
    """Return the stem of ``wrd`` as produced by the shared Porter stemmer."""
    stemmed = porter.stem(wrd)
    return stemmed


def norm_word(wrd):
    """Return the stem of ``wrd`` reduced to letters, digits and ``%``.

    Args:
        wrd: A single token (``str``).

    Returns:
        str: The cleaned, stemmed token. The empty string is returned when the
        token contains no letter, digit or ``%`` at all.
    """
    # Remove punctuation *before* stemming. The stemmer's suffix rules look at
    # the end of the token, so a trailing comma, full stop or bracket would
    # otherwise hide the suffix and leave the word unstemmed.
    stripped = re.sub(r'[^a-zA-Z0-9%]', '', wrd)
    if not stripped:
        # Nothing left to stem (token was pure punctuation or empty).
        return ''
    return stem_word(stripped)

In [3]:
""" 
Functions for preprocessing CVs 
"""

def open_file(file):
    """Read a text file and return its lines.

    Args:
        file: Path of the file to read (anything accepted by ``open``).

    Returns:
        list[str]: The lines of the file as returned by ``readlines()``,
        i.e. each line keeps its trailing newline character.
    """
    # The context manager closes the handle even when reading raises, and a
    # separate name avoids rebinding the `file` parameter to the handle.
    with open(file, "r") as handle:
        return handle.readlines()

def preprocess_one_file(file):
    """Turn the lines of a single CV into one flat list of normalized tokens.

    Each line is tokenized with ``norm_sentence`` and every token is cleaned
    and stemmed with ``norm_word``.

    Args:
        file: Iterable of text lines, e.g. the list returned by ``open_file``.

    Returns:
        list[str]: All normalized tokens of the document in reading order.
        Stopwords and tokens that normalize to the empty string are left out.
    """
    texts = []
    for line in file:
        for word in norm_sentence(line):
            # Filter on the unstemmed token first: stemming can turn a
            # stopword into a form that is no longer in the stoplist. This
            # also matches preprocess_df, which filters before stemming.
            if word in stoplist:
                continue
            clean_word = norm_word(word)
            # Punctuation-only tokens normalize to "" and are skipped.
            if clean_word and clean_word not in stoplist:
                texts.append(clean_word)
    return texts

def preprocess_more_files(file, signal):
    """Split the lines of a multi-CV document into one token list per CV.

    Lines are tokenized with ``norm_sentence`` and each token is cleaned and
    stemmed with ``norm_word``. A normalized token that matches ``signal``
    marks the boundary between two CVs and is not stored itself.

    Args:
        file: Iterable of text lines, e.g. the list returned by ``open_file``.
        signal: Regular expression (pattern string or compiled pattern) that
            is searched for in every *normalized* token, so it must be written
            against the form of the separator after ``norm_word`` has removed
            punctuation.

    Returns:
        list[list[str]]: One list of normalized tokens per CV, in document
        order. Tokens found before the first separator form the first entry.
        Empty CVs are never emitted, so an empty input yields ``[]``.
    """
    # Resolve the pattern once instead of on every token.
    separator = re.compile(signal)
    cvs = []
    one_cv = []

    for cv_lines in file:
        for cv_word in norm_sentence(cv_lines):
            # Filter on the unstemmed token first: stemming can turn a
            # stopword into a form that is no longer in the stoplist.
            if cv_word in stoplist:
                continue
            clean_word = norm_word(cv_word)
            # Skip punctuation-only tokens ("" after cleaning) and stopwords.
            if not clean_word or clean_word in stoplist:
                continue
            if separator.search(clean_word):
                # A separator closes the CV collected so far; back-to-back
                # separators must not produce empty CVs.
                if one_cv:
                    cvs.append(one_cv)
                    one_cv = []
            else:
                one_cv.append(clean_word)

    # Flush the last CV with the same non-empty guard as inside the loop, so
    # an empty input or a trailing separator does not add a spurious [].
    if one_cv:
        cvs.append(one_cv)
    return cvs

# Use the string form of the document when working with regular expressions
def txt_to_str(doc, signal=None):
    """Load a text file as one lower-cased string, optionally split into parts.

    Newlines and non-breaking spaces (``\\xa0``) are replaced by plain spaces.

    Args:
        doc: Path of the text file to read.
        signal: Optional regular expression. When given, the text is split on
            it and everything before the first match is discarded.

    Returns:
        str | list[str]: The whole cleaned text when ``signal`` is ``None``;
        otherwise the list of pieces that follow each match of ``signal``.
    """
    with open(doc, 'r') as handle:
        raw_text = handle.read()

    flattened = raw_text.replace('\n', ' ').lower().replace('\xa0', ' ')

    if signal is not None:
        # re.split always yields at least one element; the first one is the
        # text preceding the first separator and is dropped.
        return re.split(signal, flattened)[1:]
    return str(flattened)
    

In [4]:
""" 
Code for preprocessing document of CVs 
The file "linkedin_cvs_10052021" was collected by me using the following Google search:
site:linkedin.com/in/ AND "data scientist". The resulting PDFs were transformed to txt files by hand.
The CVs are stored as a list of lists (one inner list per CV).
"""

# Raw lines of the LinkedIn CV dump; input for preprocess_more_files below.
read_li_file = open_file("linkedin_cvs_10052021.txt")
# The same file as lower-cased text pieces split on the separator. The raw text
# still contains the hyphen, so the pattern includes it. Raw string: "\d" in a
# normal string literal is an invalid escape sequence.
li_cvs_txt = txt_to_str("linkedin_cvs_10052021.txt", r'linkedin-\d\d')
# Token lists per CV. The pattern is matched against tokens that went through
# norm_word, which removes the hyphen, hence no "-" here.
li_cvs = preprocess_more_files(read_li_file, r'linkedin\d\d')

In [5]:
""" 
Code for preprocessing real life examples of vacancies 
These examples are provided by Thom 
The job title is normalized and stemmed 
The job description is already normalized, we only have to stem it 
The result is stored in a dict, the key is the row in the Dataframe
so you can extract the name of the company/title of the vacancy
The first value is the job title, the second is the job description
"""


def preprocess_df(dataframe):
    """Build a dict of normalized vacancy titles and stemmed descriptions.

    The first column of ``dataframe`` is read as the job title and the second
    column as the job description (columns are addressed by position).

    Args:
        dataframe: ``pandas.DataFrame`` whose second column holds, per row,
            the string representation of a nested list of words (it is parsed
            with ``ast.literal_eval`` and flattened one level).

    Returns:
        dict[int, list]: Maps the 1-based row position to
        ``[clean_title, stem_vac]``, where ``clean_title`` is the title
        normalized by ``norm_sent`` (an empty list when the title is missing)
        and ``stem_vac`` is the list of stemmed description words without
        stopwords. Rows with a missing description are skipped.
    """
    vac_dict = dict()
    for i in range(1, len(dataframe) + 1):
        title = dataframe.iat[i - 1, 0]
        numb = dataframe.iat[i - 1, 1]
        if pd.isna(numb):
            continue

        # A missing title must not leave clean_title unbound (first row) or
        # carry over the title of the previous row, so fall back to [].
        clean_title = [] if pd.isna(title) else norm_sent(title)

        # The description is a list of word lists; flatten it one level.
        tot_vac = itertools.chain.from_iterable(ast.literal_eval(numb))
        stem_vac = [stem_word(wrd) for wrd in tot_vac if wrd not in stoplist]
        vac_dict[i] = [clean_title, stem_vac]

    return vac_dict

# Load the vacancy examples from disk.
data = pd.read_csv(r'clean_50_job_descriptions.csv')
# Keep only the title and description columns, in this order: preprocess_df
# reads them by position (column 0 = title, column 1 = description).
pre_df = pd.DataFrame(data, columns= ['title', 'description'])
# Drop the row with index label 0.
# NOTE(review): the reason is not stated here - presumably a header/example row; confirm.
df = pre_df.drop([0], axis=0)
# Keys of vac_dict are 1-based row positions within df.
vac_dict = preprocess_df(df)