# Text processing

Applying NLP techniques to two columns: job requirements and descriptions.

In [1]:
# -*- coding: utf-8 -*-
import operator
import re
import unicodedata
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Matches simple HTML tags, <span style="..."> openers, CR/LF/commas and HTML
# entities. The style value uses [^"]* (not .*) so one match cannot run from
# the first span opener to the last '">' on the line and swallow real text.
_TAG_PATTERN = re.compile(r'(</*\w+\s*/*>)|(<span style="[^"]*">)|[\r\n,]|(&\w+;)')


def remove_tags(document):
    """Strip HTML markup and non-ASCII characters from a document.

    The text is NFKD-normalised and encoded to ASCII with errors ignored, so
    accented Latin letters lose their accents and characters without an ASCII
    decomposition (e.g. Arabic letters) are dropped. Simple HTML tags,
    ``<span style="...">`` openers, HTML entities, carriage returns, newlines
    and commas are then deleted.

    NOTE: matches are deleted rather than replaced by a space, so words that
    are separated only by a tag, newline or comma end up joined
    (``'a,b'`` becomes ``'ab'``).

    Args:
        document: The raw text, either as text or as UTF-8 encoded bytes.

    Returns:
        The cleaned text as a native ``str``.
    """
    # Accept UTF-8 bytes (a Python 2 ``str``) as well as already-decoded text.
    if isinstance(document, bytes):
        document = document.decode('utf-8')
    ascii_text = unicodedata.normalize('NFKD', document).encode('ascii', 'ignore')
    # On Python 3 the encoded value is bytes; turn it back into text so the
    # str pattern can be applied. On Python 2 it is already a native str.
    if not isinstance(ascii_text, str):
        ascii_text = ascii_text.decode('ascii')
    return _TAG_PATTERN.sub('', ascii_text)

def filter_stem_doc(document):
    """Tokenize a document, drop stop words and stem what remains.

    Args:
        document: The text to process.

    Returns:
        A list of Snowball-stemmed tokens, excluding English stop words and
        single-character tokens.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # The NLTK stop-word list is lowercase, so the membership test must be
    # case-insensitive; otherwise capitalised stop words ("The", "And") pass
    # the filter and are then lowercased by the stemmer into the output.
    return [
        stemmer.stem(token)
        for token in word_tokenize(document)
        if len(token) > 1 and token.lower() not in stop_words
    ]

In [3]:
# Load the dataset CSV into a DataFrame; its job_requirements and description
# columns are processed below.
data = pd.read_csv('./data/fixed/data_science_dataset_wuzzuf.csv')

In [4]:
def _count_stems(documents):
    """Return a Counter of stemmed-word occurrences across all documents."""
    counts = Counter()
    for document in documents:
        counts.update(filter_stem_doc(document))
    return counts


def _write_word_frequencies(path, word_counts):
    """Write (word, count) pairs to ``path`` as a two-column CSV with a header."""
    # Text mode: a file opened with 'wb' rejects str writes on Python 3.
    with open(path, 'w') as f:
        f.write('word,occurrences\n')
        f.writelines('{},{}\n'.format(word, count) for word, count in word_counts)


# Drop rows with no text and renumber the remaining ones from zero.
requirements = data.job_requirements.dropna().reset_index(drop=True)
descriptions = data.description.dropna().reset_index(drop=True)

# Strip markup and non-ASCII characters from every document.
requirements = requirements.apply(remove_tags)
descriptions = descriptions.apply(remove_tags)

# Iterating a Series yields its values directly, so no index is needed
# (Series.iteritems() no longer exists in pandas 2.x).
requirements_words = _count_stems(requirements)
descriptions_words = _count_stems(descriptions)

# (word, count) pairs, most frequent first. The misspelt name
# ``sorted_reqirements`` is kept in case other cells refer to it.
sorted_reqirements = requirements_words.most_common()
sorted_descriptions = descriptions_words.most_common()

_write_word_frequencies('./data/generated/dicts/requirements_words_freq.csv', sorted_reqirements)
_write_word_frequencies('./data/generated/dicts/descriptions_words_freq.csv', sorted_descriptions)

-----