## ライブラリのインポート

In [None]:
import os,sys,re
import csv
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')
from tqdm.autonotebook import tqdm
from time import sleep

# import openpyxl

import json

from nltk.util import ngrams
from janome.tokenizer import Tokenizer
from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))
# 形態素解析とストップワードの削除

from gensim.models import Word2Vec


In [None]:

# Base locations: the project root and the data directory beneath it.
path = "/workspace"
data_path = os.path.join(path, "data")

## データのロード

In [None]:
# Load the input corpora.
# Resumes: re-index from 0 and keep index labels 0..100. Label-based .loc
# slicing includes the end label, so up to 101 rows are retained.
cv_df = (
    pd.read_csv(os.path.join(data_path, "input/resume_corpus.csv"))
    .reset_index(drop=True)
    .loc[:100]
)
# Job descriptions: keep only the 'Engineering Jobs' category, re-indexed from 0.
JDs_df = (
    pd.read_csv(os.path.join(data_path, "input/Train_rev1.csv"))
    .query("Category == 'Engineering Jobs'")
    .reset_index(drop=True)
)
# Data-engineer job descriptions, used unfiltered.
DE_JDs_df = pd.read_csv(os.path.join(data_path, 'input/DataEngineer.csv'))

## 前処理

### テキストの処理からスキル抽出

In [None]:
# Text pre-processing for a whole Series of documents.
def re_clean(_df):
    """Normalize every text in a Series and return the results as a dict.

    Each value is coerced with ``str()`` (so a missing value such as NaN
    becomes the text ``"nan"``), lower-cased, stripped of HTML-like tags and
    http(s) URLs, and reduced to ASCII letters separated by single spaces:
    digits and every other character act as separators.

    Args:
        _df: pandas Series (any object exposing ``to_dict()``) of raw texts.

    Returns:
        dict mapping each index label of ``_df`` to its cleaned string.
    """
    cleaned = _df.to_dict()
    for key, raw in cleaned.items():
        # Lower-case.
        txt = str(raw).lower()
        # Remove HTML-like tags.
        txt = re.sub(r'<.*?>', '', txt)
        # Remove URLs. A raw string keeps the regex escapes (\w, \?, ...) from
        # being parsed as invalid string escapes.
        txt = re.sub(r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+", "", txt)
        # Replace special characters with a space.
        txt = re.sub(r"[^a-zA-Z0-9]", " ", txt)
        # Replace every run of digits with a space.
        txt = re.sub(r"\d+", " ", txt)
        # Collapse whitespace last, so the spaces introduced by the digit
        # replacement above do not leave runs of blanks behind.
        cleaned[key] = re.sub(r"\s+", " ", txt)
    return cleaned



In [None]:
# imports
import spacy
from spacy.matcher import PhraseMatcher

# load default skills data base
from skillNer.general_params import SKILL_DB
# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor
!python -m spacy download en_core_web_lg

In [None]:
def get_skiller(_dict):
    """Extract skill phrases from every text in ``_dict`` with skillNer.

    For each entry the text is annotated and the ``doc_node_value`` of every
    ``full_matches`` hit, followed by every ``ngram_scored`` hit, is collected.

    Args:
        _dict: dict mapping a document key to its (cleaned) text. It is
            modified in place: each text is replaced by ``{"skill": [...]}``
            and entries whose annotation fails are removed.

    Returns:
        The same ``_dict`` object, now mapping each surviving key to
        ``{"skill": list_of_matched_strings}``. The list may be empty.

    Side effects:
        Loads the ``en_core_web_lg`` spaCy model and prints the number of
        documents that failed.
    """
    nlp = spacy.load("en_core_web_lg")
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    count = 0
    # Iterate over a snapshot of the keys because failed entries are popped.
    for key in tqdm(list(_dict.keys())):
        try:
            results = skill_extractor.annotate(_dict[key])["results"]
            skill_list = [hit["doc_node_value"] for hit in results["full_matches"]]
            skill_list.extend(hit["doc_node_value"] for hit in results["ngram_scored"])
        except Exception:
            # Best effort: drop documents the extractor cannot handle.
            # ``Exception`` (not a bare except) lets Ctrl+C abort a long run.
            _dict.pop(key)
            count += 1
            continue
        # Always store the "skill" key, even when only full matches (or no
        # matches at all) were found, so every entry has the same layout.
        _dict[key] = {"skill": skill_list}
    print(count)
    return _dict

In [None]:
# Skill extraction for the three corpora.
# NOTE: according to the original author, the full run takes more than 15 hours.
cv_skills = get_skiller(re_clean(cv_df["cv"]))
JDs_skills = get_skiller(re_clean(JDs_df["FullDescription"]))
DE_skills = get_skiller(re_clean(DE_JDs_df["Job Description"]))

In [None]:
#格納用のディレクトリの作成
!mkdir -p /workspace/data/output/

In [None]:
# Persist the extracted skills, one JSON file per corpus.
for _filename, _skills in (
    ('output/skills_cv_100.json', cv_skills),
    ('output/skills_JDs_1000.json', JDs_skills),
    (r'output/skills_DataEngineer_JDs_2528.json', DE_skills),
):
    with open(os.path.join(data_path, _filename), "w") as f:
        json.dump(_skills, f)

## 形態素解析

In [None]:
# Reload the extracted skills from disk.
def _read_json(relative_path):
    """Return the parsed content of a JSON file located under ``data_path``."""
    with open(os.path.join(data_path, relative_path), "r") as f:
        return json.load(f)


skill_cv = _read_json(r'output/skills_cv_100.json')
skill_JDs = _read_json(r'output/skills_JDs_1000.json')
skill_DE = _read_json(r'output/skills_DataEngineer_JDs_2528.json')

In [None]:
# Lower-case every extracted skill.
def lower(x):
    """Return ``x`` converted to lower case."""
    return x.lower()


# The three collections are processed identically, so one loop handles them
# all: each value is expected to be a dict carrying a "skill" list.
for _skills_by_doc in (skill_DE, skill_JDs, skill_cv):
    for _entry in _skills_by_doc.values():
        try:
            _entry["skill"] = list(map(lower, _entry["skill"]))
        except (KeyError, TypeError, AttributeError):
            # Best effort: leave entries without a usable "skill" list
            # untouched. Only the expected lookup/type errors are ignored, so
            # interrupts and unrelated failures still surface.
            continue

In [None]:
# Load the English stop-word list.
from nltk.util import ngrams
from janome.tokenizer import Tokenizer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# Set of English stop words; Token() filters tokens against it.
stop_words = set(stopwords.words('english'))

In [None]:
# Text cleaning for a single document.
def re_clean_text(_text):
    """Normalize one text string.

    The text is lower-cased, stripped of HTML-like tags and http(s) URLs, and
    reduced to ASCII letters separated by single spaces: digits and every
    other character act as separators.

    Args:
        _text: the raw text; must provide ``lower()`` (i.e. be a ``str``).

    Returns:
        The cleaned string.
    """
    # Lower-case.
    txt = _text.lower()
    # Remove HTML-like tags.
    txt = re.sub(r'<.*?>', '', txt)
    # Remove URLs. A raw string keeps the regex escapes (\w, \?, ...) from
    # being parsed as invalid string escapes.
    txt = re.sub(r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+", "", txt)
    # Replace special characters with a space.
    txt = re.sub(r"[^a-zA-Z0-9]", " ", txt)
    # Replace every run of digits with a space.
    txt = re.sub(r"\d+", " ", txt)
    # Collapse whitespace last, so the spaces introduced by the digit
    # replacement above do not leave runs of blanks behind.
    return re.sub(r"\s+", " ", txt)

# Tokenization and stop-word removal.
# Cache for the janome tokenizer: janome's documentation recommends reusing a
# Tokenizer instance instead of constructing one per call.
_TOKENIZER_CACHE = {}


def Token(_text):
    """Split ``_text`` into tokens and drop blanks and stop words.

    The text is coerced with ``str()`` and tokenized with a janome
    ``Tokenizer`` in wakati mode. Tokens that are empty after stripping
    whitespace, or that appear in the module-level ``stop_words`` set, are
    discarded.

    Args:
        _text: the text to tokenize.

    Returns:
        list of the remaining token strings, in their original order.
    """
    tokenizer = _TOKENIZER_CACHE.get("wakati")
    if tokenizer is None:
        # Built lazily on first use, then shared by all later calls.
        tokenizer = _TOKENIZER_CACHE["wakati"] = Tokenizer(wakati=True)
    return [
        token
        for token in tokenizer.tokenize(str(_text))
        if token.strip() != '' and token not in stop_words
    ]

# Treat extracted two-word skills such as "deep learning" as a single token.
def re_token(_token, skill_list):
    """Merge adjacent token pairs that form a known two-word skill.

    Tokens are scanned left to right. When a token and its successor, joined
    by one space, appear in ``skill_list``, the pair is emitted as that single
    joined string and both tokens are consumed; otherwise the token is emitted
    unchanged. Every input token therefore appears in the output exactly once,
    either on its own or as part of a merged skill. Only two-word skills are
    merged; longer phrases are left as separate tokens.

    Args:
        _token: list of token strings.
        skill_list: iterable of skill strings to look up.

    Returns:
        A new list of tokens with skill bigrams merged.
    """
    # Set lookup avoids rescanning the whole skill list for every bigram.
    skills = set(skill_list)
    merged = []
    total = len(_token)
    pos = 0
    while pos < total:
        if pos + 1 < total:
            pair = " ".join((_token[pos], _token[pos + 1]))
            if pair in skills:
                merged.append(pair)
                pos += 2  # both words of the skill are consumed
                continue
        merged.append(_token[pos])
        pos += 1
    return merged

In [None]:
def marge_json(skill_dict, id_and_full_text_df):
    """Combine extracted skills with the cleaned text of each document.

    Args:
        skill_dict: dict mapping a row position (an int or its string form,
            as produced by a JSON round trip) to ``{"skill": [...]}``.
        id_and_full_text_df: two-column DataFrame; column 0 holds the document
            ID and column 1 the raw text, addressed by row position.

    Returns:
        dict keyed by ``str(document_id)``. Each value holds:
        ``"text"`` (cleaned text), ``"skill"`` (skills as given),
        ``"unique_skill"`` (skills de-duplicated in first-seen order),
        ``"token"`` (tokens of the cleaned text) and ``"re_token"`` (tokens
        with two-word skills merged). Documents that cannot be processed
        (for example no "skill" entry or non-string text) are skipped.

    Raises:
        ValueError: if a key of ``skill_dict`` cannot be converted to ``int``.
    """
    merged = {}
    for key_index in skill_dict:
        row = int(key_index)
        try:
            # The ID and the text must come from the same row.
            doc_id = id_and_full_text_df.iloc[row, 0]
            skills = skill_dict[key_index]["skill"]
            text = re_clean_text(id_and_full_text_df.iloc[row, 1])
            tokens = Token(text)
            merged[str(doc_id)] = {
                "text": text,
                "skill": skills,
                # dict.fromkeys keeps first-occurrence order while removing
                # duplicates.
                "unique_skill": list(dict.fromkeys(skills)),
                "token": tokens,
                "re_token": re_token(tokens, skills),
            }
        except Exception:
            # Best effort: skip documents that cannot be processed.
            # ``Exception`` (not a bare except) still lets Ctrl+C through.
            continue

    return merged

In [None]:
#格納用のディレクトリの作成
!mkdir -p /workspace/data/output/marge_json

In [None]:
# Build the merged records for each corpus and dump them as JSON.
# Each entry: output file, extracted skills, source frame, [id column, text column].
for _out_name, _skills, _frame, _columns in (
    ("output/marge_json/marge_cv.json", skill_cv, cv_df, [0, 11]),
    ("output/marge_json/marge_JDs.json", skill_JDs, JDs_df, [0, 2]),
    ("output/marge_json/marge_DataEngineer.json", skill_DE, DE_JDs_df, [0, 3]),
):
    with open(os.path.join(data_path, _out_name), "w") as f:
        json.dump(marge_json(_skills, _frame.iloc[:, _columns]), f)