This is a rework of https://sparknlp.org/2021/01/03/ner_ud_kaist_glove_840B_300d_ko.html,
modified for Korean, with a tokenizer and a visualizer added.

The code may be updated at https://github.com/aria1th/Korean-sentense-NER-Notebook
Use `wget https://raw.githubusercontent.com/aria1th/Korean-sentense-NER-Notebook/main/NER.ipynb` to download this notebook.
SYSTEM REQUIREMENTS

The model requires at least 2.4 GB of RAM. For safety, we increase the Spark memory to 8 GB.

In [None]:
# Install PySpark and Spark NLP
%pip install pyspark spark-nlp

# Install Spark NLP Display lib
%pip install --upgrade -q spark-nlp-display
# Install KoNLPy
%pip install konlpy
%pip install tqdm
# eunjeon provides the Mecab tokenizer (imported by get_tokenizer's 'Mecab' option)
%pip install eunjeon --upgrade


In [None]:
# Tokenizer
def get_tokenizer(option:str = 'Komoran', base='konlpy.tag', mecab_dict=''):
    """Create a Korean morphological analyzer (tokenizer) by class name.

    Args:
        option: Tokenizer class name. 'Komoran', 'Okt', 'Hannanum' and 'Kkma'
            are loaded from ``konlpy.tag``; 'Mecab' is loaded from ``eunjeon``.
            Any other name is looked up dynamically in ``base`` after a
            warning is printed.
        base: Dotted module path searched for names that are not built in.
            It is ignored for the five built-in options.
        mecab_dict: Optional dictionary argument forwarded to ``Mecab``; only
            used when ``option == 'Mecab'``. A falsy value constructs
            ``Mecab()`` with no arguments.

    Returns:
        A newly constructed tokenizer instance.
    """
    konlpy_options = ('Komoran', 'Okt', 'Hannanum', 'Kkma')

    if option == 'Mecab':
        # Mecab comes from eunjeon rather than konlpy, and is the only
        # tokenizer here that accepts a custom dictionary.
        from eunjeon import Mecab
        return Mecab(mecab_dict) if mecab_dict else Mecab()

    if option in konlpy_options:
        # Built-in names always resolve against konlpy.tag, regardless of `base`.
        return _get_tokenizer(option, 'konlpy.tag')

    # Unknown name: best-effort dynamic lookup. The message is an f-string so
    # the requested class and module actually appear in the warning.
    print(f"[Warn] Fallbacking to dynamic import for {option} from {base}, this may not work properly")
    return _get_tokenizer(option, base)


def _get_tokenizer(clsname:str, base='konlpy.tag'):
    """Import module ``base`` and return a new instance of its ``clsname`` attribute.

    Raises:
        ImportError: If ``base`` cannot be imported.
        AttributeError: If ``base`` has no attribute named ``clsname``.
    """
    import importlib

    module = importlib.import_module(base)
    return getattr(module, clsname)()


In [None]:
import json
import pandas as pd
import numpy as np

from tqdm import tqdm

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType
from pyspark import SparkContext


In [None]:
# Start (or obtain) the Spark session through Spark NLP.
spark = sparknlp.start(memory='8G')  # memory raised to 8G for safety (see the system requirements note)

# Report the library versions in use.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Bare expression: in a notebook this displays the session object as the cell output.
spark

In [None]:
# Language code used by this notebook.
# NOTE(review): no cell visible here reads this variable; the pretrained()
# calls pass their language codes ("ko" / "xx") as literals.
language = 'ko'

In [None]:
# Raw input sentences, not tokenized yet.
# The sample sentence reads roughly: "The Mona Lisa is a portrait of a woman
# painted by Leonardo da Vinci, and it is held in the Louvre Museum in Paris."
text_list = ["""모나리자는 레오나르도 다 빈치에 의해 그려진 어떤 여인의 초상화로, 파리의 루브르 박물관에 소장되어 있다."""]

In [None]:
# Stage 1: wrap the raw 'text' column into a 'document' annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: derive a 'sentence' column from the 'document' column.
sentence_detector = (
    SentenceDetector()
    .setInputCols('document')
    .setOutputCol('sentence')
)

In [None]:
# Load the pretrained Korean NER model (per the original note, set this up first).
# It reads the 'document', 'token' and 'embeddings' columns and writes 'ner'.
ner = (
    NerDLModel.pretrained("ner_kmou_glove_840B_300d", "ko")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

In [None]:
# Load the pretrained Korean word segmenter; it reads 'sentence' and writes 'token'.
word_segmenter = (
    WordSegmenterModel.pretrained("wordseg_kaist_ud", "ko")
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

In [None]:
# setup word embeddings (not a POS tagger); this is a large model, so loading it takes time
embeddings = (
    WordEmbeddingsModel.pretrained("glove_840B_300", "xx")
    .setInputCols("document", "token")
    .setOutputCol("embeddings")
)

In [None]:
# Converter stage: reads 'sentence', 'token' and 'ner', and writes 'ner_chunk'
# (the column the visualizer is later pointed at).
ner_converter = (
    NerConverter()
    .setInputCols(['sentence', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

In [None]:
# Assemble the pipeline; stages are listed in the order they run.
nlp_pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentence_detector,
        word_segmenter,
        embeddings,
        ner,
        ner_converter,
    ]
)

In [None]:
# Tokenize first.
# It is not trivial to implement a custom tokenizer as a Pipeline stage, so we simply assume documents are already 'tokenized' before they enter the pipeline.

def tokenize(texts:list[str], tokenizer_type:str = 'Komoran', **kwargs):
    """Split each text into morphemes and re-join them with single spaces.

    Args:
        texts: Raw sentences to process.
        tokenizer_type: Tokenizer name forwarded to ``get_tokenizer``.
        **kwargs: Extra keyword arguments forwarded to ``get_tokenizer``.

    Returns:
        A list with one space-joined morpheme string per input text, in the
        same order as ``texts``.
    """
    morph_analyzer = get_tokenizer(tokenizer_type, **kwargs)
    # tqdm wraps the input only to show a progress bar while iterating.
    return [' '.join(morph_analyzer.morphs(sentence)) for sentence in tqdm(texts)]

# Pre-tokenize the raw sentences with Komoran; each entry becomes a
# space-joined morpheme string.
tokenized = tokenize(text_list, 'Komoran')

In [None]:
# Build a single-column Spark DataFrame named 'text' from the tokenized strings.
df = (
    spark
    .createDataFrame(tokenized, StringType())
    .toDF("text")
)

In [None]:
# Run the pipeline: fit it on df, then apply the fitted model to the same DataFrame.
result = nlp_pipeline.fit(df).transform(df)

In [None]:
# Visualize the NER result.
from sparknlp_display import NerVisualizer

# Only the first collected row is displayed; any further rows in `result`
# are not visualized by this cell.
NerVisualizer().display(
    result = result.collect()[0],
    label_col = 'ner_chunk',
    document_col = 'document'
)