In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'thaijo-researcher:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F67476%2F7600019%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240210%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240210T203713Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da7bdda353eab0f86de94f318dd614a6552fe15d6f40994aea35a9dcc4963ae9cedee0617bf2e13a24a212f907d0a3df9b8882527cdde0e5383a0be7be28d124394275247340b8e0de6d495d9e72c067e0f4216dbc9a9cc4fde7ad522c91545e12bdbb291de3871782bb267e79c55b79a55d0a98118a1d6c1f9d4f4888a6261ba3a591fac763ad0ed70363ef5e2781326a2c51ad56f0d73567ab0c23c19aafdbe2dc207eb7adbbad2659c812ea271bf87c26f269fb6d22fba87be5256146b18bc949d121ef7899ef19b60decac1c720e860e23d544ba87b77773da801bab12089db51d3903d8d23d28fa81c7df2e8cbdfaca649069e3476bf8bcd698b54ea160e,start-point:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4423360%2F7598906%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240210%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240210T203713Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4df2e9f74be908abe1151027104aa5b433e9b82b28ae30a850de89df47fe1fb8631a70a9b367f950c5a153ff220b576aee25605fadfc020dbe2f0fe55a4918eb691b0f18dda06262f4495fd1d7580e82ea39f96c8c6a4be805e42756b53866d2afe696461a06b5d6d44b3ba04a716299705a2cbe0e989bb6edf169a653764962a5c65f101414c2bf7aeba233b518fca6941414f6b5d339508b9f16c68fecb94ed52cdd5f5cd308e837938b8f3bc76aab4ea507c4ed4cb2072643fb918a6b583fb48d08d63900d736569f979b96d167ee0936836eb92c1b0721295524c2813c8da099f774949252d33d05a9aac1a2eda6606cab18932a603cfd5a6b7c4473655c,b90052:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4423377%2F7598929%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X
-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240210%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240210T203713Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D747d8c5f5c2049b58a50fe915561678cda82918feb0bd901b3125993c201710ebcf25823fb279e5bbf6d7bc1248044eff941bc361b626f3bef5475f60fcc944d9a2fc51f71afbc90a9f62e66c7ca7aa5eae36ed1cf5143445f5d8b0230ce04e3288b3f9eaa3f406ffdf31a7571f18b8e1d45b63357d14d6b388eb3608f000d8762eb9893230e17a87e8ae7f8a28a2ce9cdbcd436a87471068a476d951a7e157f1a905e340f363533a34a201054a5804cf371a48ab7b56b79d0f03f79076abd6251bab948ec1ae3b80bc012bbd323a03cd81c00e54a2950d007f1186ab7a22224e24f23b9231dccff438770ce7d74fc3d5aaedfb9f120dafc8f733252836f3252'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent (headers[...] then yields None);
            # fall back to 0 so the progress bar is simply left empty instead of
            # raising TypeError from int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; `as tarfile` would rebind the
                # imported `tarfile` module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# code เก่า (old code)

In [None]:
import pandas as pd
import regex as re
import numpy as np

def remove_spaces(string):
    """Drop at most one leading and one trailing space character from ``string``.

    Empty and single-space inputs both come back as the empty string.
    """
    if string in ('', ' '):
        return ''
    trimmed = string[1:] if string[0] == ' ' else string
    return trimmed[:-1] if trimmed[-1] == ' ' else trimmed

if __name__ == '__main__':
    # Baseline submission: for every row emit ten "<_id>_<k>" ids (k = 1..10) and
    # fill them with the author followed by the comma-separated co-authors.
    df = pd.read_csv('/kaggle/input/thaijo-researcher/test.csv')
    df = df.fillna('')
    char = {'id': [], 'name': []}
    for idx, row in df.iterrows():
        char['id'] += [f"{row['_id']}_{slot}" for slot in range(1, 11)]

        author = row['_source.author']
        names = [author] if author != '' else []

        co_author = row['_source.co-author']
        if co_author != '':
            for candidate in co_author.split(','):
                # Duplicates are detected on the raw (untrimmed) text.
                if candidate not in names:
                    names.append(candidate)

        if not names:
            char['name'] += [None] * 10
            continue

        # Trim, cap at ten names, then pad with empty strings up to ten slots.
        names = [remove_spaces(name) for name in names][:10]
        names += [''] * (10 - len(names))
        char['name'] += names
    char = pd.DataFrame(char)
    char.to_csv('submit.csv', index=False)

----------------------------------------------------

# code pjaaa

In [None]:
import pandas as pd
import regex as re
import numpy as np
import unicodedata

In [None]:
# def remove_spaces(s:str):
#     return s.strip()

In [None]:
df = pd.read_csv('/kaggle/input/thaijo-researcher/test.csv')
df = df.fillna('')
df = df[["_id", "_source.author", "_source.co-author"]]
df.columns = ["id", "aut", "coaut"]
og = df.copy()
og

## check sep in co-au & aut

### coauthor

In [None]:
# 1 when the co-author field contains ';', else 0; then tally both outcomes.
df["coaut_;"] = df["coaut"].apply(lambda text: int(";" in text))
df["coaut_;"].value_counts()

In [None]:
# 1 when the co-author field contains ',', else 0; then tally both outcomes.
df["coaut_,"] = df["coaut"].apply(lambda text: int("," in text))
df["coaut_,"].value_counts()

### author

In [None]:
# 1 when the author field contains ';', else 0; then tally both outcomes.
df["aut_;"] = df["aut"].apply(lambda text: int(";" in text))
df["aut_;"].value_counts()

In [None]:
# 1 when the author field contains ',', else 0; then tally both outcomes.
df["aut_,"] = df["aut"].apply(lambda text: int("," in text))
df["aut_,"].value_counts()

In [None]:
df[df["aut_,"] == 1]

In [None]:
df["aut"] = df["aut"].apply(lambda x: x.strip(",").replace(";", ","))

## function check anything

In [None]:
from typing import Callable

def is_starts_with_word(string:str, word:str):
    """Tell whether ``string`` begins with the literal text ``word``.

    The match is anchored at position 0 and additionally requires a regex word
    boundary there, so a string starting with a non-word character never matches.
    """
    anchored = r'\b' + re.escape(word)
    return re.match(anchored, string) is not None

def is_contain_word(string:str, word:str):
    """Tell whether the literal text ``word`` occurs anywhere inside ``string``."""
    # The word is escaped first, so regex metacharacters in it match literally.
    found = re.search(re.escape(word), string)
    return found is not None

def is_have_thai_characters(string: str):
    """Return True when at least one character's Unicode name contains 'THAI'.

    Characters without a Unicode name (control characters, private-use code
    points, ...) are treated as non-Thai; ``unicodedata.name`` would otherwise
    raise ValueError for them and abort the whole check.
    """
    return any('THAI' in unicodedata.name(char, '') for char in string)

def check_start_with_word(
    aut: str,
    coaut: str,
    word: str,
    sepa: str = ",",
):
    """Return 1 if any name in ``aut`` or ``coaut`` starts with ``word``, else 0.

    Both fields are split on ``sepa`` and every resulting piece is tested
    individually with ``is_starts_with_word``.
    """
    for field in (aut, coaut):
        for name in field.split(sepa):
            # Test each split piece on its own; the co-author loop previously
            # re-tested the whole unsplit field, so only its first name counted.
            # NOTE(review): pieces are not stripped, so a name preceded by a space
            # after the separator does not match — confirm whether that is wanted.
            if is_starts_with_word(name, word):
                return 1
    return 0

def check_contain_word(
    aut: str,
    coaut: str,
    word: str,
    aut_sepa: str = ";",
    coaut_sepa: str = ",",
):
    """Return 1 if ``word`` occurs in any name of ``aut`` or ``coaut``, else 0.

    ``aut`` is split on ``aut_sepa`` and ``coaut`` on ``coaut_sepa``; the author
    names are examined first.
    """
    for field, separator in ((aut, aut_sepa), (coaut, coaut_sepa)):
        if any(is_contain_word(piece, word) for piece in field.split(separator)):
            return 1
    return 0

def retreive_names_from_df(
    df: pd.DataFrame,
    val_fn: Callable,
    sepa: str = ",",
) -> list[str]:
    """Collect every name in the ``aut`` / ``coaut`` columns accepted by ``val_fn``.

    Each row's ``aut`` value is split on ``sepa`` and scanned first, then its
    ``coaut`` value; pieces for which ``val_fn(piece)`` is truthy are returned in
    encounter order (duplicates are kept).
    """
    matched = []
    for record in df.itertuples():
        for field in (record.aut, record.coaut):
            matched.extend(piece for piece in field.split(sepa) if val_fn(piece))
    return matched

def check_first_coauthor(aut: str, coaut: str):
    """Return 1 when the ``coaut`` text begins with the ``aut`` text, else 0."""
    return int(coaut.startswith(aut))

## scan data

### check first co-author in author

In [None]:
df["same_1_coaut"] = df.apply(
    lambda x: check_first_coauthor(x.aut, x.coaut), axis=1
)
df["same_1_coaut"].value_counts()

In [None]:
df[df["same_1_coaut"] == 0]

### check names starting with "พระ" (Phra)

In [None]:
df["start_phra"] = df.apply(
    lambda x: check_start_with_word(x.aut, x.coaut, "พระ"), axis=1
)
df["start_phra"].value_counts()

In [None]:
df[df["start_phra"] == 1]

In [None]:
all_phras = retreive_names_from_df(
    df,
    val_fn=lambda name: is_starts_with_word(name, "พระ"),
)
print(len(all_phras))

### check element & get tokens

In [None]:
!pip install pythainlp

In [None]:
from pythainlp.tokenize import word_tokenize

def fst_tok(name: str):
    """Return the first newmm token of the trimmed ``name`` ('' when there is none)."""
    tokens = word_tokenize(name.strip(), engine="newmm")
    return tokens[0] if len(tokens) >= 1 else ""

def get_first_tokens(aut: str, coaut: str, sepa: str = ","):
    """Return the set of first tokens of every ``sepa``-separated name in both fields."""
    return {
        fst_tok(piece)
        for field in (aut, coaut)
        for piece in field.split(sepa)
    }

In [None]:
# Gather the distinct first tokens over all rows, then report how many there are.
all_first_tokens = set()
for row in df.itertuples():
    first_tokens = get_first_tokens(row.aut, row.coaut)
    all_first_tokens |= first_tokens

all_first_tokens = sorted(all_first_tokens)
print(len(all_first_tokens))

In [None]:
all_names_with_dot = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, "."),
)
print(len(all_names_with_dot))

In [None]:
all_names_with_colon = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, ":"),
)
print(len(all_names_with_colon))

In [None]:
all_names_with_blanket = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, "(") or is_contain_word(name, ")"),
)
print(len(all_names_with_blanket))

In [None]:
all_names_with_hyphen = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, "-"),
)
print(len(all_names_with_hyphen))

In [None]:
all_names_with_space_hyphen = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, " -"),
)
print(len(all_names_with_space_hyphen))

In [None]:
all_names_with_number = retreive_names_from_df(
    df,
    val_fn=lambda name: bool(re.search(r'\d', name)),
)
print(len(all_names_with_number))

In [None]:
all_names_interested = retreive_names_from_df(
    df,
    val_fn=lambda name: is_contain_word(name, "ว่าที่"),
)
print(len(all_names_interested))

## model

In [None]:
def remove_restrict(col: str):
    """Blank out boilerplate phrases, name titles and digits in a raw author field.

    Every restricted phrase and every title listed below is replaced by a single
    space (plain substring replacement, in list order), all ASCII digit runs are
    deleted, and a handful of hand-written corrections are applied last.

    Args:
        col: raw text of an author / co-author field.

    Returns:
        The cleaned text; names are still joined by their original separators.
    """
    res_words = [
        "บทบรรณาธิการ (Editorial)",
        # NOTE(review): this entry is empty as written; it may originally have been
        # an invisible character lost in copy/paste — confirm. Empty entries are
        # skipped below because str.replace("", " ") would insert a space between
        # every character of the field.
        "",
        "ผู้ทรงคุณวุฒิ -",
        "แนะนำผู้เขียน -",
        "คำแนะนำสำหรับผู้เขียน วารสารวิทยาการจัดการ",
        "กสทช.",
        "และคณะ",
        "and Other",
        "et al",
        "บทบรรณาธิการ",
        "-- --",
        "- -",
        " -",
        "Authors :",
        "(ผู้แต่ง)",
        "*",
        " ๋",
        "dusayu@kku.ac.th",
        'CFA',
        'M.D.',
        'Asst.',
    ]

    # full title or 2-letter-or-more abbreviation
    non_conflict_nametitles = [
        "ร้อยเอก",
        "ม.ร.ว.",
        "รองศาสตราจารย์",
        "รศ.",
        "ผู้ช่วยศาสตราจารย์",
        "ผศ.",
        "ศาสตราจารย์",
        "อ.ดร.",
        "ดร.",
        "อาจารย์",
        "เด็กหญิง",
        "เด็กชาย",
        "ว่าที่ร้อยตรี",
        "ว่าที่ ร.ต.",
        "ร.ต.อ.",
        "ว่าที่ พ.ต.",
        "นพ.",
        "Mr.",
        "Ms.",
        "Jr.",
        "Dr.",
        "Ph.D",
        "Assoc.",
        "Assist.",
        "Prof.",
        "Lect.",
        "Ven.",
        ", M.R.",
        'Mohd.',
        "ศ.",
        ' .',
    ]
    for res_word in res_words + non_conflict_nametitles:
        if res_word:
            col = col.replace(res_word, " ")

    # remove number
    col = re.sub(r"[0-9]+", "", col)

    # Manual rules: one-off fixes for specific malformed entries.
    col = col.replace("(Phrakrusuteejariyawat)", ", Phrakrusuteejariyawat")
    col = col.replace("์Nantawut Jampangam", "Nantawut Jampangam")
    col = col.replace("ืnatthabhol khanthachai", "natthabhol khanthachai")
    col = col.replace("somboon ๋jarukasemthawee", "somboon jarukasemthawee")

    return col

def seperate_names(col: str):
    """Split a comma-separated field into trimmed, non-empty name strings."""
    trimmed = col.strip().strip(",").strip()
    pieces = (piece.strip() for piece in trimmed.split(","))
    return [piece for piece in pieces if piece != ""]

def filter_with_general_rules(name: str):
    """Return False for entries that look like organisations or journal boilerplate.

    An entry is rejected when it starts with one of ``start_with_terms`` or
    contains one of ``contain_terms``; everything else is kept (True).
    """
    start_with_terms = [
        "กองบรรณาธิการ",
        "คณะ",
        "สำนัก",
        "Cover ",
        "เจ้าหน้าที่",
    ]
    contain_terms = [
        # A missing comma used to fuse the next two entries into the single,
        # never-matching term "EditorialAuthor".
        "Editorial",
        "Author",
        "Journal",
        "วารสาร",
        "มหาวิทยาลัย",
        "วิทยาลัย",
        "Thailand",
        "Subcommittee",
    ]
    if any(is_starts_with_word(name, term) for term in start_with_terms):
        return False
    if any(is_contain_word(name, term) for term in contain_terms):
        return False
    return True

In [None]:
def handle_blanket_phra_case(name: str):
    """Strip monk titles from ``name`` and tidy the spacing around parentheses.

    Every listed title is deleted wherever it occurs; afterwards each "(" ends up
    preceded by a space and each ")" followed by one, one pass collapses double
    spaces, and the result is trimmed.
    """
    # Longer titles are listed ahead of the shorter titles they begin with.
    for title in (
        'พระเทพปริยัติมุนี',
        "พระเทพ",
        'พระอธิการ',
        "พระปลัด",
        "พระครูใบฎีกา",
        "พระครูปลัด",
        'พระครูสมุห์',
        "พระครู",
        "พระมหา",
        "พระ",
        "Phrakrubha",
        "Phrakhru",
        "Phrakru",
        "Phramaha",
        "Phra",
    ):
        name = name.replace(title, "")

    # Applied strictly in this order; each step sees the previous step's output.
    for old, new in (
        ("( ", " ("),
        ("(", " ("),
        (" )", ") "),
        (")", ") "),
        ("  ", " "),
    ):
        name = name.replace(old, new)

    return name.strip()

def handle_blanket_general_case(name: str):
    """Untangle names written as "part (translation) part (translation)".

    The text is cut at every parenthesis. With an even number of non-empty pieces
    the pieces at even positions form one name and those at odd positions the
    other, returned as "first,second"; otherwise all pieces are joined by spaces.
    """
    pieces = [
        piece.strip()
        for piece in name.replace("(", ";").replace(")", ";").split(";")
        if piece.strip() != ""
    ]
    if len(pieces) % 2 != 0:
        return " ".join(pieces)
    return " ".join(pieces[0::2]) + "," + " ".join(pieces[1::2])

def handle_blanket(name: str):
    """Dispatch a name to the monk-title or the generic parenthesis handler.

    Names starting with "พระ" or "Phra" take the monk path; other names that
    contain "(" take the generic path; anything else is returned untouched.
    """
    if any(is_starts_with_word(name, prefix) for prefix in ("พระ", "Phra")):
        return handle_blanket_phra_case(name)
    if is_contain_word(name, "("):
        return handle_blanket_general_case(name)
    return name

def handle_th_and(name:str):
    """Turn the Thai conjunction " และ" into a comma separator.

    The replacement only happens when some space follows the first " และ";
    otherwise ``name`` is returned unchanged.
    """
    conjunction = " และ"
    position = name.find(conjunction)
    if position == -1:
        return name
    # Search starts one character in, i.e. just past the conjunction's own space.
    if " " not in name[position + 1:]:
        return name
    return name.replace(conjunction, ",")

def handle_th_en_name(name: str):
    """Insert a comma where a Thai name and a Latin-script name meet.

    The first space whose neighbours are one Thai and one Latin character (in
    either order) is replaced by ",". If no such space exists the name is
    returned unchanged.
    """
    def script_of(char: str) -> str:
        # First word of the Unicode name ("THAI", "LATIN", "SPACE", ...). Unnamed
        # code points yield "" instead of raising ValueError, so one odd
        # character no longer aborts the whole scan.
        return unicodedata.name(char, "").split(" ")[0]

    for i in range(1, len(name) - 1):
        if script_of(name[i]) != "SPACE":
            continue
        if {script_of(name[i - 1]), script_of(name[i + 1])} == {"THAI", "LATIN"}:
            return name[:i] + "," + name[i + 1 :]
    return name

In [None]:
def captitalize_first_letter(name:str):
    """Apply ``str.capitalize`` to each space-separated part of ``name``.

    Parts whose second character is "." (initials such as "P.H.") are kept as
    written.
    """
    def looks_like_initial(part: str) -> bool:
        return len(part) >= 2 and part[1] == "."

    return " ".join(
        part if looks_like_initial(part) else part.capitalize()
        for part in name.split(" ")
    )
def get_names(col: str):
    """Extract the list of individual names from one raw author / co-author field.

    Pipeline: blank out restricted phrases and titles, split on commas, drop
    organisation-like entries, resolve parentheses, split on the Thai "และ",
    split mixed Thai/Latin names, and finally capitalise each name part.

    Returns:
        A list of trimmed, non-empty names. An input that is filtered away
        completely yields ``[]`` (previously ``[""]``, which occupied a
        submission slot with an empty name).
    """
    # remove restricted terms
    col = remove_restrict(col)

    # seperate col to names
    names = seperate_names(col)
    names = [name.replace("  ", " ").strip("(") for name in names]

    # general rules filter
    names = [name for name in names if filter_with_general_rules(name)]

    # Each handler may return "a,b"; re-joining and re-splitting flattens those.
    # handle blanket
    names = [handle_blanket(name) for name in names]
    names = ",".join(names).split(",")

    # handle "และ"
    names = [handle_th_and(name) for name in names]
    names = ",".join(names).split(",")

    # handle name with both en and th
    names = [handle_th_en_name(name) for name in names]
    names = ",".join(names).split(",")

    # Uppercase first letter; trim first so padding left behind by the splits
    # above does not survive into the output, then drop names that became empty.
    names = [captitalize_first_letter(name.strip()) for name in names]
    return [name for name in names if name]

In [None]:
test_col = "ประกายทิพย์  (Prakaithip) พิชัย (Pichai), ม.ร.ว.สมพร (Somporn, M.R.) สุทัศนีย์ (Sudhasani), เสรี (Seree) ชัดแช้ม(Chadcham)"
get_names(test_col)

In [None]:
# Run the extractor over every row (author first, then co-authors) and count the names.
all_names = []

for row in df.itertuples():
    all_names.extend(get_names(row.aut))
    all_names.extend(get_names(row.coaut))

print(len(all_names))

In [None]:
# all_names

## submission

In [None]:
def export_submission(df: pd.DataFrame):
    """Write ``predict.csv`` with ten "<id>_<k>" rows (k = 1..10) per input row.

    For each row the names extracted from ``aut`` and then ``coaut`` are
    de-duplicated keeping first occurrences, truncated to ten and padded with
    empty strings.
    """
    ids = []
    predicted = []
    for row in df.itertuples():
        ids += [f"{row.id}_{slot}" for slot in range(1, 11)]

        names = []
        for field in (row.aut, row.coaut):
            if field != "":
                names += get_names(field)

        if not names:
            predicted += [""] * 10
            continue

        # np.unique reports the first index of each distinct value; walking those
        # indexes in ascending order removes duplicates without reordering.
        first_positions = np.unique(names, return_index=True)[1]
        names = [names[position] for position in sorted(first_positions)][:10]
        names += [""] * (10 - len(names))
        predicted += names

    submission_df = pd.DataFrame({"id": ids, "name": predicted})
    submission_df.to_csv('predict.csv', index=False)

In [None]:
export_submission(df)
print('submit successfully')

----------------------------------------------------

# code aomsin

In [None]:
import pandas as pd
import regex as re
import numpy as np

In [None]:
# Regex alternatives for titles stripped from the start of a name. They are joined
# with '|' into one pattern, so literal dots are escaped; raw strings keep the
# backslashes without triggering invalid-escape warnings.
# NOTE(review): 'อ.ดร.' and 'Prof.' are left with unescaped dots exactly as written
# (each dot matches any character) — confirm whether that looseness is intended.
prefixes = [
        # Thai titles
        r'อ\.ดร\.',r'ม\.ร\.ว\.','ผู้ช่วยศาสตราจารย์',
        'เด็กชาย','นาย','เด็กหญิง','นางสาว','นาง',
        'ว่าที่ร้อยตรี','ว่าที่',r'รศ\.',r'ดร\.',r'ผศ\.',r'ศ\.',
        'รองศาสตราจารย์','อาจารย์','ทันตแพทย์หญิง',
        # The comma after 'อ.ดร.' was missing, which fused it with 'Ms\.' below
        # into a single alternative that matched neither title.
        r'อ\.',r'ร\.ต\.',r'พ\.ต\.','อ.ดร.',
        # English titles
        r'Ms\.',r'Ph\.D',r'M\.R\.',r'Mr\.',r'Mrs\.',r'Dr\.',r'Ven\.',
        r'Prof\.',r'Assoc\.',r'Asst\.',r'Assist\.',
        r'Lect\.',r'Mohd\.','Prof.',r'M\.Q\.',r'N\.J\.'
]
# Literal phrases deleted from a field before names are extracted (plain
# substring replacement, applied in list order).
res_ls = ['เกี่ยวกับวารสาร ABOUT THE JOURNAL','คำแนะนำสำหรับผู้เขียน วารสารวิทยาการจัดการ','TSME Thailand','สำนักส่งเสริมวิชาการและงานทะเบียน มรภ.วไลยอลงกรณ์ ในพระบรมราชูปถัมภ์',
          'บทบรรณาธิการ (Editorial)','Dhammathas Academic Journal','Rajapark Journal','Cover Vol.12 (2)','บทบรรณาธิการ วารสารวิชาการและวิจัยสังคมศาสตร์',
          'บรรณาธิการ วารสารวิทยาการจัดการ','กองบรรณาธิการ วารสารสถาบันวัฒนธรรมและศิลปะ','(ผู้แต่ง)','พัฒนาตำราและวารสาร วิทยาลัยดุสิตธานี',
          # The comma after 'กสทช.' was missing, which fused it with the journal
          # title on the next line.
          'สารบัญ วารสารวิทยาการจัดการ', 'MA','คำแนะนำสำหรับผู้เขียน วิทยาการจัดการ','\uf70e','*','. .','กสทช.',
          'Journal of Thai Traditional Alternative Medicine','ผู้ทรงคุณวุฒิ -', '', '--','เจ้าหน้าที่สาธารณสุขโรงพยาบาลส่งเสริมสุข',
          'และคณะ','กองบรรณาธิการ วารสาร','dusayu@kku.ac.th','คณะมนุษยศาสตร์ มหาวิทยาลัยรามคำแหง','ศูนย์บริการโลหิตแห่งชาติ สภากาชาดไทย',
          'รายละเอียดบทความ วารสารวิทยาการจัดการ','subcommittee on the preparation of monofraphs of selected thai materia medica',
          'subcommittee on the preparation of Monographs of Selected Thai Materia Medica',
          'บทบรรณาธิการ วารสารวิชาการและวิจัยสังคม','- -','บทบรรณาธิการ','วารสาร', 'แนะนำผู้เขียน -', 'Mancoh Posung, Et Al', 'บรรณาธิการ -',
        'กองบรรณาธิการ กองบรรณาธิการ', 'กองบรรณาธิการ วารสารรัฏฐาภิรักษ์', 'กองบรรณาธิการ ผู้เชี่ยวชาญ', 'กองบรรณาธิการ วารสารสถาบันวัฒนธรรมและศิลปะ',
          'บรรณาธิการ วารสารมหาวิทยาลัยราชภัฏสกลนคร', 'Editorial team','วิทยาลัยดุสิตธานี Dusit Thani College']
# Monk titles removed from the start of a name; longer titles come before the
# shorter ones they begin with.
monk_ls = ['พระเทพปริยัติมุนี','พระครูสมุห์','พระครู', 'พระปลัด','พระอธิการ','พระมหา', 'Phrakrubha','Phrakhru','Phrakru','Phramaha'
            ,'Phra','พระ']

In [None]:
def replace_resword(word):
    """Delete restricted phrases, a leading "...:" label and digits from ``word``."""
    # Strip every phrase listed in res_ls (literal substring removal).
    for junk in res_ls:
        word = word.replace(junk, '')
    # Remove the first "<anything>:" span, if there is one.
    labelled = re.findall(".+:", word)
    if len(labelled) != 0:
        word = word.replace(labelled[0], '')
    # Remove ASCII digit runs.
    return re.sub('[0-9]+', '', word)
def check_monk(word):
    """Strip a leading monk title; return ``(stripped_word, title_was_removed)``."""
    title_pattern = '^(%s)' % '|'.join(monk_ls)
    stripped = re.sub(title_pattern, '', word)
    return stripped, stripped != word
def replace_prefix(word):
    """Lower-case ``word`` and peel leading titles off it until nothing changes.

    Each pass lower-cases and trims the text, then removes one leading match of
    the lower-cased ``prefixes`` alternation; the loop stops as soon as a pass
    yields the same text as the pass before it (or the empty string on the
    first pass).
    """
    title_pattern = '^(%s)' % '|'.join(prefixes).lower()
    previous = ''
    while True:
        word = re.sub(title_pattern, '', word.lower().strip())
        if word == previous:
            return word
        previous = word
def get_names(txt):
    """Extract a list of cleaned names from a comma-separated co-author field.

    Steps, in order:
      1. ``replace_resword`` removes restricted phrases, then the text is split
         on ',' and each piece is trimmed.
      2. Each piece is cut into chunks by the regex below; on the first chunk a
         monk title is stripped (``check_monk``), other chunks go through
         ``replace_prefix``.
      3. Chunks of a piece are grouped into monk / Thai / non-Thai lists and each
         non-empty group is joined with spaces into one name.
      4. Monk names are re-capitalised, tokens containing '.' are upper-cased,
         " -" and a trailing " ." are removed, and results of length <= 1 are
         dropped.

    NOTE(review): this redefines ``get_names`` from the earlier section of the
    notebook; whichever cell ran last wins.
    """
    ls_monk = []
    ls = []
    ans = []
    res = replace_resword(txt).split(',')
    res = [r.strip() for r in res]
    # split into Thai / English runs (original author's note: "not sure what else")
    for r in res:
        # NOTE(review): findall returns a list, so comparing it with '' is always
        # True and this guard never skips anything.
        if re.findall("[\ก-\๙|.]*|[^\ก-\๙]*",r.lower()) != '':
            mat = re.findall("[\ก-\๙|.]*|[^\ก-\๙]*",r)
            checkinside = re.findall("\([\ก-\๙|.|^ก-\๙]+\)",r.lower()) # parenthesised group, e.g. (ก-ฮ ๑-๙ .)
            mat = [var for var in mat if (var != '' and var != ' ')]
            if len(checkinside) != 0:
                # Any chunk that is a substring of the first parenthesised group is
                # replaced by that whole group (parentheses included).
                for var in range(len(mat)):
                    if mat[var] in checkinside[0].strip() and mat[var] != ''and mat[var] != ')'and mat[var] != '(':
                        mat[var] = checkinside[0].strip()
            ls2 = []
            ls2_monk = []
            state = False
            for var in mat:
                if var != ' ' and var != '':
                    monk = False
                    if mat[0] == var:  # monk title (พระ...) is only looked for on chunks equal to the first one
                        var, monk = check_monk(var)
                    if (monk or state)and var.strip() != '(' and var.strip() !=')':
                        # if monk = True then state = True (forever until end loop)
                        # else then state = False (forever until end loop)
                        state = True
                    else:
                        var = replace_prefix(var.replace('(','').replace(')',''))
                    # NOTE(review): the flag is appended before the `continue`
                    # below, so ls2_monk can become longer than ls2 and the two
                    # lists may fall out of step — confirm.
                    ls2_monk.append(state)
                    if len(re.findall(".+:",var.lower()))!=0:
                        # if have ...: jump to next loop
                        continue
                    elif state:
                        ls2.append(var.strip())
                    else:
                        ls2.append(var.strip().replace('(','').replace(')',''))
            ls.append(ls2)
            ls_monk.append(ls2_monk)
#     print(ls, ls_monk)
    # Regroup each piece's chunks as [monk, thai, eng] (or [monk, eng, thai] when
    # the first chunk was non-Thai) and reduce its flags to a single boolean.
    for var in range(len(ls)):
        thai = []
        eng = []
        phra = []
        state = False
        rev_order = False
        for var2 in range(len(ls[var])):
            if ls_monk[var][var2]: # [True, True, ...], [False, False, ...]
                # If got phra keep it in list
                phra.append(ls[var][var2])
                state = True
            elif re.search("[ก-๙]+",ls[var][var2].lower()) != None:
                # any Thai letter/digit in the chunk is enough to take this branch
                thai.append(ls[var][var2])
            else:
                if var2 == 0:
                    rev_order = True
                eng.append(ls[var][var2])
        if rev_order:
            ls[var] = [phra,eng,thai]
        else:
            ls[var] = [phra,thai,eng]
        ls_monk[var] = state
#     print(ls, ls_monk)
    # Flatten: every non-empty group becomes one space-joined name.
    ls3_monk=[]
    for r in range(len(ls)):
        for lang in ls[r]:  # [phra,thai,eng]
            if len(lang) != 0:
                ans.append(' '.join(lang).strip())
                ls3_monk.append(ls_monk[r])
#     print(ans, ls3_monk)
    for var in range(len(ans)):
        # NOTE(review): re.search returns a match object or None, never '', so
        # this condition is always True.
        if re.search('[^ก-\๙]+', ans[var]) != '':
            # (original intent) any non-Thai digit/letter is enough to take this branch
            word = ans[var].split(' ')
            # NOTE(review): only monk names are re-capitalised here, whereas
            # get_namesauthor capitalises every name — confirm which is intended.
            if ls3_monk[var]:
                bound = word[-1]
                word = [var.capitalize() for var in word if var != '']
                word[-1] = bound.strip()
            if ' '.join(word) != ' ' and ' '.join(word) != '':
                ans[var] = ' '.join(word)
    ans = list(filter(lambda x: x!='-',ans))
#     print(ans)
    # Upper-case every space-separated token that contains a '.'.
    for var in range(len(ans)):
        for var2 in ans[var].split(' '):
            word = re.findall('.*\.',var2)
            if len(word) != 0:
                ans[var] = ans[var].replace(var2,var2.upper())
#     print(ans)
    # Final tidy-up: drop " -" and a trailing " .", discard results of length <= 1,
    # and remove one trailing comma.
    ls3 = []
    for var in range(len(ans)):
        ans[var] = re.sub('[ ]+-','',ans[var])
        ans[var] = re.sub(' [\.]$','',ans[var])
        if len(ans[var])>1:
            if ans[var][-1] == ',':
                ans[var] = ans[var][:-1]
            ls3.append(ans[var].strip())
#     print(ls3)
    return ls3
get_names('พระอดุลย์ กุสลจิตฺโต (ภักดีกุล), Kittikun Sangnin (กิตติคุณ แสงนิล), TIEN-THUY P.H.')

In [None]:
def get_namesauthor(txt):
    """Extract a list of cleaned names from a ';'-separated author field.

    Same pipeline as ``get_names`` in the cell above, with two visible
    differences: the text is split on ';' instead of ',', and every name (not
    only monk names) has its words capitalised.
    """
    ls_monk = []
    ans = []
    res = replace_resword(txt).split(';')
    res = [r.strip() for r in res]
    #print(len(res))
    ls = []
    # split into Thai / English runs
    for r in res:
        #print(r)
        # NOTE(review): findall returns a list, so comparing it with '' is always
        # True and this guard never skips anything.
        if re.findall(r"[\ก-\๙|.]*|[^\ก-\๙]*",r.lower()) != '':
            mat = re.findall(r"[\ก-\๙|.]*|[^\ก-\๙]*",r)
            checkinside = re.findall(r"\([\ก-\๙|.|^ก-\๙]+\)",r.lower())
#             print(mat)
            mat = [var for var in mat if (var != '' and var != ' ')]
#             print(mat)
            if len(checkinside) != 0:
                # Any chunk that is a substring of the first parenthesised group is
                # replaced by that whole group (parentheses included).
                for var in range(len(mat)):
                    if mat[var] in checkinside[0].strip() and mat[var] != ''and mat[var] != ')'and mat[var] != '(':
                        # stat_inside is assigned but never read in this function.
                        stat_inside = True
                        mat[var] = checkinside[0].strip()
            ls2 = []
            ls2_monk = []
            state = False
            #print(mat)
            for var in mat:
#                 print(var)
                if var != ' ' and var != '':
                    monk = False
                    # A monk title is only looked for on chunks equal to the first one.
                    if mat[0] == var:
                        var,monk = check_monk(var)
#                     print(var)
                    # Once a monk title was found, every later chunk of this piece
                    # (other than a lone parenthesis) is kept as part of the monk name.
                    if (monk or state)and var.strip() != '(' and var.strip() !=')':
#                         print(var)
                        state = True
                    else:
                        var = replace_prefix(var.replace('(','').replace(')',''))
#                         print(var)
                    # NOTE(review): the flag is appended before the `continue`
                    # below, so ls2_monk can become longer than ls2 — confirm.
                    ls2_monk.append(state)
                    #print(var)
                    if len(re.findall(".+:",var.lower()))!=0:
                        #print(var)
                        continue
                    elif state:
#                         print(var.strip())
                        ls2.append(var.strip())
                    else:
                        ls2.append(var.strip().replace('(','').replace(')',''))
#                 print('--------')
            ls.append(ls2)
            ls_monk.append(ls2_monk)
#     print(ls)
#     print(ls_monk)
    # Regroup each piece's chunks as [monk, thai, eng] (or [monk, eng, thai] when
    # the first chunk was non-Thai) and reduce its flags to a single boolean.
    for var in range(len(ls)):
        thai = []
        eng = []
        phra = []
        # m is filled below but never read in this function.
        m=[]
        state = False
        rev_order = False
        for var2 in range(len(ls[var])):
            if ls_monk[var][var2]:
                phra.append(ls[var][var2])
                state = True
            elif re.search(r"[ก-๙]+",ls[var][var2].lower()) != None:
                thai.append(ls[var][var2])
            else:
                if var2 == 0:
                    rev_order = True
                eng.append(ls[var][var2])
            m.append(state)
        if rev_order:
            ls[var] = [phra,eng,thai]
        else:
            ls[var] = [phra,thai,eng]
        ls_monk[var] = state
#     print(ls)
#     print(ls_monk)
    # Flatten: every non-empty group becomes one space-joined name.
    ls_monk2=[]
    for r in range(len(ls)):
        for lang in ls[r]:
            if len(lang) != 0:
                ans.append(' '.join(lang).strip())
                ls_monk2.append(ls_monk[r])
#     print(ans)
#     print(ls_monk2)
    for var in range(len(ans)):
        # NOTE(review): re.search returns a match object or None, never '', so
        # this condition is always True.
        if re.search('[^ก-\๙]+', ans[var]) != '':
            word = ans[var].split(' ')
#             print(word)
            # For monk names the last word is restored after capitalisation.
            if ls_monk2[var]:
                bound = word[-1]
            word = [var.capitalize() for var in word if var != '']
            if ls_monk2[var]:
                word[-1] = bound.strip()
#             print(word)
            if ' '.join(word) != ' ' and ' '.join(word) != '':
                ans[var] = ' '.join(word)
#                 print(ans[var])
    ans = list(filter(lambda x: x!='-',ans))
    # Upper-case every space-separated token that contains a '.'.
    for var in range(len(ans)):
        for var2 in ans[var].split(' '):
            word = re.findall('.*\.',var2)
            if len(word) != 0:
                ans[var] = ans[var].replace(var2,var2.upper())
    # Final tidy-up: drop " -" and a trailing " .", discard results of length <= 1,
    # and remove one trailing comma.
    ls3 = []
    for var in range(len(ans)):
        ans[var] = re.sub('[ ]+-','',ans[var])
        ans[var] = re.sub(' [\.]$','',ans[var])

        if len(ans[var])>1:
            if ans[var][-1] == ',':
                ans[var] = ans[var][:-1]
            ls3.append(ans[var].strip())
    return ls3

In [None]:
ndf = pd.read_csv('/kaggle/input/thaijo-researcher/test.csv')
ndf = ndf.fillna('')
ndf['ans'] = ndf['_source.co-author'].apply(get_names)
ndf['c_author'] = ndf['_source.author'].apply(get_namesauthor)

In [None]:
# Record the row number once for every extracted author name that is missing from
# that row's extracted co-author list (a row can therefore appear several times).
ls = []
for var, (author_names, coauthor_names) in enumerate(zip(ndf['c_author'], ndf['ans'])):
    if len(author_names) == 0 or type(author_names) != list:
        continue
    ls.extend(var for name in author_names if name not in coauthor_names)

In [None]:
# For every flagged row, copy the value at column position 5 into column position 4,
# then display the flagged rows.
# NOTE(review): 4 and 5 are positional indexes — presumably the 'ans' and
# 'c_author' columns added above; that only holds if test.csv has exactly four
# columns. Confirm, or address the columns by name.
for var in ls:
    print(var)
    ndf.iloc[var,4] = ndf.iloc[var,5]
ndf.iloc[ls]

In [None]:
# NOTE(review): `char2` is never assigned in the visible part of this notebook, so
# on a fresh kernel this cell raises NameError — confirm which DataFrame is meant
# to be exported as predict.csv.
char2.to_csv('predict.csv', index=False)

-------------------

# compare

In [None]:
import pandas as pd
now = pd.read_csv('/kaggle/working/predict.csv')
b90052 = pd.read_csv('/kaggle/input/b90052/predict (2).csv')

In [None]:
compare = pd.concat([now['id'], now['name'], b90052['name']], axis=1)
compare = compare.fillna('')
compare.columns=['id','now','b90052']
compare[compare['now'] != compare['b90052']]

In [None]:
# Rows where the 'start' predictions differ from the b90052 predictions, saved to CSV.
# NOTE(review): `compare` is given exactly the columns id / now / b90052 in the
# cell above, so compare['start'] raises KeyError as written — presumably a
# 'start' prediction column has to be loaded and added first; confirm.
startup = compare[compare['start'] != compare['b90052']]
startup.to_csv('start_90052.csv', index=False)