In [2]:
import pandas as pd
import regex as re
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from flair.models import SequenceTagger
from flair.data import Sentence

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the cleaned dataset and keep only the two columns used below.
# The explicit .copy() makes df_flair an independent frame, so the new
# columns assigned to it in later cells do not operate on a slice of the
# frame returned by read_csv (avoids pandas' SettingWithCopyWarning).
df_flair = pd.read_csv('data/clean/clean_dataset.csv')[['name', 'name_length']].copy()

# Separate copy of the same columns (presumably for the STANZA section).
df_stanza = df_flair.copy()

# Preview: only the last expression of a cell is rendered, so it goes last.
df_flair.head()

In [4]:
# Load the five Flair SequenceTagger checkpoints that are compared below.
# All paths are relative to the notebook's working directory.
# Per the load log, every model predicts the same 19-entry tag dictionary
# (the universal POS tags plus <START>/<STOP>).
pos_custom_id = SequenceTagger.load('resources/taggers/stacked-upos/best-model.pt')
pos_custom_multi = SequenceTagger.load('resources/taggers/stacked-upos-en/best-model.pt')
pos_multiCorpus = SequenceTagger.load('resources/taggers/multiCorpus-upos/best-model.pt')
pos_bert_id = SequenceTagger.load('resources/taggers/bert-id-upos/best-model.pt')
pos_bert_multi = SequenceTagger.load('resources/taggers/bert-multi-upos/best-model.pt')

2024-12-27 17:45:17,482 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:22,433 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:27,190 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:30,965 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:34,871 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>


In [50]:
# extract nouns from text
def extract_noun_custom(text, tag_pos):
    """Reduce a text to its noun-like tokens using a POS tagger.

    The text is lower-cased, tagged with ``tag_pos`` and filtered in tiers:

    1. keep the tokens tagged ``NOUN`` or ``PROPN``;
    2. if that leaves nothing, keep every token not tagged ``PUNCT``/``NUM``;
    3. if that still leaves nothing, return the original ``text`` unchanged.

    Args:
        text: The raw string to process.
        tag_pos: Tagger exposing ``predict(sentence)`` that attaches a
            ``upos`` label to every token (a Flair ``SequenceTagger`` here).

    Returns:
        The kept tokens joined by single spaces; the original ``text`` when
        no token survives the filters; ``''`` when ``text`` is not a string
        or when tagging raises.
    """
    # Non-string cells (None / NaN) used to end up here only by accident,
    # through the AttributeError raised by ``text.lower()``. Handle them
    # explicitly; the return value ('') is unchanged.
    if not isinstance(text, str):
        print(f"Error processing text: {text}. Exception: expected str, got {type(text).__name__}")
        return ''

    try:
        sentence = Sentence(text.lower())
        tag_pos.predict(sentence)
        # Read each label once so both filter tiers work on the same data.
        tagged = [(token.text, token.get_label('upos').value) for token in sentence]
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole .apply().
        print(f"Error processing text: {text}. Exception: {e}")
        return ''

    kept = [word for word, tag in tagged if tag in ('NOUN', 'PROPN')]
    if not kept:
        kept = [word for word, tag in tagged if tag not in ('PUNCT', 'NUM')]
    if not kept:
        return text

    return ' '.join(kept)

In [51]:
# Run every tagger over the 'name' column; each tagger fills its own column.
noun_column_taggers = {
    'custom_id_noun': pos_custom_id,
    'custom_multi_noun': pos_custom_multi,
    'multiCorpus_noun': pos_multiCorpus,
    'bert_id_noun': pos_bert_id,
    'bert_multi_noun': pos_bert_multi,
}

for noun_column, tagger in noun_column_taggers.items():
    # Bind the tagger as a default argument so each lambda keeps its own model.
    df_flair[noun_column] = df_flair['name'].apply(
        lambda text, tagger=tagger: extract_noun_custom(text, tagger)
    )

# FLAIR - BERT - ID V1

In [70]:
# Marked "BEST" in the original notes: continue with the multiCorpus tagger output.
df_flair_bert_idv1 = df_flair.loc[:, ['name', 'multiCorpus_noun', 'name_length']].copy()

# name_length is overwritten here: from now on it is the character length
# of the extracted noun string, not of the original name.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun'].apply(len)

# Preview of rows whose extracted noun string is empty.
df_flair_bert_idv1.loc[
    df_flair_bert_idv1['name_length'] == 0,
    ['name', 'multiCorpus_noun', 'name_length'],
].head()

Unnamed: 0,name,multiCorpus_noun,name_length


In [91]:
# Flat list of every whitespace-separated token in the extracted noun strings.
all_words = [
    token
    for noun_text in df_flair_bert_idv1['multiCorpus_noun']
    for token in noun_text.split()
]
word_counts = Counter(all_words)

# NOTE: despite its name, words_below_10 holds the tokens that occur
# fewer than 40 times in the corpus.
words_below_10 = [token for token in word_counts if word_counts[token] < 40]

In [92]:
def remove_words(text, words_to_remove):
    """Return ``text`` without the tokens contained in ``words_to_remove``.

    ``text`` is split on whitespace, every token found in ``words_to_remove``
    (exact, case-sensitive match) is dropped, and the remaining tokens are
    joined back with single spaces.
    """
    kept = []
    for token in text.split():
        if token in words_to_remove:
            continue
        kept.append(token)
    return ' '.join(kept)

In [93]:
# Build one filtered column per rarity cut-off.
# Later cells read multiCorpus_noun_rm10/_rm20/_rm30 in addition to _rm40,
# but no cell in the notebook creates them, so a fresh "Restart & Run All"
# failed with a KeyError. They are generated here in one pass.
# NOTE(review): the 10/20/30 cut-offs are inferred from the column names
# (rm40 <-> count < 40) - confirm they match the earlier manual runs.
for min_count in (10, 20, 30, 40):
    # A set keeps the per-token membership test O(1).
    rare_words = {word for word, count in word_counts.items() if count < min_count}
    df_flair_bert_idv1[f'multiCorpus_noun_rm{min_count}'] = df_flair_bert_idv1['multiCorpus_noun'].apply(
        lambda x, rare_words=rare_words: remove_words(x, rare_words)
    )

In [96]:
# Refresh name_length so it measures the rm40-filtered text.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].apply(len)

# Show the rows whose rm40 text ended up empty, next to the other variants.
df_flair_bert_idv1.loc[
    df_flair_bert_idv1['name_length'] == 0,
    [
        'name',
        'multiCorpus_noun',
        'multiCorpus_noun_rm10',
        'multiCorpus_noun_rm20',
        'multiCorpus_noun_rm30',
        'multiCorpus_noun_rm40',
        'name_length',
    ],
]

Unnamed: 0,name,multiCorpus_noun,multiCorpus_noun_rm10,multiCorpus_noun_rm20,multiCorpus_noun_rm30,multiCorpus_noun_rm40,name_length


In [95]:
# Rows whose rm40 text is empty fall back to the rm10 text.
# The mask is computed from the rm40 column itself, so this cell no longer
# depends on name_length having been refreshed by a different cell first.
rm40_is_empty = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len() == 0
df_flair_bert_idv1.loc[rm40_is_empty, 'multiCorpus_noun_rm40'] = (
    df_flair_bert_idv1.loc[rm40_is_empty, 'multiCorpus_noun_rm10']
)

# Keep name_length in sync with the values just written; otherwise the
# filled rows would still carry a length of 0 into the CSV export.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len()

In [98]:
# Write the current state of the frame to CSV; index=False leaves the
# row index out of the file.
df_flair_bert_idv1.to_csv('data/clean/clean_dataset_posBERTV1.csv', index=False)

In [None]:
# Recount the tokens of the extracted noun strings.
all_words = [
    token
    for noun_text in df_flair_bert_idv1['multiCorpus_noun']
    for token in noun_text.split()
]
word_counts = Counter(all_words)

# Print the 1000 most frequent tokens with their counts, one per line.
top_words = word_counts.most_common(1000)
for word, count in top_words:
    print(f"{word}: {count}")

In [100]:
# Hand-written list of lower-case tokens to strip from the noun columns:
# mostly units, sizes, single letters and marketing words.
# remove_specific_words() lower-cases each token before comparing it
# against this list, so the entries must stay lower-case to match.
remove_word = ['pcs', 'premium', 'inch', 'cm', 'ml', 'x', 'new', 'ukuran', 'kg', 'import', 
               'gr', 'size', 'meter', 'liter', 'gram', 'l', 'ori', 'indonesia', 'korea', 
               'm', 's', 'mm', 'the', 'in', 'watt', 'korean', 'c', 'edition', 'a', '100ml', 
               'xl', 'b', 'japan', 'kualitas', 'g', 'kekinian', 'v', '3in1', 'termurah', 
               'bpom', 'w', 'dll', 'r', 'h', 'gb', 't', 'k', '8gb']

def remove_specific_words(text, remove_word):
    """Return ``text`` without the tokens whose lower-cased form is in ``remove_word``.

    ``text`` is split on whitespace; a token is dropped when ``token.lower()``
    is contained in ``remove_word``. Surviving tokens keep their original
    casing and are joined back with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in remove_word:
            continue
        kept.append(token)
    return ' '.join(kept)

# Strip the hand-picked stop words from every noun column, in place.
for noun_column in (
    'multiCorpus_noun',
    'multiCorpus_noun_rm10',
    'multiCorpus_noun_rm20',
    'multiCorpus_noun_rm30',
    'multiCorpus_noun_rm40',
):
    df_flair_bert_idv1[noun_column] = df_flair_bert_idv1[noun_column].apply(
        lambda x: remove_specific_words(x, remove_word)
    )

In [114]:
# Recompute name_length from the rm40 column after the stop-word removal.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].apply(len)

# Show the rows whose rm40 text is now empty, with all variants side by side.
df_flair_bert_idv1.loc[
    df_flair_bert_idv1['name_length'] == 0,
    [
        'name',
        'multiCorpus_noun',
        'multiCorpus_noun_rm10',
        'multiCorpus_noun_rm20',
        'multiCorpus_noun_rm30',
        'multiCorpus_noun_rm40',
        'name_length',
    ],
]

Unnamed: 0,name,multiCorpus_noun,multiCorpus_noun_rm10,multiCorpus_noun_rm20,multiCorpus_noun_rm30,multiCorpus_noun_rm40,name_length


In [113]:
# Rows whose rm40 text is empty fall back to the rm10 text.
# The mask is computed from the rm40 column itself, so this cell no longer
# depends on name_length having been refreshed by a different cell first.
rm40_is_empty = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len() == 0
df_flair_bert_idv1.loc[rm40_is_empty, 'multiCorpus_noun_rm40'] = (
    df_flair_bert_idv1.loc[rm40_is_empty, 'multiCorpus_noun_rm10']
)

# Keep name_length in sync with the values just written; otherwise the
# filled rows would still carry a length of 0 into the CSV export.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len()

In [115]:
# Write the frame, as it stands after the stop-word removal cells, to a
# second CSV; index=False leaves the row index out of the file.
df_flair_bert_idv1.to_csv('data/clean/clean_dataset_posBERTV1-1.csv', index=False)

In [117]:
# cleaned_text = df_flair_bert_idv1['name'].iloc[52].lower()
# sentence = Sentence(cleaned_text)
# pos_multiCorpus.predict(sentence)
# for token in sentence:
#     print(f"{token.text} -> {token.get_label('upos')}")

# STANZA