In [1]:
import pandas as pd
import regex as re
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from flair.models import SequenceTagger
from flair.data import Sentence

import stanza
# Fetch the default Stanza model packages for Indonesian and English.
stanza.download('id')
stanza.download('en')
# One pipeline per language, restricted to tokenization + POS tagging; use_gpu=True requests GPU execution.
nlp = stanza.Pipeline(lang='id', processors='tokenize,pos', use_gpu=True)
nlp_en = stanza.Pipeline(lang='en', processors='tokenize,pos', use_gpu=True)

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 11.8MB/s]                    
2024-12-28 16:50:14 INFO: Downloading default packages for language: id (Indonesian) ...
2024-12-28 16:50:15 INFO: File exists: C:\Users\User\stanza_resources\id\default.zip
2024-12-28 16:50:17 INFO: Finished downloading models and saved to C:\Users\User\stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 5.27MB/s]                    
2024-12-28 16:50:18 INFO: Downloading default packages for language: en (English) ...
2024-12-28 16:50:19 INFO: File exists: C:\Users\User\stanza_resources\en\default.zip
2024-12-28 16:50:23 INFO: Finished downloading models and saved to C:\Users\User\stanza_resources.
2024-12-28 16:50:23 INFO: Checking for updates to resources.json in case models have been updated.  Note: this beha

In [2]:
# Load the cleaned dataset once and derive one working frame per tagger family.
# .copy() makes each frame an independent object, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
df_flair = pd.read_csv('data/clean/clean_dataset.csv')
df_stanza = df_flair[['name', 'clean_name', 'name_length']].copy()
df_flair = df_flair[['name', 'name_length']].copy()
df_flair.head()

Unnamed: 0,name,name_length
0,Gamis Pria Dewasa Premium / Jubah Pakistan Polos,38
1,Buket Bunga Mawar Flanel / Bunga wisuda / Bung...,11
2,Mika Sen Depan Supra Fit New Kaca Lampu Sein D...,10
3,Blazer wanita jumbo big size stik balik / plus...,24
4,Buku Ilmu Sosial Budaya Dasar Perspektif Baru ...,69


In [4]:
# Load the five POS-tagger checkpoints stored under resources/taggers/ with
# Flair's SequenceTagger; each one is later passed to extract_noun_custom.
pos_custom_id = SequenceTagger.load('resources/taggers/stacked-upos/best-model.pt')
pos_custom_multi = SequenceTagger.load('resources/taggers/stacked-upos-en/best-model.pt')
pos_multiCorpus = SequenceTagger.load('resources/taggers/multiCorpus-upos/best-model.pt')
pos_bert_id = SequenceTagger.load('resources/taggers/bert-id-upos/best-model.pt')
pos_bert_multi = SequenceTagger.load('resources/taggers/bert-multi-upos/best-model.pt')

2024-12-27 17:45:17,482 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:22,433 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:27,190 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:30,965 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>
2024-12-27 17:45:34,871 SequenceTagger predicts: Dictionary with 19 tags: NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, SYM, X, INTJ, <START>, <STOP>


In [50]:
# extract nouns from text
def extract_noun_custom(text, tag_pos):
    """Keep only the noun-like tokens of ``text`` as tagged by a Flair tagger.

    The text is lower-cased, wrapped in a Flair ``Sentence`` and tagged with
    ``tag_pos``. Tokens are then selected with progressively looser filters:

    1. tokens whose ``'upos'`` label is ``NOUN`` or ``PROPN``;
    2. if that yields nothing, every token not labelled ``PUNCT`` or ``NUM``;
    3. if that still yields nothing, the whole lower-cased text.

    Args:
        text: Input string to tag.
        tag_pos: Tagger exposing ``predict(sentence)``; after the call each
            token must answer ``get_label('upos')``.

    Returns:
        The selected tokens joined by single spaces. If anything raises, the
        error is printed and ``''`` is returned.
    """
    try:
        cleaned_text = text.lower()

        sentence = Sentence(cleaned_text)
        tag_pos.predict(sentence)

        # Read each token's tag once; both filters below reuse the pairs.
        tagged = [(token.text, token.get_label('upos').value) for token in sentence]

        filtered_words = [word for word, tag in tagged if tag in ('NOUN', 'PROPN')]
        if not filtered_words:
            filtered_words = [word for word, tag in tagged if tag not in ('PUNCT', 'NUM')]
        if not filtered_words:
            # Return the lower-cased text so every return path has the same
            # casing (token counts downstream are case-sensitive).
            return cleaned_text

        return ' '.join(filtered_words)
    except Exception as e:
        print(f"Error processing text: {text}. Exception: {e}")
        return ''

In [51]:
# Run every Flair tagger over the raw names; each result lands in its own column.
for noun_column, tagger in (
    ('custom_id_noun', pos_custom_id),
    ('custom_multi_noun', pos_custom_multi),
    ('multiCorpus_noun', pos_multiCorpus),
    ('bert_id_noun', pos_bert_id),
    ('bert_multi_noun', pos_bert_multi),
):
    # Bind the tagger as a default argument so the lambda uses this iteration's model.
    df_flair[noun_column] = df_flair['name'].apply(lambda text, tagger=tagger: extract_noun_custom(text, tagger))

# FLAIR - BERT - ID V1

In [70]:
# BEST !!!
# Independent copy holding only the multiCorpus tagger output; name_length is
# recomputed as the character length of the extracted nouns.
df_flair_bert_idv1 = df_flair.loc[:, ['name', 'multiCorpus_noun', 'name_length']].copy()
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun'].map(len)

# Rows for which the extraction returned an empty string.
df_flair_bert_idv1.loc[df_flair_bert_idv1['name_length'] == 0, ['name', 'multiCorpus_noun', 'name_length']].head()

Unnamed: 0,name,multiCorpus_noun,name_length


In [91]:
# Token frequencies over every extracted noun string in the frame.
all_words = ' '.join(df_flair_bert_idv1['multiCorpus_noun']).split()
word_counts = Counter(all_words)

# NOTE(review): despite its name, this list holds the words seen fewer than 40
# times; the name is kept because the following cells reference it.
words_below_10 = [word for word, count in word_counts.items() if count < 40]

In [92]:
def remove_words(text, words_to_remove):
    """Return ``text`` without the tokens listed in ``words_to_remove``.

    Args:
        text: String that is split on whitespace.
        words_to_remove: Container of tokens to drop (exact, case-sensitive
            match). Lists and tuples are converted to a frozenset so that each
            membership test is O(1) instead of a scan of the whole container.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if isinstance(words_to_remove, (list, tuple)):
        try:
            words_to_remove = frozenset(words_to_remove)
        except TypeError:
            # Unhashable entries: keep the original container and its semantics.
            pass
    return ' '.join(word for word in text.split() if word not in words_to_remove)

In [93]:
# Build one filtered column per frequency threshold. Later cells read
# multiCorpus_noun_rm10 .. multiCorpus_noun_rm40, so all four must exist after a
# fresh Restart & Run All, not only the rm40 column.
for min_count in (10, 20, 30, 40):
    rare_words = {word for word, count in word_counts.items() if count < min_count}
    df_flair_bert_idv1[f'multiCorpus_noun_rm{min_count}'] = df_flair_bert_idv1['multiCorpus_noun'].apply(
        lambda x: remove_words(x, rare_words)
    )

In [96]:
# name_length now holds the character length of the rm40 column.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].map(len)

# Rows that the frequency filter left completely empty.
df_flair_bert_idv1.loc[
    df_flair_bert_idv1['name_length'] == 0,
    ['name', 'multiCorpus_noun', 'multiCorpus_noun_rm10', 'multiCorpus_noun_rm20', 'multiCorpus_noun_rm30', 'multiCorpus_noun_rm40', 'name_length'],
]

Unnamed: 0,name,multiCorpus_noun,multiCorpus_noun_rm10,multiCorpus_noun_rm20,multiCorpus_noun_rm30,multiCorpus_noun_rm40,name_length


In [95]:
# Rows emptied by the 40-occurrence filter fall back to the milder rm10 result.
# The mask is derived from the rm40 column itself, so the cell no longer depends
# on which cell last wrote name_length (this cell was executed out of order).
emptied_rm40 = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len() == 0
df_flair_bert_idv1.loc[emptied_rm40, 'multiCorpus_noun_rm40'] = df_flair_bert_idv1.loc[emptied_rm40, 'multiCorpus_noun_rm10']

In [98]:
# Persist the frame (all columns, no index column) for later use.
df_flair_bert_idv1.to_csv('data/clean/clean_dataset_posBERTV1.csv', index=False)

In [None]:
# Recount token frequencies over the extracted nouns and print the 1000 most
# frequent tokens with their counts.
# NOTE(review): presumably used to hand-pick the stop-word list in the next cell — confirm.
all_words = ' '.join(df_flair_bert_idv1['multiCorpus_noun']).split()
word_counts = Counter(all_words)

for word, count in word_counts.most_common(1000):
    print(f"{word}: {count}")

In [100]:
# Hand-picked tokens to strip from the noun columns; remove_specific_words
# compares them against the lower-cased form of each token.
remove_word = ['pcs', 'premium', 'inch', 'cm', 'ml', 'x', 'new', 'ukuran', 'kg', 'import', 
               'gr', 'size', 'meter', 'liter', 'gram', 'l', 'ori', 'indonesia', 'korea', 
               'm', 's', 'mm', 'the', 'in', 'watt', 'korean', 'c', 'edition', 'a', '100ml', 
               'xl', 'b', 'japan', 'kualitas', 'g', 'kekinian', 'v', '3in1', 'termurah', 
               'bpom', 'w', 'dll', 'r', 'h', 'gb', 't', 'k', '8gb']

def remove_specific_words(text, remove_word):
    """Drop each whitespace-separated token of ``text`` whose lower-cased form is in ``remove_word``.

    Surviving tokens keep their original casing and are joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in remove_word:
            continue
        kept.append(token)
    return ' '.join(kept)

# Strip the hand-picked tokens from the base noun column and from every rmNN variant.
for noun_column in (
    'multiCorpus_noun',
    'multiCorpus_noun_rm10',
    'multiCorpus_noun_rm20',
    'multiCorpus_noun_rm30',
    'multiCorpus_noun_rm40',
):
    df_flair_bert_idv1[noun_column] = df_flair_bert_idv1[noun_column].apply(lambda x: remove_specific_words(x, remove_word))

In [114]:
# Refresh name_length (character length of rm40) after the stop-word pass.
df_flair_bert_idv1['name_length'] = df_flair_bert_idv1['multiCorpus_noun_rm40'].map(len)

# Rows that are now completely empty.
df_flair_bert_idv1.loc[
    df_flair_bert_idv1['name_length'] == 0,
    ['name', 'multiCorpus_noun', 'multiCorpus_noun_rm10', 'multiCorpus_noun_rm20', 'multiCorpus_noun_rm30', 'multiCorpus_noun_rm40', 'name_length'],
]

Unnamed: 0,name,multiCorpus_noun,multiCorpus_noun_rm10,multiCorpus_noun_rm20,multiCorpus_noun_rm30,multiCorpus_noun_rm40,name_length


In [113]:
# Rows emptied by the filters fall back to the rm10 result. The mask is derived
# from the rm40 column itself, so the cell does not depend on which cell last
# wrote name_length (this cell was executed out of order).
emptied_rm40 = df_flair_bert_idv1['multiCorpus_noun_rm40'].str.len() == 0
df_flair_bert_idv1.loc[emptied_rm40, 'multiCorpus_noun_rm40'] = df_flair_bert_idv1.loc[emptied_rm40, 'multiCorpus_noun_rm10']

In [115]:
# Persist the stop-word-filtered frame (all columns, no index column).
df_flair_bert_idv1.to_csv('data/clean/clean_dataset_posBERTV1-1.csv', index=False)

In [117]:
# cleaned_text = df_flair_bert_idv1['name'].iloc[52].lower()
# sentence = Sentence(cleaned_text)
# pos_multiCorpus.predict(sentence)
# for token in sentence:
#     print(f"{token.text} -> {token.get_label('upos')}")

# STANZA

In [3]:
# Inspect columns, dtypes and non-null counts of the Stanza working frame.
df_stanza.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8061 entries, 0 to 8060
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         8061 non-null   object
 1   clean_name   8061 non-null   object
 2   name_length  8061 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 189.1+ KB


In [4]:
def extract_noun_stanza(text, nlp):
    """Keep the noun-like words of ``text`` according to a Stanza-style pipeline.

    The text is lower-cased, split on runs of non-word characters and re-joined
    with single spaces before being passed to ``nlp``. Words are then chosen
    with the first of these filters that matches anything:

    1. ``upos == 'NOUN'``;
    2. ``upos`` is ``NOUN`` or ``PROPN``;
    3. ``upos`` is neither ``NUM`` nor ``PUNCT``.

    If no filter matches, the cleaned text itself is returned. If anything
    raises, the error is printed and ``''`` is returned.
    """
    try:
        pieces = [piece for piece in re.split(r'[^\w]+', text.lower()) if piece]
        cleaned_text = ' '.join(pieces)

        doc = nlp(cleaned_text)
        tagged = [(word.text, word.upos) for sentence in doc.sentences for word in sentence.words]

        # Ordered from strictest to loosest; the first non-empty selection wins.
        selectors = (
            lambda upos: upos == 'NOUN',
            lambda upos: upos in ('NOUN', 'PROPN'),
            lambda upos: upos not in ('NUM', 'PUNCT'),
        )
        for keep in selectors:
            chosen = [token for token, upos in tagged if keep(upos)]
            if chosen:
                return ' '.join(chosen)

        return cleaned_text
    except Exception as e:
        print(f"Error processing text: {text}. Exception: {e}")
        return ''

In [5]:
# Extract nouns from the raw names with the Indonesian and the English pipeline.
for noun_column, pipeline in (('noun_id', nlp), ('noun_en', nlp_en)):
    # Bind the pipeline as a default argument so the lambda uses this iteration's model.
    df_stanza[noun_column] = df_stanza['name'].apply(lambda text, pipeline=pipeline: extract_noun_stanza(text, pipeline))

# REMOVING WORDS FROM COLUMN

In [None]:
def remove_word_bottom_noun(df, num_delete, column_name):
    """List the rare tokens of a column that the global ``nlp`` pipeline does not tag as nouns.

    Tokens are counted over the whitespace-split, concatenated values of
    ``df[column_name]``. Every token seen fewer than ``num_delete`` times is
    passed on its own to ``nlp``; each resulting word whose ``upos`` is neither
    ``NOUN`` nor ``PROPN`` is collected.

    Returns:
        The collected word texts as a list. If anything raises, the error is
        printed and ``''`` is returned.
    """
    try:
        frequencies = Counter(' '.join(df[column_name]).split())
        rare_terms = [term for term, seen in frequencies.items() if seen < num_delete]

        non_nouns = []
        for term in rare_terms:
            for sentence in nlp(term).sentences:
                for tagged in sentence.words:
                    if tagged.upos not in {'NOUN', 'PROPN'}:
                        non_nouns.append(tagged.text)

        return non_nouns
    except Exception as e:
        print(f"Error processing column: {column_name}. Exception: {e}")
        return ''

In [6]:
def remove_word_bottom(df, num_delete, column_name):
    """Return the tokens of ``df[column_name]`` that occur fewer than ``num_delete`` times.

    The column's strings are joined with spaces and split on whitespace before
    counting; the result keeps first-seen order. If anything raises, the error
    is printed and ``''`` is returned.
    """
    try:
        frequencies = Counter(' '.join(df[column_name]).split())
        rare_terms = []
        for term, seen in frequencies.items():
            if seen < num_delete:
                rare_terms.append(term)
        return rare_terms
    except Exception as e:
        print(f"Error processing column: {column_name}. Exception: {e}")
        return ''

In [7]:
def remove_specific_words(text, remove_word):
    """Drop each token of ``text`` whose lower-cased form is in ``remove_word``.

    Falsy input yields ``''``. Surviving tokens keep their casing and are joined
    by single spaces. If anything raises, the error is printed and ``''`` is
    returned.
    """
    try:
        if text:
            return ' '.join(token for token in text.split() if token.lower() not in remove_word)
        return ''
    except Exception as e:
        print(f"Error processing text: {text}. Exception: {e}")
        return ''

In [8]:
def check_empty_column(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value contains no words.

    Side effect: ``df['name_length']`` is overwritten with the whitespace-split
    word count of ``column_name``. The returned frame shows ``name``,
    ``clean_name``, ``column_name`` and ``name_length``. If anything raises,
    the error is printed and ``''`` is returned.
    """
    try:
        def _word_count(value):
            return len(value.split())

        df['name_length'] = df[column_name].apply(_word_count)
        is_empty = df['name_length'] == 0
        shown = ['name', 'clean_name', column_name, 'name_length']
        return df[is_empty][shown]
    except Exception as e:
        print(f"Error processing column: {column_name}. Exception: {e}")
        return ''

# STANZA ID

In [9]:
# Word count of the Indonesian noun extraction.
df_stanza['name_length'] = df_stanza['noun_id'].map(lambda nouns: len(nouns.split()))

# Rows where nothing was extracted.
df_stanza.loc[df_stanza['name_length'] == 0, ['name', 'noun_id', 'name_length']]

Unnamed: 0,name,noun_id,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 10 TIMES

In [10]:
# Tokens seen fewer than 10 times in noun_id are stripped from every row.
remove_word_10 = remove_word_bottom(df_stanza, 10, 'noun_id')
print(len(remove_word_10))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_id_rm10'] = df_stanza['noun_id'].apply(remove_specific_words, args=(frozenset(remove_word_10),))

10321


In [11]:
# Rows whose noun_id_rm10 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_id_rm10')
empty_column

Unnamed: 0,name,clean_name,noun_id_rm10,name_length
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
46,Capacitor/ Kapasitor 50kvar 400V / 86kvar 525V...,capacitor kapasitor kvar v kvar v al nokian,,0
97,Pen Scriber Holder with SOFT Grip dawning cutt...,pen scriber holder soft grip dawning cutting b...,,0
109,Klockner Moeller | PS416-MEM-440 | 512kB EEPRO...,klockner moeller ps mem kb eeprom flash cpu me...,,0
116,BIOAQUA Peach Makeup Remover Wipes 9g×30pcs,bioaqua peach makeup remover wipes g pcs,,0
...,...,...,...,...
8009,Love Beauty Planet Conditioner 400ml,love beauty planet conditioner ml,,0
8016,Fort Industrial Plug CEE-023 3 X 32A IP44,fort industrial plug cee x ip,,0
8021,Sample Tester Sirup 60ml,sample tester sirup ml,,0
8044,Portable Hard Shell EVA Travel Carrying Case S...,portable hard shell eva travel carrying case s...,,0


In [12]:
# Rows emptied by the rm10 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_id_rm10'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_id_rm10')

Unnamed: 0,name,clean_name,noun_id_rm10,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 20 TIMES

In [13]:
# Tokens seen fewer than 20 times in noun_id are stripped from every row.
remove_word_20 = remove_word_bottom(df_stanza, 20, 'noun_id')
print(len(remove_word_20))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_id_rm20'] = df_stanza['noun_id'].apply(remove_specific_words, args=(frozenset(remove_word_20),))

10863


In [14]:
# Rows whose noun_id_rm20 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_id_rm20')
empty_column

Unnamed: 0,name,clean_name,noun_id_rm20,name_length
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
24,Spanduk / Banner Toko Kelontong / Warung Sembako,spanduk banner toko kelontong warung sembako,,0
32,Konseling dan Terapi Qurani,konseling terapi qurani,,0
45,Piano Kawai US-50 Semi Grand Upright,piano kawai us semi grand upright,,0
46,Capacitor/ Kapasitor 50kvar 400V / 86kvar 525V...,capacitor kapasitor kvar v kvar v al nokian,,0
...,...,...,...,...
8021,Sample Tester Sirup 60ml,sample tester sirup ml,,0
8044,Portable Hard Shell EVA Travel Carrying Case S...,portable hard shell eva travel carrying case s...,,0
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0
8046,Viva Queen Perfect Art Eye Liner Pen,viva queen perfect art eye liner pen,,0


In [15]:
# Rows emptied by the rm20 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_id_rm20'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_id_rm20')

Unnamed: 0,name,clean_name,noun_id_rm20,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 30 TIMES

In [16]:
# Tokens seen fewer than 30 times in noun_id are stripped from every row.
remove_word_30 = remove_word_bottom(df_stanza, 30, 'noun_id')
print(len(remove_word_30))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_id_rm30'] = df_stanza['noun_id'].apply(remove_specific_words, args=(frozenset(remove_word_30),))

11051


In [17]:
# Rows whose noun_id_rm30 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_id_rm30')
empty_column

Unnamed: 0,name,clean_name,noun_id_rm30,name_length
10,Asin amigo/asin kerupuk kualitas super,asin amigo asin kerupuk kualitas super,,0
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
23,benih bibit tanaman sayuran repack,benih bibit tanaman sayuran repack,,0
24,Spanduk / Banner Toko Kelontong / Warung Sembako,spanduk banner toko kelontong warung sembako,,0
32,Konseling dan Terapi Qurani,konseling terapi qurani,,0
...,...,...,...,...
8044,Portable Hard Shell EVA Travel Carrying Case S...,portable hard shell eva travel carrying case s...,,0
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0
8046,Viva Queen Perfect Art Eye Liner Pen,viva queen perfect art eye liner pen,,0
8048,MAKE OVER Lip Amplify Contour Liner | Lip Liner,make lip amplify contour liner lip liner,,0


In [18]:
# Rows emptied by the rm30 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_id_rm30'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_id_rm30')

Unnamed: 0,name,clean_name,noun_id_rm30,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 40 TIMES

In [19]:
# Tokens seen fewer than 40 times in noun_id are stripped from every row.
remove_word_40 = remove_word_bottom(df_stanza, 40, 'noun_id')
print(len(remove_word_40))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_id_rm40'] = df_stanza['noun_id'].apply(remove_specific_words, args=(frozenset(remove_word_40),))

11151


In [20]:
# Rows whose noun_id_rm40 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_id_rm40')
empty_column

Unnamed: 0,name,clean_name,noun_id_rm40,name_length
10,Asin amigo/asin kerupuk kualitas super,asin amigo asin kerupuk kualitas super,,0
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
21,WeRoam Travel Sim Card Taiwan Kuota Besar Data...,weroam travel sim card taiwan kuota besar data...,,0
23,benih bibit tanaman sayuran repack,benih bibit tanaman sayuran repack,,0
24,Spanduk / Banner Toko Kelontong / Warung Sembako,spanduk banner toko kelontong warung sembako,,0
...,...,...,...,...
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0
8046,Viva Queen Perfect Art Eye Liner Pen,viva queen perfect art eye liner pen,,0
8048,MAKE OVER Lip Amplify Contour Liner | Lip Liner,make lip amplify contour liner lip liner,,0
8058,Gurita Potong Rebus Beku Frozen / Octopus Tako...,gurita potong rebus beku frozen octopus takoya...,,0


In [21]:
# Rows emptied by the rm40 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_id_rm40'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_id_rm40')

Unnamed: 0,name,clean_name,noun_id_rm40,name_length


# STANZA EN

In [22]:
# Count words (not characters) so this check agrees with the Indonesian section
# and with check_empty_column; a whitespace-only value is then flagged as empty too.
df_stanza['name_length'] = df_stanza['noun_en'].apply(lambda x: len(x.split()))

# Rows where the English pipeline extracted nothing.
df_stanza[df_stanza['name_length'] == 0][['name', 'noun_en', 'name_length']]

Unnamed: 0,name,noun_en,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 10 TIMES

In [23]:
# Tokens seen fewer than 10 times in noun_en are stripped from every row.
remove_word_10 = remove_word_bottom(df_stanza, 10, 'noun_en')
print(len(remove_word_10))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_en_rm10'] = df_stanza['noun_en'].apply(remove_specific_words, args=(frozenset(remove_word_10),))

9551


In [24]:
# Rows whose noun_en_rm10 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_en_rm10')
empty_column

Unnamed: 0,name,clean_name,noun_en_rm10,name_length
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
18,Yaxiya Gelang Bayi Perempuan Permata Perhiasan...,yaxiya gelang bayi perempuan permata perhiasan...,,0
19,BUSI MESIN FOGGING TASCO - SWINGFOG / MERK CHA...,busi mesin fogging tasco swingfog merk champio...,,0
23,benih bibit tanaman sayuran repack,benih bibit tanaman sayuran repack,,0
24,Spanduk / Banner Toko Kelontong / Warung Sembako,spanduk banner toko kelontong warung sembako,,0
...,...,...,...,...
8021,Sample Tester Sirup 60ml,sample tester sirup ml,,0
8026,Setelan Syari Wanita Muslimah Simple Rury one ...,setelan syari wanita muslimah simple rury one ...,,0
8031,(GOSEND/GRAB) LE MINERALE - Air Mineral Galon ...,gosend grab le minerale air mineral galon l se...,,0
8036,"[RCU24] Undangan Kalender Duduk, Undangan Pern...",rcu undangan kalender duduk undangan pernikaha...,,0


In [25]:
# Rows emptied by the rm10 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_en_rm10'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_en_rm10')

Unnamed: 0,name,clean_name,noun_en_rm10,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 20 TIMES

In [26]:
# Tokens seen fewer than 20 times in noun_en are stripped from every row.
remove_word_20 = remove_word_bottom(df_stanza, 20, 'noun_en')
print(len(remove_word_20))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_en_rm20'] = df_stanza['noun_en'].apply(remove_specific_words, args=(frozenset(remove_word_20),))

10069


In [27]:
# Rows whose noun_en_rm20 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_en_rm20')
empty_column

Unnamed: 0,name,clean_name,noun_en_rm20,name_length
9,Mesin Coding Automatic Cetak Expired Date Prod...,mesin coding automatic cetak expired date prod...,,0
10,Asin amigo/asin kerupuk kualitas super,asin amigo asin kerupuk kualitas super,,0
11,Panlandwoo - Gelang Bangle Stainless Wanita Bu...,panlandwoo gelang bangle stainless wanita butter,,0
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
18,Yaxiya Gelang Bayi Perempuan Permata Perhiasan...,yaxiya gelang bayi perempuan permata perhiasan...,,0
...,...,...,...,...
8036,"[RCU24] Undangan Kalender Duduk, Undangan Pern...",rcu undangan kalender duduk undangan pernikaha...,,0
8037,majalah musik Q Januari 2006,majalah musik q januari,,0
8040,Pakaian Bayi dan Anak Laki-laki Motif Best Friend,pakaian bayi anak laki laki motif best friend,,0
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0


In [28]:
# Rows emptied by the rm20 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_en_rm20'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_en_rm20')

Unnamed: 0,name,clean_name,noun_en_rm20,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 30 TIMES

In [29]:
# Tokens seen fewer than 30 times in noun_en are stripped from every row.
remove_word_30 = remove_word_bottom(df_stanza, 30, 'noun_en')
print(len(remove_word_30))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_en_rm30'] = df_stanza['noun_en'].apply(remove_specific_words, args=(frozenset(remove_word_30),))

10230


In [30]:
# Rows whose noun_en_rm30 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_en_rm30')
empty_column

Unnamed: 0,name,clean_name,noun_en_rm30,name_length
1,Buket Bunga Mawar Flanel / Bunga wisuda / Bung...,buket bunga mawar flanel bunga wisuda bunga so...,,0
9,Mesin Coding Automatic Cetak Expired Date Prod...,mesin coding automatic cetak expired date prod...,,0
10,Asin amigo/asin kerupuk kualitas super,asin amigo asin kerupuk kualitas super,,0
11,Panlandwoo - Gelang Bangle Stainless Wanita Bu...,panlandwoo gelang bangle stainless wanita butter,,0
15,Victorinox Rapid Peeler 6.0930,victorinox rapid peeler,,0
...,...,...,...,...
8036,"[RCU24] Undangan Kalender Duduk, Undangan Pern...",rcu undangan kalender duduk undangan pernikaha...,,0
8037,majalah musik Q Januari 2006,majalah musik q januari,,0
8040,Pakaian Bayi dan Anak Laki-laki Motif Best Friend,pakaian bayi anak laki laki motif best friend,,0
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0


In [31]:
# Rows emptied by the rm30 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_en_rm30'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_en_rm30')

Unnamed: 0,name,clean_name,noun_en_rm30,name_length


## REMOVE WORDS THAT OCCUR FEWER THAN 40 TIMES

In [32]:
# Tokens seen fewer than 40 times in noun_en are stripped from every row.
remove_word_40 = remove_word_bottom(df_stanza, 40, 'noun_en')
print(len(remove_word_40))
# Pass a frozenset: membership is tested for every token of every row, and a
# hash lookup avoids scanning the whole rare-word list each time.
df_stanza['noun_en_rm40'] = df_stanza['noun_en'].apply(remove_specific_words, args=(frozenset(remove_word_40),))

10301


In [33]:
# Rows whose noun_en_rm40 has zero words (the call also refreshes name_length).
empty_column = check_empty_column(df=df_stanza, column_name='noun_en_rm40')
empty_column

Unnamed: 0,name,clean_name,noun_en_rm40,name_length
1,Buket Bunga Mawar Flanel / Bunga wisuda / Bung...,buket bunga mawar flanel bunga wisuda bunga so...,,0
5,SUAVECITO wax rambut warna abu abu grey silver...,suavecito wax rambut warna abu abu grey silver...,,0
9,Mesin Coding Automatic Cetak Expired Date Prod...,mesin coding automatic cetak expired date prod...,,0
10,Asin amigo/asin kerupuk kualitas super,asin amigo asin kerupuk kualitas super,,0
11,Panlandwoo - Gelang Bangle Stainless Wanita Bu...,panlandwoo gelang bangle stainless wanita butter,,0
...,...,...,...,...
8045,Olaif Powerful Cleaning Liquid Detergent 1L- D...,olaif powerful cleaning liquid detergent l det...,,0
8048,MAKE OVER Lip Amplify Contour Liner | Lip Liner,make lip amplify contour liner lip liner,,0
8052,Glowies Lash Lift Effect With Comb Eyelash Cur...,glowies lash lift effect comb eyelash curler p...,,0
8053,Buku Novel Kisah Nyata ANGELA,buku novel kisah nyata angela,,0


In [34]:
# Rows emptied by the rm40 filter fall back to their full clean_name.
if len(empty_column):
    df_stanza.loc[df_stanza['name_length'].eq(0), 'noun_en_rm40'] = df_stanza['clean_name']

# Re-run the check on the patched column.
check_empty_column(df=df_stanza, column_name='noun_en_rm40')

Unnamed: 0,name,clean_name,noun_en_rm40,name_length


In [35]:
# Final schema check: the base columns plus the noun_id/noun_en columns and their rmNN variants.
df_stanza.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8061 entries, 0 to 8060
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          8061 non-null   object
 1   clean_name    8061 non-null   object
 2   name_length   8061 non-null   int64 
 3   noun_id       8061 non-null   object
 4   noun_en       8061 non-null   object
 5   noun_id_rm10  8061 non-null   object
 6   noun_id_rm20  8061 non-null   object
 7   noun_id_rm30  8061 non-null   object
 8   noun_id_rm40  8061 non-null   object
 9   noun_en_rm10  8061 non-null   object
 10  noun_en_rm20  8061 non-null   object
 11  noun_en_rm30  8061 non-null   object
 12  noun_en_rm40  8061 non-null   object
dtypes: int64(1), object(12)
memory usage: 818.8+ KB


In [36]:
# Persist the Stanza working frame (all columns, no index column).
df_stanza.to_csv('data/clean/clean_dataset_posStanza1.csv', index=False)