<a href="https://colab.research.google.com/github/ahsanuamal/ahsanuamal/blob/main/Text_Mining_%26_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setting Environment**

**Linux Standard Base**. Perintah `lsb_release -a` digunakan untuk mengidentifikasi dan menampilkan informasi distribusi Linux yang sedang digunakan.

In [None]:
! lsb_release -a


**Checking package version**

In [None]:
import pkg_resources

In [None]:
# List every installed distribution as "name version".
# pkg_resources is deprecated by setuptools; importlib.metadata is the
# standard-library replacement, imported here so the cell is self-contained.
from importlib import metadata

installed = sorted(
    f"{dist.metadata.get('Name', '<unknown>')} {dist.version}"
    for dist in metadata.distributions()
)
for entry in installed:
    print(entry)

**Import Library**

In [None]:
import nltk, warnings; warnings.simplefilter('ignore')
import logging; logging.captureWarnings(True)
import re

**Download NLTK resources (Punkt tokenizer models and stop words)**

In [None]:
# 'punkt' holds the tokenizer models used by older NLTK releases; newer
# releases look up 'punkt_tab' instead. Fetch both so that
# nltk.sent_tokenize / nltk.word_tokenize work regardless of the installed
# NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
nltk.download('stopwords')

**Install Sastrawi Library**

In [None]:
!pip install sastrawi

In [None]:
!pip list | grep -i sastrawi

# **Tokenization & Cleansing**

**Create Dataset**

In [None]:
string01 = "Asep dan Sumardi sedang bermain layangan di lapangan depan masjid"
string02 = "Bagaimanakah reaksi orang tua Ponirah ketika Tukijan datang melamarnya?"

**Get Indonesian Stop Words**

In [None]:
stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
len(stopwords)

**Create function**

In [None]:
def tokenize_clean(text):
    """Tokenize ``text`` into lowercase words and drop tokens without letters.

    The text is split into sentences first, then each sentence into words.
    Tokens that contain no ASCII letter at all (numbers, punctuation) are
    discarded; every other token is kept in its original order.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens, each containing at least one ASCII letter.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # Keep a token as soon as it contains one letter anywhere.
    has_letter = re.compile('[a-zA-Z]')
    return [candidate for candidate in lowered if has_letter.search(candidate)]

In [None]:
def remove_stopwords(tokenized_text, stopword_list=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokenized_text: Iterable of string tokens.
        stopword_list: Optional iterable of stop words to filter out. When
            omitted, the notebook-level ``stopwords`` list is used, which is
            the original behaviour.

    Returns:
        A new list containing only the tokens absent from the stop-word list.
    """
    if stopword_list is None:
        stopword_list = stopwords

    # A set gives constant-time membership checks instead of scanning the
    # whole stop-word list once per token.
    blocked = set(stopword_list)
    return [token for token in tokenized_text if token not in blocked]

**Processing data**

In [None]:
# Tokenize and clean the first sample sentence, then filter the resulting
# tokens through remove_stopwords.
result01 = tokenize_clean(string01)
result = remove_stopwords(result01)


**Checking results**

In [None]:
print(string01)
print(result01)
print(result)

In [None]:
len(result01)

In [None]:
result01

In [None]:
result01[0]

In [None]:
result01[-1]

In [None]:
result01[2:4]

In [None]:
for kata in result01:
  print(kata) 

# **Stemming & Lemmatization**

**Import required library**

In [None]:
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

**Stemming Function**

In [None]:
# Lazily created Sastrawi stemmer shared by every stemming_text call.
_DEFAULT_STEMMER = None


def _get_default_stemmer():
    """Build the Sastrawi stemmer on first use and reuse it afterwards."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


def stemming_text(tokenized_text, stemmer=None):
    """Stem every token and return the stems in the same order.

    Args:
        tokenized_text: Iterable of string tokens.
        stemmer: Optional object exposing ``stem(token)``. When omitted, a
            Sastrawi stemmer is created once and reused, instead of being
            rebuilt on every call (this function runs once per document when
            it is part of a vectorizer's tokenizer).

    Returns:
        A list with one stem per input token.
    """
    if stemmer is None:
        stemmer = _get_default_stemmer()

    return [stemmer.stem(token) for token in tokenized_text]

In [None]:
# Run the full pipeline on the first sample sentence and print each stage:
# raw text, cleaned tokens, tokens without stop words, and stems.
result01 = tokenize_clean(string01)
result = remove_stopwords(result01)
stem_result = stemming_text(result)
for stage_output in (string01, result01, result, stem_result):
    print(stage_output)

In [None]:
stemming_text(result)

# **TF-IDF**

**Import required library**

In [None]:
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

**Preprocessing**

In [None]:
def text_preprocessing(text):
    """Run the whole preprocessing pipeline on one raw string.

    The steps are applied in order: tokenize and clean, remove stop words,
    then stem.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of stemmed tokens produced by the last step.
    """
    return stemming_text(remove_stopwords(tokenize_clean(text)))

**Create Dataset**

In [None]:
# Toy corpus of four short documents; each string is one document passed to
# the TF-IDF vectorizer below.
dataset = [
     'kucing kucing kucing hitam putih belang',
     'tikus belang',
     'tikus hitam',
     'tikus tikus tikus'
]

In [None]:
len(dataset)

In [None]:
dataset[2]

**Compute TF-IDF**

In [None]:
#perform tf-idf vectorization
vectorizer = TfidfVectorizer(use_idf=True)
result_tfidf = vectorizer.fit_transform(dataset)

**View Result**

Get List Words

In [None]:
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement and returns the vocabulary
# terms in column order.
print(vectorizer.get_feature_names_out())

In [None]:
print(vectorizer.vocabulary_)

View TF-IDF Result

In [None]:
print(result_tfidf.shape)

In [None]:
type(result_tfidf)

In [None]:
print(result_tfidf.toarray())

View First Sentence

In [None]:
dataset[0]

In [None]:
print(result_tfidf[0])

In [None]:
print(result_tfidf[0].toarray())

View Second Sentence

In [None]:
print(result_tfidf[1])

In [None]:
print(result_tfidf[1].toarray())

In [None]:
dataset[1]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use the replacement.
vectorizer.get_feature_names_out()

In [None]:
import pandas as pd

# One row per vocabulary term with its TF-IDF weight in the second document.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
df = pd.DataFrame(
    result_tfidf[1].T.toarray(),
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df.sort_values(by=['TF-IDF'])

View IDF

In [None]:
# Tabulate the learned IDF weight of every vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
df_idf = pd.DataFrame(
    vectorizer.idf_,
    index=vectorizer.get_feature_names_out(),
    columns=["idf"],
)

# sort ascending
df_idf.sort_values(by=['idf'])

# **Compute TF-IDF with new sentence**

In [None]:
# Project an unseen sentence onto the vocabulary learned from `dataset`.
# NOTE: this rebinds result_tfidf, so it no longer holds the matrix that was
# fitted on `dataset` above.
new_text = 'kambing hitam'
result_tfidf = vectorizer.transform([new_text])

# get_feature_names() was removed in scikit-learn 1.2; use the replacement.
feature_names = vectorizer.get_feature_names_out()

In [None]:
feature_names

In [None]:
result_tfidf.toarray()

# **Bigger Dataset**

**Create Dataset**

In [None]:
# Five sample sentences that make up the larger corpus.
files = [
    "Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.",
    "Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.",
    "Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.",
    "Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.",
    "Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam",
]

**Corpus preparation**

In [None]:
# Key each document as "file<N>", where N is its position in `files`.
# enumerate replaces the hand-maintained counter, and the comprehension keeps
# the loop variables out of the notebook namespace.
token_dict = {f"file{index}": text for index, text in enumerate(files)}

token_dict

In [None]:
token_dict.values()

In [None]:
token_dict['file0']

**Compute TF-IDF**

In [None]:
# Configure the TF-IDF vectorizer for the larger corpus, then fit it on the
# documents stored in token_dict and transform them in one step.
tfidf = TfidfVectorizer(max_df=0.8,             # float = proportion: ignore terms found in more than 80% of the documents
                        min_df=0.2,             # float = proportion: ignore terms found in fewer than 20% of the documents
                        max_features=200000,    # keep at most 200,000 terms, ranked by term frequency across the corpus
                        stop_words = stopwords, # stop-word list handed to the vectorizer
                        use_idf=True,           # enable inverse-document-frequency reweighting
                        tokenizer=text_preprocessing, # replace the default tokenization step with text_preprocessing
                        ngram_range=(1,3))      # extract n-grams of length 1 to 3


tfs = tfidf.fit_transform(token_dict.values())

**View Result**

In [None]:
tfs.shape

In [None]:
print(tfs[0])

View the list of features

In [None]:
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns the same terms in column order.
feature_names = tfidf.get_feature_names_out()

In [None]:
print(len(feature_names))

In [None]:
print(feature_names)

In [None]:
# print idf values
df_idf = pd.DataFrame(tfidf.idf_, index=feature_names,columns=["idf"])
 
# sort ascending
df_idf.sort_values(by=['idf'])

**New sentence TF-IDF transformation**

In [None]:
# Vectorize an unseen sentence with the already-fitted vectorizer.
str1 = 'Di kejauhan tampak seorang relawan pria dari Lombok sedang berjalan.'
response = tfidf.transform([str1])

# Show each feature that received a non-zero weight, with that weight.
nonzero_columns = response.nonzero()[1]
for col in nonzero_columns:
    weight = response[0, col]
    print(feature_names[col], ' - ', weight)

In [None]:
print(response[0])

In [None]:
print (text_preprocessing(str1))