# Import Library dan Dataset

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


* Pandas untuk mengubah dataset menjadi dataframe dan memanipulasi data dalam bentuk dataframe
* NLTK merupakan library natural language yang digunakan untuk melakukan preprocessing dataset
* word_tokenize untuk melakukan tokenization
* Stopwords untuk menghapus stop words
* TreebankWordDetokenizer untuk mengembalikan bentuk token menjadi kalimat (kebalikan dari word tokenize)
* TfidfVectorizer untuk mengubah dataset menjadi sebuah vector TF-IDF



In [None]:
# Toy corpus of short Indonesian sentences. Each sentence is wrapped in its own
# single-element list so that it becomes one row of the one-column DataFrame.
data = [[sentence] for sentence in (
    'Saya suka dengan makanan laut',
    'Saya suka dengan minuman yang manis',
    'Kemarin saya makan makanan laut',
    'Hari ini saya makan makanan khas Sunda',
    'Besok saya berencana makan makanan khas Betawi',
    'Kemarin saya makan roti',
    'Hari ini saya makan Pizza',
    'Besok saya berencana makan Burger',
    'Hari ini saya minum teh tawar',
    'Kemarin saya minum Coca Cola',
    'Besok saya berencana minum kopi',
    'Lusa saya berencana minum teh manis',
)]

# Load the corpus into a DataFrame whose only column is named 'text'.
df = pd.DataFrame(data=data, columns=['text'])
df

Unnamed: 0,text
0,Saya suka dengan makanan laut
1,Saya suka dengan minuman yang manis
2,Kemarin saya makan makanan laut
3,Hari ini saya makan makanan khas Sunda
4,Besok saya berencana makan makanan khas Betawi
5,Kemarin saya makan roti
6,Hari ini saya makan Pizza
7,Besok saya berencana makan Burger
8,Hari ini saya minum teh tawar
9,Kemarin saya minum Coca Cola


# Tokenizing

In [None]:
# Tokenize every sentence: each string in the 'text' column is replaced by the
# list of its individual word tokens.
df['text'] = [nltk.word_tokenize(sentence) for sentence in df['text']]
df

Unnamed: 0,text
0,"[Saya, suka, dengan, makanan, laut]"
1,"[Saya, suka, dengan, minuman, yang, manis]"
2,"[Kemarin, saya, makan, makanan, laut]"
3,"[Hari, ini, saya, makan, makanan, khas, Sunda]"
4,"[Besok, saya, berencana, makan, makanan, khas,..."
5,"[Kemarin, saya, makan, roti]"
6,"[Hari, ini, saya, makan, Pizza]"
7,"[Besok, saya, berencana, makan, Burger]"
8,"[Hari, ini, saya, minum, teh, tawar]"
9,"[Kemarin, saya, minum, Coca, Cola]"


# Stopword Removal

In [None]:
# The corpus is written in Indonesian, so stopwords are removed using NLTK's
# Indonesian stopword list.
stop_words = set(stopwords.words('indonesian'))

# Compare the lowercased token against the list: a case-sensitive test lets
# capitalised sentence-initial words such as 'Saya' slip through while the
# lowercase 'saya' is removed. The surviving tokens keep their original casing.
df['text'] = df['text'].apply(
    lambda tokens: [token for token in tokens if token.lower() not in stop_words]
)
df

Unnamed: 0,text
0,"[Saya, suka, makanan, laut]"
1,"[Saya, suka, minuman, manis]"
2,"[Kemarin, makan, makanan, laut]"
3,"[Hari, makan, makanan, khas, Sunda]"
4,"[Besok, berencana, makan, makanan, khas, Betawi]"
5,"[Kemarin, makan, roti]"
6,"[Hari, makan, Pizza]"
7,"[Besok, berencana, makan, Burger]"
8,"[Hari, minum, teh, tawar]"
9,"[Kemarin, minum, Coca, Cola]"


# Detokenization

In [None]:
# Join each stopword-free token list back into a single sentence string.
detokenizer = TreebankWordDetokenizer()
df['text'] = df['text'].apply(detokenizer.detokenize)
df.head()

Unnamed: 0,text
0,Saya suka makanan laut
1,Saya suka minuman manis
2,Kemarin makan makanan laut
3,Hari makan makanan khas Sunda
4,Besok berencana makan makanan khas Betawi


# Mengubah Data menjadi Vector TF-IDF

In [None]:
# Convert the sentences in the 'text' column into TF-IDF vectors.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on installations that predate the new one, and keep feature_names a plain
# list as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Expand the sparse TF-IDF matrix into nested Python lists (one list per row).
dense = vectors.todense()
denselist = dense.tolist()

# Put the vectorized data back into a DataFrame with one column per term.
vectorized_datasets = pd.DataFrame(denselist, columns=feature_names)



In [None]:
# Display the TF-IDF DataFrame as the cell output.
vectorized_datasets

Unnamed: 0,berencana,besok,betawi,burger,coca,cola,hari,kemarin,khas,kopi,...,manis,minum,minuman,pizza,roti,saya,suka,sunda,tawar,teh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.524961,0.524961,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.479143,0.0,0.557913,0.0,0.0,0.479143,0.479143,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.524184,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.431253,0.0,0.488198,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.568457,0.0,0.0
4,0.360983,0.402174,0.530128,0.0,0.0,0.0,0.0,0.0,0.45528,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551336,0.0,0.0,...,0.0,0.0,0.0,0.0,0.726746,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.551336,0.0,0.0,0.0,...,0.0,0.0,0.0,0.726746,0.0,0.0,0.0,0.0,0.0,0.0
7,0.443529,0.49414,0.0,0.651353,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.455266,0.0,0.0,0.0,...,0.0,0.408636,0.0,0.0,0.0,0.0,0.0,0.0,0.60011,0.515381
9,0.0,0.0,0.0,0.0,0.573615,0.573615,0.0,0.435165,0.0,0.0,...,0.0,0.390594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
