### Import Libary

In [15]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [16]:
# download stopword indonesia
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [17]:
df = pd.read_csv("tweet_pak_jokowi.csv")

In [18]:
print(df.head())

   Unnamed: 0                                               text    Label
0           0  rt @megatop99: @conannkri @ccicpolri @mohmahfu...  positif
1           1  @jokowi harga2 pd naik gaji aparat negara gk n...  negatif
2           2  xx : kalian coba mengusir yang mau membersihka...  negatif
3           3  @jokowi haturnuhun bapak presiden @jokowi tela...   netral
4           4  @rifanrobani @catatan_ali7 @erickthohir @jokow...   netral


### inisialisasi Stemmer dan Stopwords

In [19]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

In [20]:
def preprocess_data(text):
    # Case Folding
    text = text.lower()
    
    # Cleaning utk mengahpus simbol
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[-+]?[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenizing utk memecah teks menjadi per-kata
    tokens = text.split()
    
    # Stopword Removal utk menghapus kata umum
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming utk mengubah ke kata dasar
    tokens = [stemmer.stem(word) for word in tokens]
    
    return " ".join(tokens)


In [21]:
# 100 data
df_clean = df.head(100).copy()
df_clean['hasil_preprocessing'] = df_clean['text'].apply(preprocess_data)

In [22]:
# menampilkan hasil perbandingannya
print(df_clean[['text', 'hasil_preprocessing']].head())

                                                text  \
0  rt @megatop99: @conannkri @ccicpolri @mohmahfu...   
1  @jokowi harga2 pd naik gaji aparat negara gk n...   
2  xx : kalian coba mengusir yang mau membersihka...   
3  @jokowi haturnuhun bapak presiden @jokowi tela...   
4  @rifanrobani @catatan_ali7 @erickthohir @jokow...   

                                 hasil_preprocessing  
0  rt megatop conannkri ccicpolri mohmahfudmd neg...  
1  jokowi harga pd gaji aparat negara gk naikyg e...  
2   xx coba usir bersih air planet gagal tukar air l  
3  jokowi haturnuhun presiden jokowi kenan kunker...  
4     rifanrobani catat ali erickthohir jokowi titis  


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfidf = TfidfVectorizer()

In [25]:
#teks hasil processing menjadi angka
X = tfidf.fit_transform(df_clean['hasil_preprocessing'])

In [26]:
# menampilkan 20 daftar pertama kata yang berhasil diekstrak 
print(tfidf.get_feature_names_out()[:20])

['abdulhaadi' 'ad' 'adeem' 'adil' 'adilibarat' 'ahhh' 'air' 'aja' 'akun'
 'alami' 'ali' 'alsannyaku' 'ama' 'aman' 'amin' 'amp' 'ancam' 'andai'
 'andi' 'angkat']


### Modeling

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
# menentukan target/label menggunakan 100 data yang sudah di processing
y = df_clean['Label']

# menginsialisasi dan melatih model
model = LogisticRegression()
model.fit(X, y)

# cek akurasi
y_pred = model.predict(X)
print(f"akurasi model di data training: {accuracy_score(y, y_pred) * 100:2f}%")

akurasi model di data training: 95.000000%
