In [None]:
!pip install nltk
!pip install Sastrawi
!pip install scikit-learn

In [None]:
from google.colab import files
import pandas as pd

# Open the Colab upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()

# Fail early with a clear message when nothing was uploaded (e.g. the dialog
# was cancelled); otherwise `data` would be undefined and the cell would end
# with a confusing NameError.
if not uploaded:
    raise ValueError("No file was uploaded; upload the CSV file to continue.")

# The uploaded file is assumed to be a CSV. If several files were uploaded the
# last one is used, which is what the previous per-file loop ended up doing.
fn = list(uploaded.keys())[-1]
data = pd.read_csv(fn)

# Show the first rows of the loaded data
print(data.head())

In [None]:
# Show the loaded DataFrame as this cell's output (a bare last-line expression is displayed by the notebook).
data

In [None]:
# Drop the columns that are not needed for the analysis.
# `errors='ignore'` keeps this cell re-runnable: on a second run the columns
# are already gone, and the default `errors='raise'` would stop with a KeyError.
data = data.drop(
    columns=[
        'conversation_id_str',
        'created_at',
        'favorite_count',
        'id_str',
        'image_url',
        'in_reply_to_screen_name',
        'lang',
        'quote_count',
        'reply_count',
        'retweet_count',
        'tweet_url',
        'user_id_str',
    ],
    errors='ignore',
)

# Show the data after the columns were dropped
print(data.head())

In [None]:
import re
import nltk
# Fetch the tokenizer models and stop-word lists at runtime.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of 'punkt'. Request both so `word_tokenize` works with whichever NLTK version
# pip installed.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Cleaning function
def clean_text(text):
    """Strip URLs, digits and punctuation from ``text`` and normalise whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with runs of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix is treated as a URL; an unescaped dot also deleted ordinary
    # words that merely contain "www" followed by any character (e.g. "awwwesome").
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove everything that is neither a word character nor whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse repeated whitespace and trim the ends
    return text

# Case-folding function
def case_folding(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Tokenizing function
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# Stop-word removal function
# The Indonesian stop-word list from NLTK, held in a set for fast membership tests.
stop_words = {*stopwords.words('indonesian')}
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Stemming function
# One Sastrawi stemmer is created up front and shared by every call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(tokens):
    """Stem each token with the shared Sastrawi stemmer and return the results as a list."""
    return list(map(stemmer.stem, tokens))

# Complete preprocessing pipeline
def preprocess_text(text):
    """Run the whole preprocessing pipeline on one value.

    The steps are, in order: cleaning, case folding, tokenizing, stop-word
    removal and stemming.

    Args:
        text: The raw text; a missing value (NaN) is treated as an empty string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Missing values become an empty string so the string helpers can run on them
    raw = '' if pd.isna(text) else text
    folded = case_folding(clean_text(raw))
    kept = remove_stopwords(tokenize(folded))
    return ' '.join(stemming(kept))


In [None]:
# Run the preprocessing pipeline on the location and tweet-text columns.
data['processed_location'] = data['location'].apply(preprocess_text)
data['processed_text'] = data['full_text'].apply(preprocess_text)

# Keep only the rows whose location is non-empty after preprocessing.
# `.copy()` makes the filtered result an independent DataFrame, so adding or
# overwriting columns on it later does not trigger pandas' SettingWithCopyWarning.
data = data[data['processed_location'] != ''].copy()
print(data.head())

In [None]:
# Render the result with the notebook's rich display
from IPython.display import display

# Select 'processed_location' as a one-column DataFrame and show only that
display(data.loc[:, ['processed_location']])


In [None]:
# Sentiment lexicons, built once instead of on every call. Entries are lower
# case because the text being labelled is lower-cased before matching.
POSITIVE_WORDS = frozenset({
    'alhamdulilah', 'selamat', 'baik', 'semangat', 'keren', 'luar', 'biasa',
    'sangat', 'insya', 'allah',
})
# NOTE(review): the negative lexicon is kept for reference only. The labelling
# rule below never consults it: any text without a positive word is 'negatif'.
NEGATIVE_WORDS = frozenset({
    'gila', 'doxxing', 'serem', 'protes', 'menghina', 'sanksi', 'nyindir',
    'singgung', 'umpatan', 'aneh', 'adu', 'tolol', 'goblok', 'bodoh',
    'menyesatkan', 'bangsat', 'buruk', 'anjing', 'gaasik', 'kecurangan',
    'capek',
})


def label_text(text):
    """Assign a lexicon-based sentiment label to ``text``.

    The text is lower-cased and split on whitespace, and each token is compared
    as a whole word against ``POSITIVE_WORDS``. Whole-word matching avoids
    false positives from substrings (e.g. "luar" inside "keluarga"), and
    lower-casing lets every lexicon entry match regardless of capitalisation.

    Args:
        text: The text to label. Non-string values are treated as having no
            words.

    Returns:
        'positif' if at least one token is a positive word, otherwise
        'negatif'.
    """
    if not isinstance(text, str):
        return 'negatif'

    tokens = set(text.lower().split())
    if not POSITIVE_WORDS.isdisjoint(tokens):
        return 'positif'
    return 'negatif'

# Label every pre-processed tweet (column 'processed_text') and store the result in 'sentiment'
data['sentiment'] = data['processed_text'].map(label_text)

# Print the text, its label and the original location to verify the labelling
print(data.loc[:, ['processed_text', 'sentiment', 'location']])

## MENGHITUNG SENTIMEN BERDASARKAN LOKASI

In [None]:
# Count the rows in each (processed location, sentiment) combination
sentiment_distribution = (
    data
    .groupby(['processed_location', 'sentiment'])
    .size()
    .reset_index(name='counts')
)
print(sentiment_distribution.loc[:, ['processed_location', 'sentiment', 'counts']])

In [None]:
from sklearn.model_selection import train_test_split

# Features are the pre-processed texts; targets are the sentiment labels.
X, y = data['processed_text'], data['sentiment']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Feature names (the terms) learned by the vectorizer.
# NOTE: despite its name, `full_text` holds these feature names, not tweet text.
full_text = vectorizer.get_feature_names_out()

# Dense DataFrame views of both matrices, one column per feature
X_train_tfidf_df, X_test_tfidf_df = (
    pd.DataFrame(matrix.toarray(), columns=full_text)
    for matrix in (X_train_vec, X_test_vec)
)

# Print both TF-IDF tables
print("TF-IDF untuk data pelatihan:")
print(X_train_tfidf_df)

print("\nTF-IDF untuk data pengujian:")
print(X_test_tfidf_df)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (`fit` returns the estimator, so creation and training are chained).
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict a label for every test text
y_pred = model.predict(X_test_vec)

# Print each test text with its actual and predicted label, followed by a blank line
for text, true_label, predicted_label in zip(X_test, y_test, y_pred):
    print(
        f"Text: {text}",
        f"Actual Sentiment: {true_label}",
        f"Predicted Sentiment: {predicted_label}",
        "",
        sep="\n",
    )


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predict the labels of the test set
y_pred = model.predict(X_test_vec)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Classification report
report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{report}')

# Confusion matrix.
# Passing `labels=model.classes_` fixes the row/column order and keeps classes
# that are absent from this test split, so the matrix always has one row and
# column per entry of the `display_labels` given to the plot below.
conf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()