In [1]:
import nltk
# Fetch the NLTK resources used further down: tokenizer data (word_tokenize),
# the stop-word lists (stopwords) and WordNet (WordNetLemmatizer).
# Newer NLTK releases load the 'punkt_tab' tokenizer data instead of the
# pickled 'punkt' models, so both are requested to keep the notebook runnable
# on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/caffeine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/caffeine/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/caffeine/nltk_data...


True

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# --- 1. Sample data preparation ---
# Build a small in-memory sample with the expected column layout.
# In a real run, load your own data instead (e.g. pd.read_csv('your_data.csv')).
# Note that the first and the last record are identical.
sample_columns = ['userName', 'content', 'score', 'reviewCreatedVersion', 'at', 'Sentimen Akhir']
sample_rows = [
    (
        'user_A',
        'The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats',
        2, '2.1', '2022-07-11', 'Negative',
    ),
    (
        'user_B',
        'Just got my second vaccine dose. Feeling hopeful and safe. Thanks to the amazing staff @healthclinic. 😊 #vaccinated',
        5, '2.1', '2022-07-12', 'Positive',
    ),
    (
        'user_C',
        'Another lockdown? Seriously? I am so done with this. This is the 4th time!',
        1, '2.2', '2022-07-13', 'Negative',
    ),
    (
        'user_D',
        'This research on the Epsilon variant is quite interesting. It suggests a different approach might be needed.',
        4, '2.2', '2022-07-14', 'Neutral',
    ),
    (
        'user_A',
        'The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats',
        2, '2.1', '2022-07-11', 'Negative',
    ),
]
# Transpose the records into the column -> list-of-values mapping.
data = {name: list(values) for name, values in zip(sample_columns, zip(*sample_rows))}
df = pd.DataFrame(data)

In [5]:
# Preview the raw sample (user and text only) before any cleaning is applied.
print("--- Data Awal ---")
row_count = len(df)
print(f"Jumlah baris awal: {row_count}")
preview = df.loc[:, ['userName', 'content']]
print(preview.to_markdown(index=False))
separator = "="*50
print("\n" + separator + "\n")


--- Data Awal ---
Jumlah baris awal: 5
| userName   | content                                                                                                              |
|:-----------|:---------------------------------------------------------------------------------------------------------------------|
| user_A     | The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats       |
| user_B     | Just got my second vaccine dose. Feeling hopeful and safe. Thanks to the amazing staff @healthclinic. 😊 #vaccinated |
| user_C     | Another lockdown? Seriously? I am so done with this. This is the 4th time!                                           |
| user_D     | This research on the Epsilon variant is quite interesting. It suggests a different approach might be needed.         |
| user_A     | The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats       |




In [6]:
# --- 2. Duplicate tweet removal ---
# Drop every row whose 'content' repeats an earlier row (the first occurrence
# is kept), following the methodology's goal of avoiding redundant data.
# The result is assigned back instead of mutating the frame in place, and
# ignore_index=True renumbers the remaining rows 0..n-1 in the same call.
df = df.drop_duplicates(subset=['content'], ignore_index=True)

print("--- Setelah Penghapusan Duplikat ---")
print(f"Jumlah baris setelah menghapus duplikat: {len(df)}")
print(df[['userName', 'content']].to_markdown(index=False))
print("\n" + "="*50 + "\n")

--- Setelah Penghapusan Duplikat ---
Jumlah baris setelah menghapus duplikat: 4
| userName   | content                                                                                                              |
|:-----------|:---------------------------------------------------------------------------------------------------------------------|
| user_A     | The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats       |
| user_B     | Just got my second vaccine dose. Feeling hopeful and safe. Thanks to the amazing staff @healthclinic. 😊 #vaccinated |
| user_C     | Another lockdown? Seriously? I am so done with this. This is the 4th time!                                           |
| user_D     | This research on the Epsilon variant is quite interesting. It suggests a different approach might be needed.         |




In [7]:
# --- 3. Main preprocessing function ---
# Shared resources for the cleaning steps described in the journal article.
# Both are created once at module level so preprocess_tweet can reuse them
# for every row instead of rebuilding them per call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(text) -> str:
    """Clean one tweet and return it as a space-joined string of lemmas.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags; replace
    every remaining non-letter character (digits, punctuation, emoji) with a
    space; tokenize; drop English stop words; lemmatize each token.

    Args:
        text: Raw tweet text. Any value that is not a ``str`` (for example the
            NaN pandas uses for a missing cell) is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # Missing / non-text values have no .lower(); yield an empty result
    # instead of raising in the middle of a DataFrame-wide apply.
    if not isinstance(text, str):
        return ''

    # a. Lowercase
    text = text.lower()

    # b. Remove URLs, mentions, hashtags, symbols and digits
    # URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Mentions (@username)
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # Hashtags (#hashtag) -- the whole tag is dropped, not just the '#'
    text = re.sub(r'#\w+', '', text)
    # Everything that is not an ASCII letter or whitespace (emoji, punctuation
    # and digits alike). A space is substituted rather than nothing so that
    # words separated only by punctuation are not glued together
    # ("hopeful,safe" -> "hopeful safe") and contractions split into pieces
    # ("don't" -> "don t") instead of collapsing into a single new token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # c. Tokenize
    tokens = word_tokenize(text)

    # d. Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # e. Lemmatize: reduce each word to its base form (lemma).
    # No part-of-speech tag is passed, so the lemmatizer's default is used; in
    # the sample output plural nouns are reduced ("cases" -> "case") while verb
    # forms such as "spreading" are left unchanged.
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the tokens back into one cleaned string
    return ' '.join(tokens)

# Run the preprocessing function over every value of the 'content' column
# and store the cleaned text in a new 'processed_content' column.
cleaned_content = df['content'].map(preprocess_tweet)
df['processed_content'] = cleaned_content

In [8]:
# Columns to show side by side: author, raw text, cleaned text and label.
final_columns = ['userName', 'content', 'processed_content', 'Sentimen Akhir']
print("--- DataFrame Final Setelah Semua Tahapan Pra-pemrosesan ---")
final_view = df.loc[:, final_columns]
print(final_view.to_markdown(index=False))

--- DataFrame Final Setelah Semua Tahapan Pra-pemrosesan ---
| userName   | content                                                                                                              | processed_content                                                                   | Sentimen Akhir   |
|:-----------|:---------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:-----------------|
| user_A     | The #COVID19 Delta variant is spreading fast!! 😱 So worried about the new cases. Check https://info.gov/stats       | delta variant spreading fast worried new case check                                 | Negative         |
| user_B     | Just got my second vaccine dose. Feeling hopeful and safe. Thanks to the amazing staff @healthclinic. 😊 #vaccinated | got second vaccine dose feeling hopeful safe thanks amazing staff                   | Posi