<a href="https://colab.research.google.com/github/alvintnw/AnalysisStentimentFilm/blob/main/AnalysisStentimentFilm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re # Untuk regular expression (pembersihan teks)
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords # Untuk menghapus kata-kata umum
from nltk.stem import PorterStemmer # Untuk stemming (mengurangi kata ke akar)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # Untuk mengubah teks menjadi angka
from sklearn.linear_model import LogisticRegression # Model klasifikasi
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download the required NLTK resources (only needed once per environment).
# If an NLTK resource error occurs, try running this in a separate cell:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt') # For tokenization

In [None]:
# Source of the data: a raw CSV hosted on GitHub.
# The dataset holds film reviews together with their sentiment label (positive/negative).
url = "https://raw.githubusercontent.com/Ankit152/IMDB-Sentiment-Analysis/master/IMDB-Dataset.csv"
df = pd.read_csv(url)

# Preview the first five rows of the DataFrame.
print("5 baris pertama dari dataset:", df.head(), sep="\n")

# Basic structural information about the dataset.
print("\nInformasi dataset:")
df.info()

# Number of reviews per sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print("\nDistribusi Sentimen:", sentiment_counts, sep="\n")

In [None]:
import nltk

# Fetch the NLTK stop-word corpus only when it is not already installed, so
# that re-running the notebook does not trigger a download attempt every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # The corpus was not found in any NLTK data directory: download it now.
    nltk.download('stopwords')

In [None]:
# Shared text-processing resources used by clean_text.
stemmer = PorterStemmer()  # reduces each word to its stem
english_stopword_list = stopwords.words('english')  # the dataset is in English
stop_words = set(english_stopword_list)  # set for fast membership tests

def clean_text(text):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Replace HTML tags with a space.
      2. Delete every character that is not an ASCII letter or whitespace,
         then lowercase the result.
      3. Split the text on whitespace.
      4. Discard tokens found in the module-level ``stop_words`` and stem the
         remaining ones with the module-level ``stemmer``.
      5. Join the tokens back together with single spaces.

    Args:
        text (str): Raw review text, possibly containing HTML markup.

    Returns:
        str: The cleaned text; an empty string when no token survives.
    """
    # Tags are replaced with a space rather than removed outright: markup such
    # as "<br />" often sits directly between two words ("great.<br />The"),
    # and deleting it would fuse them into one bogus token ("greatthe").
    text = re.sub(r'<.*?>', ' ', text)
    # Punctuation and digits are deleted (not replaced by a space), so a
    # contraction collapses into a single token, e.g. "wouldn't" -> "wouldnt".
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stemmed_tokens)

# Store the cleaned version of every review in a new 'cleaned_review' column.
df['cleaned_review'] = df['review'].apply(clean_text)

# Show the first reviews side by side (raw vs. cleaned), each truncated to its
# first 100 characters. Iterating over .head(5) is positional, so it does not
# depend on the index labels being 0..4 and does not fail on a shorter frame.
print("Contoh Ulasan Asli vs Dibersihkan:")
preview = df[['review', 'cleaned_review']].head(5)
for original, cleaned in zip(preview['review'], preview['cleaned_review']):
    print(f"Asli: {original[:100]}...")
    print(f"Dibersihkan: {cleaned[:100]}...\n")

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# An explicit mapping with a validation step is used so that an unexpected
# label (missing value, different casing, typo) raises an error instead of
# being silently counted as negative.
sentiment_to_int = {'negative': 0, 'positive': 1}
unexpected_labels = set(df['sentiment'].unique()) - set(sentiment_to_int)
if unexpected_labels:
    raise ValueError(f"Label sentimen tidak dikenal: {sorted(unexpected_labels, key=str)}")
df['sentiment_numeric'] = df['sentiment'].map(sentiment_to_int).astype('int64')

# Show the first five rows, now including the numeric sentiment column.
print(df.head())
print("\nDistribusi Sentimen Numerik:")
print(df['sentiment_numeric'].value_counts())

In [None]:
# Features: the cleaned review text. Target: the numeric sentiment label.
X, y = df['cleaned_review'], df['sentiment_numeric']

print("Bentuk X (Fitur):", X.shape)
print("Bentuk y (Target):", y.shape)

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Size of each split, absolute and as a percentage of the full dataset.
print(f"Jumlah data pelatihan: {len(X_train)} ({len(X_train)/len(df)*100:.2f}%)")
print(f"Jumlah data pengujian: {len(X_test)} ({len(X_test)/len(df)*100:.2f}%)")

# Relative class frequencies inside each split.
print("\nDistribusi sentimen dalam data pelatihan:", y_train.value_counts(normalize=True), sep="\n")
print("\nDistribusi sentimen dalam data pengujian:", y_test.value_counts(normalize=True), sep="\n")

In [None]:
# TF-IDF vectorizer with the number of features (words) capped at 5000.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary from the training reviews only and vectorize them,
# then vectorize the test reviews with that same fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

for caption, matrix in (("Bentuk X_train_tfidf:", X_train_tfidf), ("Bentuk X_test_tfidf:", X_test_tfidf)):
    print(caption, matrix.shape)

In [None]:
# Logistic-regression classifier. max_iter is set to 1000; raise it further
# if convergence is not reached.
logreg_options = {'max_iter': 1000}
model = LogisticRegression(**logreg_options)

print("Model Regresi Logistik berhasil diinisialisasi!")

In [None]:
# Fit the model on the TF-IDF training matrix and the training labels.
print("Melatih model...")
model.fit(X_train_tfidf, y_train)
print("Model berhasil dilatih!")

In [None]:
# Predict a label for every row of the TF-IDF test matrix.
y_pred = model.predict(X_test_tfidf)

print("Prediksi pada data pengujian berhasil dibuat!")

In [None]:
# Display names for the two classes, used in the report and on both heatmap axes.
class_names = ['Negative', 'Positive']

# Accuracy of the predictions against the test labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Model: {accuracy:.4f}\n")

# Classification report for the two classes.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix drawn as an annotated heatmap
# (y axis: actual labels, x axis: predicted labels).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Prediksi')
ax.set_ylabel('Aktual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
def predict_sentiment(text):
    """Classify a single raw review as "Positif" or "Negatif".

    The text is passed through ``clean_text``, vectorized with the
    already-fitted ``tfidf_vectorizer`` and scored by the trained ``model``.

    Args:
        text: Raw review text.

    Returns:
        str: "Positif" when the model predicts class 1, otherwise "Negatif".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    return "Positif" if predicted_label == 1 else "Negatif"

# New example reviews used to try out predict_sentiment.
ulasan1 = "This movie was absolutely fantastic! I loved every minute of it."
ulasan2 = "What a terrible film. I wasted my money and time."
ulasan3 = "The acting was okay, but the plot was a bit confusing."
ulasan4 = "A masterpiece of cinema, truly breathtaking."
ulasan5 = "I wouldn't recommend this to anyone. Very disappointing."

# Print each example with its predicted sentiment. A single loop replaces five
# copy-pasted print statements; the printed text is unchanged.
for nomor, ulasan in enumerate((ulasan1, ulasan2, ulasan3, ulasan4, ulasan5), start=1):
    print(f"Ulasan {nomor}: '{ulasan}' -> Sentimen: {predict_sentiment(ulasan)}")