In [None]:
# ============================================================
# 1. Import Library
# ============================================================
import re
from collections import Counter
from io import StringIO

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# ============================================================
# 2. Load Data (aman untuk file besar)
# ============================================================
# Take a random sample of 50k reviews from review2.json (the file is 4GB+,
# so it is streamed line by line instead of being loaded at once).

# Fetch the NLTK resources this notebook relies on before they are first used;
# on a fresh environment `stopwords.words(...)` below would otherwise fail.
# NOTE(review): resource names differ between NLTK releases (e.g. "punkt" vs
# "punkt_tab"), so both spellings are requested — confirm against the
# installed version.
for nltk_resource in (
    "stopwords",
    "wordnet",
    "omw-1.4",
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(nltk_resource, quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
DATA_PATH = "review2.json"
SAMPLE_SIZE = 50000
np.random.seed(42)


# Reservoir sampling: the first SAMPLE_SIZE lines fill the reservoir, then
# line i replaces a random slot with probability SAMPLE_SIZE / (i + 1).
sample = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        # Store lines without their terminator so that a final line lacking
        # "\n" cannot be glued onto the next record when the sample is joined.
        line = line.rstrip("\n")
        if i < SAMPLE_SIZE:
            sample.append(line)
        else:
            j = np.random.randint(0, i + 1)
            if j < SAMPLE_SIZE:
                sample[j] = line

# read_json expects a path or file-like object; passing the JSON text itself
# is deprecated, so the sampled lines are wrapped in an in-memory buffer.
df = pd.read_json(StringIO("\n".join(sample)), lines=True)

print("✅ Cek Missing Value:", df.isnull().sum())
print("✅ Data loaded. Shape:", df.shape)
print(df.head())

In [None]:
# ============================================================
# 3. Kolom Penting
# ============================================================
# Keep only the review text and the star rating. The explicit copy makes `df`
# an independent frame, so the columns added to it in later cells do not
# trigger pandas' SettingWithCopyWarning.
df = df[["text", "stars"]].copy()

# ============================================================
# 4. Label Sentimen
# ============================================================
def to_sentiment(stars):
    """Map a star rating to a sentiment label.

    Args:
        stars: Numeric rating.

    Returns:
        "positive" for ratings of 4 or more, "negative" for ratings of 2 or
        less, and "neutral" for everything else.
    """
    if stars >= 4:
        return "positive"
    if stars <= 2:
        return "negative"
    return "neutral"

# Derive the sentiment label of every review from its star rating, then show
# how many reviews fall into each class.
df["sentiment"] = df["stars"].map(to_sentiment)
sentiment_distribution = df["sentiment"].value_counts()
print(sentiment_distribution)

In [None]:
# ============================================================
# 5. EDA (Exploratory Data Analysis)
# ============================================================
# Star-rating distribution
df["stars"].value_counts().sort_index().plot(kind="bar")
plt.title("Distribusi Rating Bintang")
plt.xlabel("Stars")
plt.ylabel("Jumlah Review")
plt.show()

# Sentiment distribution.
# value_counts() orders the bars by frequency, so each bar's colour is looked
# up by its label instead of being passed positionally (a positional list
# paints the wrong class whenever the frequency order changes).
sentiment_palette = {"positive": "green", "negative": "red", "neutral": "gray"}
sentiment_totals = df["sentiment"].value_counts()
sentiment_totals.plot(
    kind="bar",
    color=[sentiment_palette[label] for label in sentiment_totals.index],
)
plt.title("Distribusi Sentimen")
plt.xlabel("Sentimen")
plt.ylabel("Jumlah Review")
plt.show()

# Review length in whitespace-separated words.
# Missing texts count as zero words instead of raising on `.split()`.
df["review_length"] = df["text"].fillna("").str.split().str.len()
df["review_length"].hist(bins=50)
plt.title("Distribusi Panjang Review")
plt.xlabel("Jumlah kata")
plt.ylabel("Frekuensi")
plt.show()


In [None]:
# ============================================================
# 6. Preprocessing
# ============================================================
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def clean_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps: strip non-letters, lowercase, tokenize, POS-tag, drop stopwords and
    tokens of two characters or fewer, then lemmatize each remaining token
    with its POS tag.

    Args:
        text: Raw review text. ``None`` and NaN are treated as empty text.

    Returns:
        The cleaned text as a single string ("" when nothing survives).
    """
    # Missing values would otherwise be stringified into the token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Delete apostrophes so contractions stay a single token ("don't" -> "dont").
    text = re.sub(r"['’]", "", text)
    # Replace every other non-letter with a space rather than deleting it, so
    # words separated only by punctuation ("great!Service") are not fused.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = nltk.word_tokenize(text.lower())
    # Tag the full token sequence first: the tagger uses neighbouring words,
    # so removing stopwords before tagging degrades the tags and the lemmas.
    tagged = nltk.pos_tag(tokens)
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged
        if token not in stop_words and len(token) > 2
    ]
    return " ".join(lemmas)


def top_words(sentiment_label, n=10):
    """Return the ``n`` most frequent tokens among reviews of one sentiment.

    Reads the notebook-level ``df`` and uses its "sentiment" and "clean_text"
    columns.

    Args:
        sentiment_label: Value of the "sentiment" column to filter on.
        n: Number of (token, count) pairs to return.

    Returns:
        A list of (token, count) tuples, most frequent first.
    """
    matching_reviews = df.loc[df["sentiment"] == sentiment_label, "clean_text"]
    all_tokens = " ".join(matching_reviews).split()
    token_counts = Counter(all_tokens)
    return token_counts.most_common(n)

# Clean every review once, then report the most frequent tokens of the
# positive and negative classes.
df["clean_text"] = df["text"].map(clean_text)
positive_top = top_words("positive")
print("Top words POSITIVE:", positive_top)
negative_top = top_words("negative")
print("Top words NEGATIVE:", negative_top)


In [None]:
# ============================================================
# 7. Split Data
# ============================================================
# Features are the cleaned review texts, targets the sentiment labels.
X, y = df["clean_text"], df["sentiment"]

# Hold out 20% of the reviews, keeping the class proportions equal in both
# parts (stratified split) and the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ============================================================
# 8. Vectorization (TF-IDF)
# ============================================================
# Unigrams and bigrams, capped at 20,000 features. The vocabulary is learned
# from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ============================================================
# 9. Modeling (Logistic Regression)
# ============================================================
# class_weight="balanced" reweights the classes by their inverse frequency.
model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
)
model.fit(X_train_tfidf, y_train)

In [None]:
# ============================================================
# 10. Evaluasi
# ============================================================
# Predict the held-out reviews and report the overall and per-class metrics.
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
print("✅ Akurasi:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# One fixed class order drives both the matrix rows/columns and the tick labels.
class_order = ["positive", "neutral", "negative"]
cm = confusion_matrix(y_test, y_pred, labels=class_order)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_order,
    yticklabels=class_order,
)
plt.title("Confusion Matrix")
plt.show()

# ============================================================
# 11. Interpretasi (Top Features per Sentiment)
# ============================================================
# For every class, print the ten features with the largest coefficients
# (argsort is ascending, so the strongest feature is printed last).
feature_names = np.array(vectorizer.get_feature_names_out())
for class_position, label in enumerate(model.classes_):
    class_weights = model.coef_[class_position]
    strongest = np.argsort(class_weights)[-10:]
    print(f"Top kata untuk kelas {label}:", feature_names[strongest])


In [None]:
# ============================================================
# 12. Simpan Model
# ============================================================
# Persist the fitted classifier and its TF-IDF vectorizer side by side; both
# are needed to score new text later.
for artifact, target_file in (
    (model, "yelp_sentiment_model.joblib"),
    (vectorizer, "yelp_tfidf_vectorizer.joblib"),
):
    joblib.dump(artifact, target_file)

# ============================================================
# 13. Chatbot Responder
# ============================================================
def chatbot_response(text, model=model, vectorizer=vectorizer, cleaner=clean_text):
    """Return a canned reply matching the predicted sentiment of ``text``.

    Args:
        text: Raw review text.
        model: Fitted classifier used for the prediction; defaults to the
            model trained in this notebook.
        vectorizer: Fitted vectorizer that turns cleaned text into features.
        cleaner: Callable applied to ``text`` before vectorizing. Like
            ``model`` and ``vectorizer`` it is bound when this function is
            defined, so a later cell that redefines ``clean_text`` cannot
            silently switch the chatbot to a preprocessing that differs from
            the one the vectorizer was fitted on.

    Returns:
        The reply string for a "positive", "negative" or any other predicted
        label.
    """
    text_clean = cleaner(text)
    text_vec = vectorizer.transform([text_clean])
    sentiment = model.predict(text_vec)[0]

    if sentiment == "positive":
        return "Terima kasih atas ulasannya! Senang mendengar pengalaman positif Anda 😊"
    elif sentiment == "negative":
        return "Mohon maaf atas pengalaman yang mengecewakan. Kami akan menindaklanjuti dan berusaha memperbaiki."
    else:
        return "Terima kasih atas masukan Anda. Kami akan pertimbangkan untuk meningkatkan layanan kami."

# ============================================================
# 14. Test Chatbot
# ============================================================
# Smoke-test the responder with one example each of a positive, a negative
# and a lukewarm review.
example_reviews = (
    "The food was amazing and the service was great!",
    "The food was terrible and the service was slow.",
    "The restaurant was okay, nothing special.",
)
for example in example_reviews:
    print(chatbot_response(example))


In [None]:
# ============================================================
# 15.Pie Chart Distribusi Sentimen
# ============================================================
sentiment_counts = df["sentiment"].value_counts()
labels = sentiment_counts.index.tolist()
sizes = sentiment_counts.values.tolist()
total_reviews = sum(sizes)

# Pull the "positive" wedge slightly out of the pie
explode = [0.1 if label == "positive" else 0 for label in labels]

# value_counts() orders the labels by frequency, so wedge colours are looked
# up per label; a positional colour list paints the wrong class whenever
# "neutral" is not the second most frequent label.
sentiment_palette = {"positive": "green", "neutral": "gray", "negative": "red"}
wedge_colors = [sentiment_palette[label] for label in labels]

plt.figure(figsize=(7,7))
plt.pie(
    sizes,
    labels=labels,
    # round() instead of int(): the percentage is a float, and truncating
    # e.g. 1234.9999 would display one review too few.
    autopct=lambda p: f'{p:.1f}%\n({int(round(p*total_reviews/100))})',
    startangle=90,
    colors=wedge_colors,
    explode=explode,
    shadow=True
)
plt.title("Distribusi Sentimen Yelp Reviews")
plt.tight_layout()
plt.show()


# Pie chart of the star ratings
stars_counts = df["stars"].value_counts().sort_index()
labels = [f"{star} Stars" for star in stars_counts.index]
sizes = stars_counts.values
total_ratings = int(sizes.sum())

# Pull out the wedge of the most frequent rating
max_index = np.argmax(sizes)
explode = [0.1 if i == max_index else 0 for i in range(len(sizes))]

# One shade of the PuBuGn colormap per rating, from light to dark
colors = plt.cm.PuBuGn(np.linspace(0.3, 0.9, len(labels)))

plt.figure(figsize=(7,7))
plt.pie(
    sizes,
    labels=labels,
    # round() instead of int(): the percentage is a float, and truncating
    # e.g. 1234.9999 would display one review too few.
    autopct=lambda p: f'{p:.1f}%\n({int(round(p*total_ratings/100))})',
    startangle=90,
    colors=colors,
    explode=explode,
    shadow=True
)
plt.title("Distribusi Rating Bintang Yelp Reviews")
plt.axis("equal")
plt.tight_layout()
plt.show()

In [None]:
# Show the first ten rows of the working frame as the cell's output.
df.head(10)

In [None]:
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus (only needed once per environment)
nltk.download("stopwords")

# Extra domain stopwords: words so common in Yelp reviews that they would
# dominate every frequency table
extra_stopwords = {"food", "place", "restaurant"}
# The extended set gets its own name. Rebinding the notebook-wide
# `stop_words` here would silently change what the lemmatizing cleaner from
# the preprocessing cell filters out, because that function reads
# `stop_words` at call time.
wordcloud_stop_words = set(stopwords.words("english")) | extra_stopwords

# Lightweight cleaner for the frequency tables and word clouds
def clean_text(text):
    """Lowercase ``text``, strip non-letters and drop stopwords/short tokens.

    Unlike the cleaner in the preprocessing cell this one does not tokenize
    with NLTK or lemmatize; it splits on whitespace only.

    Args:
        text: Raw review text (converted with ``str``).

    Returns:
        The remaining words joined by single spaces.
    """
    # Delete apostrophes so contractions stay one token ("don't" -> "dont").
    text = re.sub(r"['’]", "", str(text))
    # Replace other non-letters with a space rather than deleting them, so
    # words separated only by punctuation are not fused together.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    words = text.lower().split()
    # Drop stopwords and words of two characters or fewer
    words = [w for w in words if w not in wordcloud_stop_words and len(w) > 2]
    return " ".join(words)

# Re-clean every review with the cleaner defined just above.
# NOTE(review): this overwrites the "clean_text" column produced by the
# preprocessing cell.
df["clean_text"] = df["text"].map(clean_text)

# Separate negative (2 stars or fewer) from positive (4 stars or more) reviews
is_negative = df["stars"] <= 2
is_positive = df["stars"] >= 4
neg_text = " ".join(df.loc[is_negative, "clean_text"])
pos_text = " ".join(df.loc[is_positive, "clean_text"])

# Twenty most frequent words per group
neg_words = Counter(neg_text.split()).most_common(20)
pos_words = Counter(pos_text.split()).most_common(20)

print("Kata terbanyak di review negatif:")
print(neg_words)

print("\nKata terbanyak di review positif:")
print(pos_words)



from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one word cloud per review group with identical canvas settings, then
# draw them one after the other (negative first, positive second).
cloud_settings = dict(width=800, height=400, background_color="white")
wc_neg = WordCloud(**cloud_settings).generate(neg_text)
wc_pos = WordCloud(**cloud_settings).generate(pos_text)

for cloud, cloud_title in (
    (wc_neg, "WordCloud - Review Negatif"),
    (wc_pos, "WordCloud - Review Positif"),
):
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(cloud_title)
    plt.show()



In [None]:
# ============================================================
# Import Library
# ============================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

print(df.head())

# 3. TF-IDF Vectorizer
# ============================================================
# Same settings as the main pipeline: unigrams + bigrams, 20,000 features,
# vocabulary learned from the training split only.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Model Candidates
# ============================================================
# NOTE(review): the Logistic Regression candidate has no
# class_weight="balanced", unlike the model trained in section 9 — confirm
# this difference is intended.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Naive Bayes": MultinomialNB(),
    # Seeded so the comparison does not depend on how much of the global
    # NumPy random stream earlier cells happened to consume.
    "SVM": LinearSVC(random_state=42)
}

# 5. Training & Evaluation
# ============================================================
# Maps each candidate's display name to its test accuracy.
results = {}

# The loop variables are deliberately not named `model` / `y_pred`: reusing
# those names would replace the trained main model and its predictions, so
# re-running the evaluation or save cells afterwards would act on whichever
# candidate was fitted last.
for name, candidate in models.items():
    print(f"\n===== {name} =====")
    candidate.fit(X_train_tfidf, y_train)
    candidate_pred = candidate.predict(X_test_tfidf)

    acc = accuracy_score(y_test, candidate_pred)
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, candidate_pred))

    results[name] = acc
    
# 6. Accuracy summary (visualization)
# ============================================================
# One bar per candidate model, each annotated with its accuracy.
model_names = list(results.keys())
accuracies = list(results.values())

plt.figure(figsize=(6,4))
plt.bar(model_names, accuracies, color="skyblue")
plt.ylabel("Accuracy")
plt.title("Perbandingan Model Sentiment Analysis (Yelp Reviews)")
plt.ylim(0,1)
for bar_position, accuracy in enumerate(accuracies):
    # Place the label just above the top of the bar
    plt.text(bar_position, accuracy + 0.01, f"{accuracy:.3f}", ha="center")
plt.show()
