In [None]:
# ==============================================================================
# JURNAL PROYEK SAINS DATA: ANALISIS & PREDIKSI PEAK HOUR COFFEE SHOP
# TEAM: Akmal, Danang, Hafiz
#
# NOTEBOOK INI MENGGABUNGKAN PROGRES DARI EDA AWAL SAMPAI MODEL FINAL
# ==============================================================================

# --- [0] IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib  # Untuk menyimpan model ke Streamlit

# Libraries dari Modul Praktikum (Original + Tambahan)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB         # Modul 6 (Original)
from sklearn.ensemble import RandomForestClassifier # Modul 7 (Original)
from sklearn.svm import SVC                        # Tambahan Video 1
from sklearn.linear_model import LogisticRegression, LinearRegression # Tambahan Video 3
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score

# Library Imbalanced Data (Tambahan Video 2 - Revisi Dosen)
# Pastikan install: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from collections import Counter

# Library Time Series (Tambahan Video 4)
from statsmodels.tsa.seasonal import seasonal_decompose

# Global plotting defaults: seaborn "whitegrid" theme plus a 10x6 default
# figure size for figures that do not request their own size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Signals that every import above succeeded.
print("Libraries Ready!")


# ==============================================================================
# PART 1: FOUNDATION (ORIGINAL WORK)
# Covers: Data Understanding, Cleaning, EDA, & Feature Engineering
# ==============================================================================

# 1. LOAD DATA
print("\n--- [STEP 1] LOAD & CLEAN DATA ---")
# Read the raw transaction log from the working directory.
df = pd.read_csv('Coffe_sales.csv')
# Parse 'Date' into datetime so the `.dt` accessors and resampling used later work.
df['Date'] = pd.to_datetime(df['Date'])
# Remove rows containing any missing value, then exact duplicate rows.
df = df.dropna().drop_duplicates()

# Column dtypes and non-null counts after cleaning.
print("Info Data:")
df.info()

# 2. EDA (EXPLORATORY DATA ANALYSIS) - FOLLOWS coffeeNew.ipynb
print("\n--- [STEP 2] EDA (ORIGINAL) ---")

# (A) Distribution of transactions over the hour of the day:
# 16-bin histogram of 'hour_of_day' with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
ax = sns.histplot(
    data=df,
    x='hour_of_day',
    bins=16,
    kde=True,
    color='skyblue',
)
ax.set(
    title='Distribusi Jumlah Transaksi Berdasarkan Jam Operasional',
    xlabel='Jam Transaksi',
    ylabel='Jumlah Transaksi',
)
plt.show()

# (B) Top 10 products
# value_counts() is sorted by descending frequency, so the first ten entries
# are the ten most frequently sold coffee types.
plt.figure(figsize=(10, 6))
top_products = df['coffee_name'].value_counts().iloc[:10]
ax = sns.barplot(
    x=top_products.to_numpy(),
    y=top_products.index,
    palette='viridis',
)
ax.set_title('Top 10 Jenis Kopi dengan Transaksi Terbanyak')
ax.set_xlabel('Jumlah Transaksi')
plt.show()

# (C) Transactions per weekday
# Build the 'Weekday' column first (full English day names derived from 'Date')
# so the plot matches the original notebook.
df['Weekday'] = df['Date'].dt.day_name()
# Calendar order for the x axis; reused by the weekday/hour heatmap below.
day_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
plt.figure(figsize=(8, 5))
sns.countplot(x='Weekday', data=df, order=day_order, palette='viridis')
plt.title('Jumlah Transaksi Berdasarkan Hari')
plt.xticks(rotation=45)
plt.show()

# (D) Heatmap: weekday vs hour
# Count transactions per (weekday, hour) cell; combinations that never occur
# are filled with 0 by unstack().
df_heatmap = df.groupby(['Weekday', 'hour_of_day']).size().unstack(fill_value=0)
# Put the rows in calendar order. fill_value=0 keeps the table integer-typed
# even if a weekday has no transactions at all: a plain reindex would insert a
# NaN row, upcast everything to float, and make fmt='d' below raise ValueError.
df_heatmap = df_heatmap.reindex(day_order, fill_value=0)
plt.figure(figsize=(12, 6))
sns.heatmap(df_heatmap, cmap='YlGnBu', annot=True, fmt='d')
plt.title('Heatmap Kepadatan Transaksi (Hari vs Jam)')
plt.show()


# 3. DATA PREPARATION (TRANSFORMATION)
print("\n--- [STEP 3] DATA PREPARATION (AGREGASI & FEATURE ENGINEERING) ---")
# Collapse the transaction log to one row per (Date, hour):
#   order_count     - number of transactions in that hour
#   avg_ticket_size - mean 'money' per transaction (extra feature kept from
#                     the original notebook)
df_agg = (
    df.groupby(['Date', 'hour_of_day'])
    .agg(
        order_count=('coffee_name', 'count'),
        avg_ticket_size=('money', 'mean'),
    )
    .reset_index()
)

# Calendar features derived from the date.
date_parts = df_agg['Date'].dt
df_agg['day_name'] = date_parts.day_name()
df_agg['month'] = date_parts.month

# Target label: an hour is a peak hour (1) when its order count is strictly
# above the 75th percentile of all hourly order counts, otherwise 0.
threshold = df_agg['order_count'].quantile(0.75)
df_agg['is_peak_hour'] = df_agg['order_count'].gt(threshold).astype(int)

# Encode the weekday name as an integer. LabelEncoder numbers classes in
# sorted (alphabetical) order, so the codes do not follow calendar order.
le = LabelEncoder()
df_agg['day_code'] = le.fit_transform(df_agg['day_name'])

print(f"Threshold Peak Hour: > {threshold} order/jam")
print("Data Siap Modeling:")
display(df_agg.head())

# Feature correlation (as in the original notebook): pairwise correlation of
# the numeric features and the target, shown as an annotated heatmap.
corr_columns = ['hour_of_day', 'month', 'day_code', 'is_peak_hour', 'avg_ticket_size']
corr_matrix = df_agg[corr_columns].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Korelasi Antar Variabel')
plt.show()


# ==============================================================================
# PART 2: VIDEO 1 - BASELINE MODEL & SVM
# Compares Naive Bayes, Random Forest (initial), and SVM
# ==============================================================================
print("\n" + "=" * 50)
print("  VIDEO 1: PEMODELAN AWAL (NB, RF) & EKSPERIMEN SVM")
print("=" * 50)

# 1. Split data
# Note: 'avg_ticket_size' is deliberately left out so training matches what is
# available at deployment; keeping it would force the app user to enter it by
# hand. Only time-based features are used here.
feature_cols = ['hour_of_day', 'day_code', 'month']
X = df_agg[feature_cols]
y = df_agg['is_peak_hour']

# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Scaling (important for SVM)
# The scaler is fitted on the training part only and then applied to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model comparison
# Display name -> unfitted estimator; all are trained on the same scaled split.
models = {
    "Naive Bayes": GaussianNB(),
    "Random Forest (Base)": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM (Eksperimen Baru)": SVC(kernel='rbf', random_state=42),
}

for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n--- {name} ---")
    print(f"Akurasi: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

# NOTE(review): this conclusion is a fixed string, not derived from the scores
# printed above - re-check it whenever the data changes.
print("ANALISIS VIDEO 1: Random Forest paling stabil, tapi SVM juga kompetitif. Naive Bayes kurang cocok.")

In [None]:
# ==============================================================================
# PART 3: VIDEO 2 - IMBALANCED DATA (SMOTE AUGMENTATION)
# Lecturer's revision: "biased data, poor results". Solution: SMOTE
# ==============================================================================
print("\n\n" + "=" * 50)
print("  VIDEO 2: SOLUSI DATA BIAS (AUGMENTASI SMOTE)")
print("=" * 50)

# 1. Class distribution before resampling
# Class 0 is expected to outnumber class 1, since only hours above the
# 75th-percentile order count are labelled as peak.
print(f"Jumlah Data Train Sebelum SMOTE: {Counter(y_train)}")

# 2. Apply SMOTE to the training data only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"Jumlah Data Train Setelah SMOTE: {Counter(y_train_smote)}")

# 3. Retrain the Random Forest (main model) on the resampled training data
rf_final = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_smote, y_train_smote
)
y_pred_final = rf_final.predict(X_test_scaled)

# 4. Evaluation on the original (non-resampled) test split
print("\n--- HASIL RANDOM FOREST + SMOTE ---")
print(classification_report(y_test, y_pred_final))

# Feature importance (as in the original notebook)
# Pair each input column with the final forest's importance score and plot
# them from most to least important.
importances = rf_final.feature_importances_
feat_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feat_df = feat_df.sort_values(by='Importance', ascending=False)
sns.barplot(data=feat_df, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance (Model Final)')
plt.show()

In [None]:
# ==============================================================================
# PART 4: VIDEO 3 - LINEAR MODELS
# Comparison: do linear methods perform better?
# ==============================================================================
print("\n\n" + "=" * 50)
print("  VIDEO 3: LINEAR MODELS (REGRESSION & CLASSIFICATION)")
print("=" * 50)

# A. Logistic Regression (classification)
# Trained on the SMOTE-resampled training data, scored on the untouched test split.
log_model = LogisticRegression().fit(X_train_smote, y_train_smote)
y_pred_log = log_model.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, y_pred_log)
print("Akurasi Logistic Regression:", log_accuracy)

# B. Linear Regression (predicting the number of orders)
print("\n--- Linear Regression (Jam vs Jumlah Order) ---")
# Look at how the hourly order count relates to the hour of day alone.
X_reg = df_agg[['hour_of_day']]
y_reg = df_agg['order_count']

# Fit and predict on the same data: this is a visual illustration of the fit,
# not a held-out evaluation.
lin_reg = LinearRegression().fit(X_reg, y_reg)
y_reg_pred = lin_reg.predict(X_reg)

# Visualisation: raw points in gray, fitted straight line in red.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.scatter(X_reg, y_reg, color='gray', alpha=0.5, label='Data Asli')
ax.plot(X_reg, y_reg_pred, color='red', linewidth=3, label='Garis Regresi')
ax.set_title('Kenapa Linear Regression Kurang Cocok?')
ax.set_xlabel('Jam')
ax.set_ylabel('Order')
ax.legend()
plt.show()
print("Insight: Garis merah gagal menangkap pola naik-turun jam sibuk.")

In [None]:
# ==============================================================================
# PART 5: VIDEO 4 - TIME SERIES ANALYSIS
# ==============================================================================
print("\n\n" + "=" * 50)
print("  VIDEO 4: TIME SERIES ANALYSIS")
print("=" * 50)

# Resample to daily frequency: number of transactions per calendar day
# (days without any transaction show up as 0).
df_ts = df.set_index('Date').resample('D').size()

plt.figure(figsize=(12, 5))
df_ts.plot(title='Tren Penjualan Harian')
plt.show()

# Seasonal decomposition (weekly)
# seasonal_decompose needs at least two complete cycles, i.e. 2 * period
# observations, so exactly 14 daily points are already enough. The previous
# strict `> 14` check skipped that valid case.
weekly_period = 7
min_observations = 2 * weekly_period
if len(df_ts) >= min_observations:
    decomposition = seasonal_decompose(df_ts, model='additive', period=weekly_period)
    fig = decomposition.plot()
    fig.set_size_inches(10, 8)
    plt.show()
    print("Insight: Pola mingguan (Seasonal) terlihat jelas.")
else:
    # Report the skip instead of silently producing no decomposition.
    print(
        f"Dekomposisi musiman dilewati: butuh minimal {min_observations} hari data, "
        f"hanya tersedia {len(df_ts)}."
    )


# ==============================================================================
# FINAL: EXPORT MODEL (FOR STREAMLIT)
# ==============================================================================
print("\n--- SAVE MODEL ---")
# Everything needed to reproduce the preprocessing and prediction elsewhere:
# the SMOTE-trained forest, the fitted scaler, and the weekday encoder.
artifacts = {
    'model_final.pkl': rf_final,
    'scaler.pkl': scaler,
    'day_encoder.pkl': le,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
# Save the cleaned, aggregated data for visualisation in the app.
df_agg.to_csv('clean_data_agg.csv', index=False)
print("Model & Data Tersimpan!")