<a href="https://colab.research.google.com/github/apriandito/dkem/blob/main/Final_Nowcasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
from pytrends.request import TrendReq
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import f_regression
import numpy as np
from statsmodels.tsa.stattools import adfuller

# 1. Data Collection

def scrape_inflasi(max_pages=26, timeout=30):
    """Scrape the inflation table published on bi.go.id into a DataFrame.

    The first page is fetched with a GET. Every following page is requested by
    POSTing the pager's ``__EVENTTARGET`` together with the hidden ASP.NET
    state fields (``__VIEWSTATE``, ``__VIEWSTATEGENERATOR``,
    ``__EVENTVALIDATION``) read from the previously parsed page.

    Args:
        max_pages: Maximum number of table pages to request. Defaults to 26,
            the number of pages the loop requested when it was hard-coded.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        pandas.DataFrame with the string columns 'Tanggal' and 'Data Inflasi',
        one row per scraped table row, in the order the pages were visited.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an HTTP
            error status.
        RuntimeError: If no rows could be scraped at all, or if a hidden form
            field needed to request the next page is missing.
    """
    def scrape_page(soup):
        """Extract the (date, inflation) rows of one parsed page."""
        table = soup.find('table', {'class': 'table table-striped table-no-bordered table-lg'})
        if table is None:
            # Layout changed or we paged past the end: report "no rows".
            return []
        data = []
        # The first <tr> is skipped, as in the table header row.
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            # Need both the date and the value cell; ignore malformed rows.
            if len(cols) >= 2:
                date = cols[0].text.strip()
                inflation = cols[1].text.strip()
                data.append({'Tanggal': date, 'Data Inflasi': inflation})
        return data

    def hidden_field(soup, name):
        """Return the value of a hidden <input>, failing with a clear message."""
        field = soup.find('input', {'name': name})
        value = field.get('value') if field is not None else None
        if value is None:
            raise RuntimeError(
                f"Hidden form field '{name}' not found; cannot request the next page."
            )
        return value

    url = "https://www.bi.go.id/id/statistik/indikator/data-inflasi.aspx"
    all_data = []
    soup = None

    # The context manager closes the session's connections when we are done.
    with requests.Session() as session:
        for page in range(1, max_pages + 1):
            print(f"Scraping Inflation page {page}...")
            if page == 1:
                response = session.get(url, timeout=timeout)
            else:
                # Post back the pager event with the state of the page we just
                # parsed. NOTE(review): the event target presumably is the
                # pager's "next page" control -- confirm against the site.
                payload = {
                    "__EVENTTARGET": "ctl00$ctl54$g_1f0a867d_90e9_4a92_b1c8_de34738fc4f1$ctl00$DataPagerDataInflasi$ctl02$ctl00",
                    "__EVENTARGUMENT": "",
                    "__LASTFOCUS": "",
                    "__VIEWSTATE": hidden_field(soup, '__VIEWSTATE'),
                    "__VIEWSTATEGENERATOR": hidden_field(soup, '__VIEWSTATEGENERATOR'),
                    "__EVENTVALIDATION": hidden_field(soup, '__EVENTVALIDATION'),
                }
                response = session.post(url, data=payload, timeout=timeout)

            # Fail loudly on 4xx/5xx instead of parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            page_data = scrape_page(soup)
            if not page_data:
                print(f"No inflation rows found on page {page}; stopping.")
                break
            all_data.extend(page_data)

    if not all_data:
        raise RuntimeError(
            "No inflation data could be scraped; the page layout may have changed."
        )

    return pd.DataFrame(all_data)

def fetch_google_trends(keywords):
    """Collect Google Trends interest-over-time series, one column per keyword.

    Each keyword is queried on its own (timeframe 'today 5-y', geo 'ID') and
    the resulting series is stored in a column named after the keyword. The
    function sleeps two seconds after every request.

    Args:
        keywords: Iterable of search terms to query.

    Returns:
        pandas.DataFrame whose columns are the keywords, indexed by whatever
        index pytrends returns for ``interest_over_time()``.
    """
    client = TrendReq(hl='id-ID', tz=420)
    trends = pd.DataFrame()

    for term in keywords:
        print(f"Fetching Google Trends data for '{term}'...")
        client.build_payload([term], cat=0, timeframe='today 5-y', geo='ID', gprop='')
        # Keep only this keyword's column from the response.
        trends[term] = client.interest_over_time()[term]
        # Pause between requests (presumably to stay under rate limits).
        time.sleep(2)

    return trends

# 2. Data Preprocessing

def improved_preprocess_data(df_inflasi, df_trends, keywords):
    """Merge inflation and Google Trends data and engineer model features.

    Steps:
      1. Parse the Indonesian month names in 'Tanggal' and convert the
         percentage strings in 'Data Inflasi' to fractions (value / 100).
      2. Average the trends data per calendar month, labelled by month start.
      3. Left-join the trends onto the inflation dates, forward-fill gaps and
         drop rows that are still incomplete.
      4. Add a ``<col>_diff`` column for every predictor whose ADF test does
         not reject a unit root (p-value > 0.05).
      5. Add 3- and 6-period rolling means for every keyword, plus 'month'
         and 'quarter'.
      6. Standardise every column except 'Tanggal' and 'Data Inflasi', then
         drop rows containing NaN.

    The input frames are not modified, so the function can be called again on
    the same raw data (e.g. when a notebook cell is re-run).

    Args:
        df_inflasi: DataFrame with string columns 'Tanggal' (e.g. month name
            and year) and 'Data Inflasi' (percentage text).
        df_trends: DataFrame indexed by date with one column per keyword.
        keywords: Keyword column names to build rolling-mean features for.

    Returns:
        pandas.DataFrame with 'Tanggal', 'Data Inflasi' and the standardised
        feature columns.
    """
    target_col = 'Data Inflasi'

    bulan_dict = {
        'Januari': 'January', 'Februari': 'February', 'Maret': 'March', 'April': 'April',
        'Mei': 'May', 'Juni': 'June', 'Juli': 'July', 'Agustus': 'August',
        'September': 'September', 'Oktober': 'October', 'November': 'November', 'Desember': 'December'
    }

    def convert_date(date_str):
        """Replace an Indonesian month name with its English equivalent."""
        for indo, eng in bulan_dict.items():
            if indo in date_str:
                return date_str.replace(indo, eng)
        return date_str

    # Work on copies: the original code overwrote the caller's columns/index,
    # so a second call on the same frames failed (the percentage column was
    # already float and no longer had a .str accessor).
    df_inflasi = df_inflasi.copy()
    df_trends = df_trends.copy()

    df_inflasi['Tanggal'] = df_inflasi['Tanggal'].apply(convert_date)
    df_inflasi['Tanggal'] = pd.to_datetime(df_inflasi['Tanggal'], format='%B %Y')
    df_inflasi[target_col] = df_inflasi[target_col].str.rstrip('%').astype('float') / 100.0
    df_inflasi = df_inflasi.sort_values('Tanggal').set_index('Tanggal')

    df_trends.index = pd.to_datetime(df_trends.index)
    # 'MS' labels each monthly bin with the first day of the month, which is
    # what the parsed inflation dates use. This replaces resample('M') plus a
    # manual shift back to month start ('M' is deprecated in recent pandas).
    df_trend_monthly = df_trends.resample('MS').mean()

    df_combined = df_inflasi.join(df_trend_monthly)
    df_combined = df_combined.ffill().dropna().reset_index()

    # Check for stationarity
    def check_stationarity(timeseries):
        """Return True when the ADF test p-value is <= 0.05."""
        result = adfuller(timeseries, autolag='AIC')
        return result[1] <= 0.05  # p-value <= 0.05 indicates stationarity

    # Make predictor series stationary if needed. The target is skipped on
    # purpose: a 'Data Inflasi_diff' column would be treated as a feature
    # downstream and leak the current target value into the predictors.
    for col in list(df_combined.columns):
        if col in ('Tanggal', target_col):
            continue
        if not check_stationarity(df_combined[col]):
            df_combined[f'{col}_diff'] = df_combined[col].diff()

    # Feature engineering: short and medium rolling means of search interest
    for keyword in keywords:
        df_combined[f'{keyword}_MA3'] = df_combined[keyword].rolling(window=3).mean()
        df_combined[f'{keyword}_MA6'] = df_combined[keyword].rolling(window=6).mean()

    # Add time-based features
    df_combined['month'] = df_combined['Tanggal'].dt.month
    df_combined['quarter'] = df_combined['Tanggal'].dt.quarter

    # Scale features (everything except the date and the target)
    scaler = StandardScaler()
    feature_columns = [col for col in df_combined.columns if col != 'Tanggal' and col != target_col]
    df_combined[feature_columns] = scaler.fit_transform(df_combined[feature_columns])

    # Drop rows with NaN values after feature engineering (the leading rows of
    # the diff and rolling-mean columns)
    df_combined = df_combined.dropna()

    return df_combined

# 3. Modeling

def train_models(X, y, n_splits=3):
    """Cross-validate several regressors on a time series and return them fitted.

    Every model is evaluated with ``TimeSeriesSplit`` (training folds always
    precede their test fold). After cross-validation each model is refitted on
    the complete sample, so the estimator returned under ``'model'`` has seen
    all available observations. Previously it was left in the state of the
    last fold, i.e. trained without the most recent data.

    Args:
        X: pandas.DataFrame of features, ordered in time.
        y: pandas.Series with the target, aligned with ``X``.
        n_splits: Number of time-series splits.

    Returns:
        dict mapping model name to a dict with:
          - 'RMSE', 'R2', 'MAPE': lists with one score per fold,
          - 'avg_RMSE', 'avg_R2', 'avg_MAPE': the fold averages,
          - 'model': the estimator fitted on all of ``X`` and ``y``.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # NOTE(review): alpha=1.0 and SVR's default epsilon are large when y is on
    # a small scale (e.g. fractions); those models may then predict a
    # near-constant value -- consider tuning.
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
        'Support Vector Regression': SVR(kernel='rbf')
    }

    results = {name: {'RMSE': [], 'R2': [], 'MAPE': []} for name in models}

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for name, model in models.items():
            # fit() discards any previous fit, so reusing the instance across
            # folds does not carry state over.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            results[name]['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
            results[name]['R2'].append(r2_score(y_test, y_pred))
            results[name]['MAPE'].append(mean_absolute_percentage_error(y_test, y_pred))

    for name, model in models.items():
        # Calculate average metrics across all folds
        results[name]['avg_RMSE'] = np.mean(results[name]['RMSE'])
        results[name]['avg_R2'] = np.mean(results[name]['R2'])
        results[name]['avg_MAPE'] = np.mean(results[name]['MAPE'])

        # Final fit on the whole sample; the CV scores above are unaffected.
        model.fit(X, y)
        results[name]['model'] = model

    return results

# 4. Evaluation

def print_evaluation_results(results):
    """Print averaged and per-fold RMSE, R2 and MAPE for every model.

    Args:
        results: dict mapping a model name to a dict holding the keys
            'avg_RMSE', 'avg_R2', 'avg_MAPE' (numbers) and 'RMSE', 'R2',
            'MAPE' (iterables of per-fold numbers).
    """
    metric_names = ('RMSE', 'R2', 'MAPE')

    def join_scores(scores):
        # Four-decimal rendering, comma separated, one entry per fold.
        return ', '.join(f'{score:.4f}' for score in scores)

    for model_name, metrics in results.items():
        print(f"\n{model_name}:")
        for metric in metric_names:
            average = metrics['avg_' + metric]
            print(f"  Average {metric}: {average:.4f}")
        for metric in metric_names:
            print(f"  {metric} per fold: {join_scores(metrics[metric])}")

# 5. Prediction

def predict_inflation(best_model, X_nowcast):
    """Return the first prediction ``best_model`` makes for ``X_nowcast``.

    Args:
        best_model: Object exposing a ``predict`` method.
        X_nowcast: Input passed unchanged to ``best_model.predict``.

    Returns:
        The element at index 0 of the prediction result.
    """
    return best_model.predict(X_nowcast)[0]

In [None]:
# Main execution

# Search terms whose Google Trends interest serves as candidate predictors
keywords = [
    'inflasi',
    'harga naik',
    'kenaikan harga',
    'ekonomi',
    'krisis',
    'minyak mahal',
    'beras langka',
    'hidup susah',
]

# 1. Data Collection: inflation figures and search-interest series
df_inflasi = scrape_inflasi()
df_trends = fetch_google_trends(keywords)

# 2. Data Preprocessing: merge both sources and build the feature table
df_combined = improved_preprocess_data(df_inflasi, df_trends, keywords)

# Feature selection
def select_features(X, y):
    """Rank the columns of ``X`` by their univariate F-score against ``y``.

    Args:
        X: pandas.DataFrame of candidate features.
        y: Target values aligned with ``X``.

    Returns:
        pandas.Series of F-scores indexed by column name, sorted from the
        highest to the lowest score.
    """
    # f_regression returns (F-scores, p-values); only the scores are used.
    scores = f_regression(X, y)[0]
    ranking = pd.Series(scores, index=X.columns)
    return ranking.sort_values(ascending=False)

# Candidate predictors: every column except the date and the target. Columns
# derived from the target (such as a 'Data Inflasi_diff' column) are excluded
# as well, because they contain the value being predicted.
feature_columns = [
    col for col in df_combined.columns
    if col != 'Tanggal' and not col.startswith('Data Inflasi')
]
X = df_combined[feature_columns]
y = df_combined['Data Inflasi']

# NOTE(review): the ranking below uses the full sample, so the cross-validated
# scores reported later are somewhat optimistic.
feature_importance = select_features(X, y)
print("\nFeature Importance:")
print(feature_importance)

# Select top 5 features
top_features = feature_importance.nlargest(5).index.tolist()
X = df_combined[top_features]

# 3. Modeling
model_results = train_models(X, y)

# 4. Evaluation
print_evaluation_results(model_results)

# 5. Prediction: pick the model with the lowest average RMSE across folds
best_model_name = min(model_results, key=lambda x: model_results[x]['avg_RMSE'])
best_model = model_results[best_model_name]['model']

# Use the last row of X as a one-row DataFrame. This keeps the float dtype and
# the feature names the model was fitted with; slicing the mixed-dtype row of
# df_combined produced an object array without column names.
# NOTE(review): these are the features of the latest month in df_combined, so
# the value is a nowcast for that month -- confirm the "next month" wording.
latest_data = X.iloc[[-1]]
predicted_inflation = predict_inflation(best_model, latest_data)
print(f"\nPredicted Inflation for next month using {best_model_name}: {predicted_inflation:.4f}")

# Visualize actual vs predicted values
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_combined['Tanggal'], df_combined['Data Inflasi'], label='Actual Inflation')
ax.plot(df_combined['Tanggal'], best_model.predict(X), label='Predicted Inflation')
ax.set_title('Actual vs Predicted Inflation')
ax.set_xlabel('Date')
ax.set_ylabel('Inflation')
ax.legend()
plt.show()