In [1]:
import pandas as pd
import json
import numpy as np

## Load data

In [2]:
# Input locations, named once so that the error messages below always refer
# to the file that was actually requested.
PRICE_CSV_PATH = 'data/ohlcv/BUMI_daily_oct_nov_2025_ohlcv.csv'
NEWS_JSON_PATH = 'data/news/filtered_BUMI_oct_nov_2025.json'

# Load price data (daily OHLCV).
# Only the read itself sits inside the try block, so an unrelated failure in
# the clean-up steps is not mistaken for a missing file.
try:
    df_price = pd.read_csv(PRICE_CSV_PATH)
except FileNotFoundError:
    print(f"Error: File '{PRICE_CSV_PATH}' tidak ditemukan.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception stops "Run All" with a traceback.
    raise

# Lower-case the headers so later cells can rely on 'date' / 'close', then
# order the rows chronologically with a fresh 0..n-1 index.
df_price.columns = df_price.columns.str.lower()
df_price['date'] = pd.to_datetime(df_price['date'])
df_price = df_price.sort_values('date').reset_index(drop=True)
print(f"Data Saham Loaded: {len(df_price)} rows")

# Load news data.
# The encoding is stated explicitly: without it open() falls back to the
# platform locale, which garbles non-ASCII characters in headlines on systems
# whose default is not UTF-8.
# NOTE(review): assumes the JSON file is stored as UTF-8 - confirm.
try:
    with open(NEWS_JSON_PATH, 'r', encoding='utf-8') as f:
        news_data = json.load(f)
except FileNotFoundError:
    print(f"Error: File '{NEWS_JSON_PATH}' tidak ditemukan.")
    raise

df_news = pd.DataFrame(news_data)
df_news['date'] = pd.to_datetime(df_news['date'])
print(f"Data Berita Loaded: {len(df_news)} articles")

Data Saham Loaded: 43 rows
Data Berita Loaded: 10 articles


## Feature Engineering 

In [3]:
# Daily return in percent: change of 'close' relative to the previous row
df_price['daily_return'] = df_price['close'].pct_change().mul(100)

# Event-study logic: measure the price move that follows each news item
def calculate_news_impact(news_df, price_df, look_ahead_days=3):
    """Score each news article by the upward price move that follows it.

    For every article, the price row with the earliest date on or after the
    article date is taken as the market reaction row (so an article dated on a
    day without a price row maps to the next available one). The
    ``daily_return`` values from that row through ``look_ahead_days`` rows
    later are summed; a negative sum is floored at 0 because only upward
    impact is scored.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must provide ``date`` and ``title`` columns.
    price_df : pandas.DataFrame
        Must provide ``date`` and ``daily_return`` columns. Rows are used in
        their existing order (expected to be ascending by date); the index
        labels are ignored.
    look_ahead_days : int, default 3
        Number of rows after the reaction row to include in the window. The
        window is clipped at the last available price row.

    Returns
    -------
    pandas.DataFrame
        One row per article that could be matched to a price row, with the
        columns ``date_news``, ``date_market_reaction``, ``title``,
        ``impact_return_pct`` and ``lag_days``. ``lag_days`` is the offset
        (in rows) of the highest daily return inside the window, or NaN when
        the window holds no valid return. The columns are present even when
        no article matched.
    """
    output_columns = [
        'date_news',
        'date_market_reaction',
        'title',
        'impact_return_pct',
        'lag_days',
    ]
    results = []

    # Work on a 0..n-1 index so that labels and positions coincide. The window
    # arithmetic below (start + look_ahead, clipped to n - 1) is positional and
    # would select the wrong rows - or none - on a frame with any other index.
    prices = price_df[['date', 'daily_return']].reset_index(drop=True)
    last_pos = len(prices) - 1

    for _, row in news_df.iterrows():
        news_date = row['date']

        # Date alignment: earliest price row dated on or after the article
        candidate_dates = prices.loc[prices['date'] >= news_date, 'date']
        if candidate_dates.empty:
            # Article is newer than every price row: nothing to measure
            continue

        start_pos = candidate_dates.idxmin()
        market_date = candidate_dates.loc[start_pos]

        # Impact window: reaction row through look_ahead_days rows later
        end_pos = min(start_pos + look_ahead_days, last_pos)
        window_returns = prices.loc[start_pos:end_pos, 'daily_return']
        if window_returns.empty:
            # Only reachable with a negative look_ahead_days
            continue

        # Metric 1: summed return over the window, floored at 0 because the
        # score only captures upward moves. NaN returns are skipped by sum().
        cum_return = window_returns.sum()
        impact_score = max(0, cum_return)

        # Metric 2: offset of the row with the highest daily return. Guarded
        # because idxmax() on an all-NaN window is an error on newer pandas.
        if window_returns.notna().any():
            days_lag = window_returns.idxmax() - start_pos
        else:
            days_lag = float('nan')

        results.append({
            'date_news': news_date,
            'date_market_reaction': market_date,  # row where the market reacts
            'title': row['title'],
            'impact_return_pct': impact_score,  # size of the upward move
            'lag_days': days_lag
        })

    # Passing the column list keeps the schema stable when results is empty,
    # so callers can still select 'impact_return_pct' without a KeyError.
    return pd.DataFrame(results, columns=output_columns)

In [4]:
# Score every article over a window of the reaction row plus 3 following rows
df_impact = calculate_news_impact(df_news, df_price, look_ahead_days=3)

# Normalisation: express each article's impact as a percentage share of the
# summed impact of all articles; fall back to 0 when nothing moved upward.
total_impact = df_impact['impact_return_pct'].sum()

df_impact['probability_score'] = (
    (df_impact['impact_return_pct'] / total_impact) * 100
    if total_impact > 0
    else 0
)



In [5]:
# Columns shown in the report, in display order
report_columns = ['date_news', 'title', 'lag_days', 'impact_return_pct', 'probability_score']

# Chronological report with the numeric columns rounded for readability
final_output = (
    df_impact[report_columns]
    .sort_values('date_news')
    .assign(
        impact_return_pct=lambda frame: frame['impact_return_pct'].round(2),
        probability_score=lambda frame: frame['probability_score'].round(1),
    )
)

# Plain-text table framed by a separator line
banner = "=" * 80
print("\n" + banner)
print("HASIL ANALISIS PROBABILITAS DAMPAK BERITA (EVENT STUDY)")
print(banner)
print(final_output.to_string(index=False))

# Persist the same table next to the notebook
output_filename = 'hasil_analisis_berita_bumi.csv'
final_output.to_csv(output_filename, index=False)
print(f"\n[INFO] Hasil lengkap telah disimpan ke file: {output_filename}")


HASIL ANALISIS PROBABILITAS DAMPAK BERITA (EVENT STUDY)
 date_news                                                                         title  lag_days  impact_return_pct  probability_score
2025-10-08                          BUMI Rampungkan Akuisisi Wolfram, Masuk Emasâ€“Tembaga         3               0.00                0.0
2025-10-10   Momentum Bullish IHSG Diprediksi Terjaga: Perhatikan BUMI, DKFT, ERAL, WIFI         1               0.00                0.0
2025-10-17                         Mulai Gersang, Saham BUMI hingga ENRG Ditinggal Asing         3               0.25                0.2
2025-10-17                        IHSG Hari ini Diprediksi Koreksi: BoW PANI, BRIS, BUMI         3               0.25                0.2
2025-11-02                               Margin BUMI Membaik, Laba Bersih Masih Tertekan         2               0.00                0.0
2025-11-10         BUMI Kantongi Potensi Pendapatan Rp26 Triliun dari Wolfram, Kok Bisa?         1              52.02  

In [6]:
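# NOTE: every line of this cell is commented out, so nothing here runs. It
# sketches an alternative approach (keyword-based sentiment score fed into a
# RandomForestClassifier) and is kept for reference only.
# NOTE(review): if this is revived, `fillna(method='bfill')` is deprecated in
# pandas 2.x (use `.bfill()`), and `prob_up` is predicted on the same rows the
# model was fitted on, so it does not measure out-of-sample performance.
# import pandas as pd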
# import json
# import numpy as np
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import MinMaxScaler
# import re

# # 1. Load Data
# news_file = 'data/news/filtered_BUMI_oct_nov_2025.json'
# price_file = 'data/ohlcv/BUMI_daily_oct_nov_2025_ohlcv.csv'

# with open(news_file, 'r', encoding='utf-8') as f:
#     news_data = json.load(f)

# news_df = pd.DataFrame(news_data)
# news_df['date'] = pd.to_datetime(news_df['date'])

# price_df = pd.read_csv(price_file)
# price_df.columns = [col.replace('<', '').replace('>', '').lower() for col in price_df.columns]
# price_df['date'] = pd.to_datetime(price_df['date'])
# price_df = price_df.sort_values('date')

# # 2. Financial Feature Engineering & Lag Effects
# # Calculate daily returns
# price_df['return'] = (price_df['close'] - price_df['open']) / price_df['open']
# price_df['prev_close'] = price_df['close'].shift(1)
# price_df['return_c2c'] = (price_df['close'] - price_df['prev_close']) / price_df['prev_close']

# # Calculate future returns for lag effects (max return over next 3 days)
# # Does the price go up significantly in the next 1-3 days?
# price_df['future_close_1'] = price_df['close'].shift(-1)
# price_df['future_close_2'] = price_df['close'].shift(-2)
# price_df['future_close_3'] = price_df['close'].shift(-3)

# price_df['max_future_close'] = price_df[['future_close_1', 'future_close_2', 'future_close_3']].max(axis=1)
# price_df['max_future_return'] = (price_df['max_future_close'] - price_df['close']) / price_df['close']

# # Target: 1 if the stock rises more than 1% in the next 3 days, else 0
# price_df['target_up'] = (price_df['max_future_return'] > 0.01).astype(int)

# # 3. Simple Indonesian Financial Sentiment Analyzer
# positive_words = ['akuisisi', 'laba', 'naik', 'beli', 'untung', 'tinggi', 'borong', 'target', 'potensi', 'penguatan', 'bullish', 'positif', 'masuk', 'dorong', 'rampungkan']
# negative_words = ['rugi', 'turun', 'jual', 'tekanan', 'lepas', 'gagal', 'anjlok', 'rendah', 'koreksi', 'bearish', 'negatif', 'keluar']

# def get_sentiment(text):
#     text = text.lower()
#     pos_count = sum(len(re.findall(r'\b' + word + r'\b', text)) for word in positive_words)
#     neg_count = sum(len(re.findall(r'\b' + word + r'\b', text)) for word in negative_words)
#     total = pos_count + neg_count
#     if total == 0: return 0
#     return (pos_count - neg_count) / total

# news_df['sentiment_score'] = news_df['full_content'].apply(get_sentiment)

# # 4. Merge Data (Aligning News with Stock Market Days)
# # If a news comes out on a weekend, it affects the next trading day.
# # We will use pandas merge_asof for this, but standard merge with forward fill is easier.
# merged_df = pd.merge(news_df, price_df, on='date', how='left')
# # Fill missing price data for weekends with the next available trading day
# merged_df = merged_df.sort_values('date')
# merged_df['target_up'] = merged_df['target_up'].fillna(method='bfill')
# merged_df['volume'] = merged_df['volume'].fillna(method='bfill')
# merged_df['return'] = merged_df['return'].fillna(method='bfill')

# # Drop rows where we can't find future price data
# train_df = merged_df.dropna(subset=['target_up', 'volume'])

# # 5. Machine Learning Model (Probability Scoring)
# # Features: Sentiment Score, Current Volume (normalized)
# scaler = MinMaxScaler()
# train_df['vol_norm'] = scaler.fit_transform(train_df[['volume']])

# X = train_df[['sentiment_score', 'vol_norm']]
# y = train_df['target_up']

# # Train Random Forest to get probabilities
# clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=3)
# clf.fit(X, y)

# # Predict probabilities
# train_df['prob_up'] = clf.predict_proba(X)[:, 1] # Probability of Class 1 (Up)

# # Clean up output for the user
# output_df = train_df[['date', 'title', 'sentiment_score', 'max_future_return', 'prob_up']].copy()
# output_df['prob_up_pct'] = (output_df['prob_up'] * 100).round(2)
# output_df['sentiment_score'] = output_df['sentiment_score'].round(2)
# output_df['max_future_return'] = (output_df['max_future_return'] * 100).round(2)

# print(output_df[['date', 'title', 'prob_up_pct']].to_string(index=False))