In [1]:
import json
import pandas as pd
import re
import os
from datetime import datetime

In [2]:
#KEYWORDS SCREENING 
KEYWORDS = {
    # HIGH PRIORITY: Corporate Actions yang langsung relevan dengan free float 15%
    'HIGH_PRIORITY': [
        # Rights Issue / HMETD
        r'rights issue', r'right issue', r'HMETD', r'hak memesan efek',
        r'penambahan modal', r'tambahan modal', r'emisi saham',
        
        # Private Placement
        r'private placement', r'penempatan pribadi', r'penempatan saham',
        
        # Divestasi ke investor strategis
        r'divestasi', r'divestment', r'penjualan saham',
        r'investor strategis', r'strategic investor',
        
        # Free Float & Aturan 15%
        r'free float', r'15%', r'lima belas persen',
        r'aturan minimum', r'kepatuhan float', r'pemenuhan float',
        
        # Bonds / Surat Utang (sering jadi sinyal korporasi besar)
        r'obligasi', r'surat utang', r'bond', r'debenture',
        r'penerbitan obligasi', r'penerbitan bond',
        
        # Merger & Akuisisi
        r'merger', r'penggabungan', r'akuisisi', r'pengambilalihan',
        
        # Stock Split / Reverse Split (sering momentum)
        r'stock split', r'pemecahan saham', r'reverse split',
    ],
    
    # MEDIUM PRIORITY: Sinyal potensi korporasi
    'MEDIUM_PRIORITY': [
        r'rencana korporasi', r'aksi korporasi', r'corporate action',
        r'penawaran saham', r'ipo', r'go public',
        r'restrukturisasi', r'reorganisasi',
        r'spin-off', r'pemisahan',
    ],
    
    # FUNDAMENTAL SIGNALS: Sinyal kinerja bagus
    'FUNDAMENTAL': [
        r'laba naik', r'profit meningkat', r'pendapatan tumbuh',
        r'kinerja membaik', r'rekor laba', r'laba tertinggi',
        r'pertumbuhan dua digit', r'double digit growth',
        r'fundamental kuat', r'prospek cerah',
    ],
    
    # RED FLAGS: Hindari ini
    'RED_FLAGS': [
        r'suspensi', r'ditunda', r'gagal', r'masalah hukum',
        r'audit buruk', r'penipuan', r'manipulasi',
        r'rugi besar', r'merosot tajam', r'anjlok',
        r'pelanggaran', r'denda', r'gugatan',
    ],
    
    # TICKER PATTERNS: Untuk ekstrak kode saham
    'TICKER': [
        r'\(([A-Z]{2,4})\)',  # (BBCA)
        r'Kode\s*:?\s*([A-Z]{2,4})',
        r'Ticker\s*:?\s*([A-Z]{2,4})',
        r'Emiten\s+([A-Z]{2,4})\b',
    ]
}

In [3]:
#  SCREENING FUNCTION
def screen_news_for_multi_bagger(news_df):
    """Screening sederhana untuk menemukan berita potensial multi-bagger berdasarkan framework 15% Compliance Multi-Bagger Hunter"""   
    results = []
    
    for idx, row in news_df.iterrows():
        title = str(row.get('title', ''))
        content = str(row.get('full_content', ''))
        source = str(row.get('source', ''))
        date = str(row.get('date', ''))
        
        # Gabungkan title dan content untuk screening
        text = f"{title} {content}".lower()
        
        # Hitung matches untuk setiap kategori
        matches = {
            'high_priority': [],
            'medium_priority': [],
            'fundamental': [],
            'red_flags': [],
        }
        
        # Cari keyword HIGH_PRIORITY
        for keyword in KEYWORDS['HIGH_PRIORITY']:
            pattern = re.compile(keyword, re.IGNORECASE)
            if pattern.search(text):
                matches['high_priority'].append(keyword)
        
        # Cari keyword MEDIUM_PRIORITY
        for keyword in KEYWORDS['MEDIUM_PRIORITY']:
            pattern = re.compile(keyword, re.IGNORECASE)
            if pattern.search(text):
                matches['medium_priority'].append(keyword)
        
        # Cari keyword FUNDAMENTAL
        for keyword in KEYWORDS['FUNDAMENTAL']:
            pattern = re.compile(keyword, re.IGNORECASE)
            if pattern.search(text):
                matches['fundamental'].append(keyword)
        
        # Cari RED_FLAGS
        for keyword in KEYWORDS['RED_FLAGS']:
            pattern = re.compile(keyword, re.IGNORECASE)
            if pattern.search(text):
                matches['red_flags'].append(keyword)
        
        # Ekstrak ticker
        tickers = []
        for pattern in KEYWORDS['TICKER']:
            found = re.findall(pattern, text, re.IGNORECASE)
            for ticker in found:
                if 2 <= len(ticker) <= 4 and ticker.isalpha() and ticker.isupper():
                    tickers.append(ticker)
        
        tickers = list(set(tickers))
        
        # Hitung skor sederhana
        score = (
            len(matches['high_priority']) * 3 +  # High priority paling penting
            len(matches['medium_priority']) * 2 +
            len(matches['fundamental']) * 1 -
            len(matches['red_flags']) * 2  # Red flags mengurangi skor
        )
        
        # Cek deadline (Februari 2026)
        urgency_score = 0
        if '2026-01' in date or '2026-02' in date:
            urgency_score = 10  # Sangat urgent
        elif '2025-12' in date or '2026-03' in date:
            urgency_score = 5   # Cukup urgent
        
        total_score = score + urgency_score
        
        # Tentukan kategori berdasarkan skor
        if total_score >= 5:
            category = "üöÄ HIGH POTENTIAL"
            action = "IMMEDIATE REVIEW"
        elif total_score >= 2:
            category = "‚ö†Ô∏è  MEDIUM POTENTIAL"
            action = "MONITOR"
        else:
            category = "‚è∏Ô∏è  LOW POTENTIAL"
            action = "IGNORE"
        
        # Simpan hasil
        results.append({
            'no': idx + 1,
            'ticker': ', '.join(tickers) if tickers else '-',
            'title': title[:80] + '...' if len(title) > 80 else title,
            'date': date,
            'source': source,
            'high_priority_hits': len(matches['high_priority']),
            'medium_priority_hits': len(matches['medium_priority']),
            'fundamental_hits': len(matches['fundamental']),
            'red_flags': len(matches['red_flags']),
            'urgency_score': urgency_score,
            'total_score': total_score,
            'category': category,
            'action': action,
            'key_findings': ', '.join(matches['high_priority'][:3]) if matches['high_priority'] else '-',
        })
    
    return pd.DataFrame(results)

# FUNGSI UNTUK ANALISIS LEBIH DETAIL
def analyze_potential_multi_bagger(df):
    """Analisis lebih detail untuk berita yang sudah discreen"""
    
    print("\n" + "="*100)
    print("ANALISIS POTENSI MULTI-BAGGER BERDASARKAN BERITA")
    print("="*100)
    
    # 1. TAMPILKAN SEMUA HIGH POTENTIAL
    high_potential = df[df['category'] == 'üöÄ HIGH POTENTIAL']
    
    if len(high_potential) > 0:
        print(f"\nüöÄ BERITA HIGH POTENTIAL ({len(high_potential)} berita):")
        print("-"*100)
        
        for idx, row in high_potential.iterrows():
            print(f"\n[{row['no']}] {row['ticker']} - {row['title']}")
            print(f"   üìÖ {row['date']} | üì∞ {row['source']}")
            print(f"   üéØ Key Findings: {row['key_findings']}")
            print(f"   ‚≠ê Score: {row['total_score']} (High: {row['high_priority_hits']}, Med: {row['medium_priority_hits']}, Fund: {row['fundamental_hits']}, Red: {row['red_flags']})")
            print(f"   üî• Action: {row['action']}")
    
    # 2. GROUP BY TICKER
    ticker_groups = {}
    for idx, row in df.iterrows():
        if row['ticker'] != '-':
            tickers = [t.strip() for t in row['ticker'].split(',')]
            for ticker in tickers:
                if ticker not in ticker_groups:
                    ticker_groups[ticker] = []
                ticker_groups[ticker].append(row)
    
    if ticker_groups:
        print(f"\n\nüìä ANALISIS PER TICKER ({len(ticker_groups)} ticker ditemukan):")
        print("-"*100)
        
        for ticker, news_list in ticker_groups.items():
            avg_score = sum([n['total_score'] for n in news_list]) / len(news_list)
            high_count = len([n for n in news_list if n['category'] == 'üöÄ HIGH POTENTIAL'])
            
            print(f"\nüìà {ticker}:")
            print(f"   ‚Ä¢ Jumlah berita: {len(news_list)}")
            print(f"   ‚Ä¢ Rata-rata skor: {avg_score:.1f}")
            print(f"   ‚Ä¢ High potential: {high_count}")
            
            # Tampilkan berita terbaik untuk ticker ini
            best_news = max(news_list, key=lambda x: x['total_score'])
            print(f"   ‚Ä¢ Berita terbaik: '{best_news['title'][:60]}...' (Score: {best_news['total_score']})")
    
    # 3. DISTRIBUSI SKOR
    print(f"\n\nüìà DISTRIBUSI HASIL SCREENING:")
    print("-"*100)
    
    categories = df['category'].value_counts()
    for cat, count in categories.items():
        percentage = (count / len(df)) * 100
        print(f"   {cat}: {count} berita ({percentage:.1f}%)")
    
    # 4. TOP KEYWORDS FOUND
    print(f"\n\nüîç KEYWORDS PALING SERING DITEMUKAN:")
    print("-"*100)
    
    # Simulasi hitung keyword (dalam implementasi nyata butuh tracking lebih detail)
    keyword_counts = {
        'rights issue': 0,
        'private placement': 0,
        'free float': 0,
        'investor strategis': 0,
        'obligasi': 0,
    }
    
    for idx, row in df.iterrows():
        if 'rights issue' in row['key_findings'].lower():
            keyword_counts['rights issue'] += 1
        if 'private placement' in row['key_findings'].lower():
            keyword_counts['private placement'] += 1
        if 'free float' in row['key_findings'].lower():
            keyword_counts['free float'] += 1
        if 'investor strategis' in row['key_findings'].lower():
            keyword_counts['investor strategis'] += 1
        if 'obligasi' in row['key_findings'].lower():
            keyword_counts['obligasi'] += 1
    
    for keyword, count in keyword_counts.items():
        if count > 0:
            print(f"   {keyword}: {count}x ditemukan")
    
    return df, ticker_groups

# FUNGSI UNTUK EKSTRAKSI SPESIFIK
def extract_corporate_actions(text):
    """Ekstrak informasi spesifik tentang corporate actions"""
    
    actions = []
    
    # Cari Rights Issue
    ri_patterns = [
        r'rights issue.*?(\d+(?:\.\d+)?)\s*(?:juta|miliar|ribu)?\s*saham',
        r'terbitkan.*?(\d+(?:\.\d+)?)\s*(?:juta|miliar|ribu)?\s*saham.*?rights',
        r'HMETD.*?(\d+(?:\.\d+)?)\s*(?:juta|miliar|ribu)?\s*saham',
    ]
    
    for pattern in ri_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            actions.append(f"Rights Issue: {match.group(1)} saham")
            break
    
    # Cari Private Placement
    pp_patterns = [
        r'private placement.*?(\d+(?:\.\d+)?)\s*(?:juta|miliar|ribu)?\s*saham',
        r'penempatan pribadi.*?(\d+(?:\.\d+)?)\s*(?:juta|miliar|ribu)?\s*saham',
    ]
    
    for pattern in pp_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            actions.append(f"Private Placement: {match.group(1)} saham")
            break
    
    # Cari Harga
    price_pattern = r'harga\s*[:\s]*Rp\s*(\d+(?:\.\d+)*)'
    price_match = re.search(price_pattern, text, re.IGNORECASE)
    if price_match:
        actions.append(f"Harga: Rp {price_match.group(1)}")
    
    # Cari Investor Strategis
    investor_pattern = r'(?:oleh|dari|kepada)\s+([A-Z][a-zA-Z\s&]+)(?=\s+sebagai\s+investor)'
    investor_match = re.search(investor_pattern, text)
    if investor_match:
        actions.append(f"Investor: {investor_match.group(1)}")
    
    return actions

# FUNGSI UTAMA UNTUK COLAB
def main_screening():
    """Fungsi utama untuk screening berita di Google Colab"""
    
    print("üéØ SCREENING BERITA MULTI-BAGGER HUNTER")
    print("="*70)
    print("Fokus: Rights Issue, Private Placement, Bonds, Free Float 15%")
    print("="*70)
    
    # Contoh data (ganti dengan data JSON Anda)
    sample_data = [
        {
            "title": "Balik Rugi Jadi Laba, Minna Padi (PADI) Raup Rp24,74 Miliar hingga Kuartal III-2025",
            "date": "2026-01-21",
            "source": "IDX Channel",
            "full_content": "IDXChannel- PTMinna PadiInvestama Sekuritas Tbk (PADI) mencatatkan perbaikan kinerja keuangan secara signifikan hingga akhir September 2025. Emiten jasa perantara perdagangan efek ini mencatat lonjakan pendapatan usaha seiring menguatnya aktivitas transaksi pasar modal dan membaiknya kinerja perdagangan efek. PADI Bakal Minta Persetujuan OJK Soal Rights Issue pada Maret 2026 dengan investor strategis dari Singapura.",
            "summary": "Minna Padi mencatatkan perbaikan kinerja keuangan secara signifikan."
        },
        {
            "title": "Asuransi Digital (YOII) Gelar Rights Issue, Terbitkan 684 Juta Saham",
            "date": "2026-01-21",
            "source": "IDX Channel",
            "full_content": "IDXChannel- PTAsuransi DigitalBersama Tbk (YOII) berencana melakukanpenambahan modalmelalui pemberian Hak Memesan Efek Terlebih Dahulu (HMETD) ataurights issuetahap I. Dalam aksi korporasi tersebut, perseroan akan menerbitkan sebanyak 684.937.500 saham baru dengan nilai nominal Rp100 per saham. Berdasarkan prospektus yang disampaikan dalam keterbukaan informasi Rabu (21/1/2026), pemegang saham yang tidak melaksanakan HMETD berpotensi mengalami dilusi kepemilikan hingga maksimal 16,67 persen.",
            "summary": "Seluruh dana hasil rights issue akan dialokasikan untuk keperluan modal kerja perseroan."
        },
        {
            "title": "Emiten Wajib Penuhi Free Float 15% Februari 2026",
            "date": "2026-01-20",
            "source": "Kontan",
            "full_content": "BEI mengingatkan emiten untuk memenuhi aturan free float minimum 15% paling lambat Februari 2026. Sejumlah emiten sudah mulai melakukan rights issue dan private placement untuk memenuhi ketentuan ini. Investor asing mulai melirik saham-saham yang akan melakukan corporate action.",
            "summary": "Deadline free float 15% semakin dekat."
        },
        {
            "title": "Bank ABC Terbitkan Obligasi Rp 2 Triliun",
            "date": "2026-01-19",
            "source": "Bisnis",
            "full_content": "Bank ABC (BBCA) menerbitkan obligasi senilai Rp 2 triliun dengan tenor 5 tahun. Penerbitan obligasi ini untuk memperkuat modal dan mendukung ekspansi kredit.",
            "summary": "Bank ABC terbitkan obligasi senilai Rp 2 triliun."
        },
        {
            "title": "Harga Saham Biasa Naik Tipis",
            "date": "2026-01-21",
            "source": "Reuters",
            "full_content": "Harga saham di bursa naik tipis hari ini tanpa berita spesifik.",
            "summary": "Pasar saham naik tipis."
        }
    ]
    
    # Convert to DataFrame
    news_df = pd.DataFrame(sample_data)
    print(f"üìä Data berita: {len(news_df)} berita\n")
    
    # Lakukan screening
    print("üîç Melakukan screening...")
    screened_df = screen_news_for_multi_bagger(news_df)
    
    # Urutkan berdasarkan skor tertinggi
    screened_df = screened_df.sort_values('total_score', ascending=False)
    
    # Tampilkan hasil dalam tabel sederhana
    print("\n" + "="*120)
    print("HASIL SCREENING BERITA")
    print("="*120)
    
    # Tampilkan kolom penting
    display_cols = ['no', 'ticker', 'title', 'total_score', 'category', 'action', 'key_findings']
    print(screened_df[display_cols].to_string(index=False))
    
    # Analisis lebih detail
    analyze_potential_multi_bagger(screened_df)
    
    # Tampilkan rekomendasi akhir
    print("\n" + "="*100)
    print("üí° REKOMENDASI INVESTIGASI LEBIH LANJUT")
    print("="*100)
    
    high_potential = screened_df[screened_df['category'] == 'üöÄ HIGH POTENTIAL']
    
    if len(high_potential) > 0:
        print("\nüéØ PRIORITAS UTAMA (Harus diteliti lebih lanjut):")
        for idx, row in high_potential.iterrows():
            print(f"\n{row['no']}. [{row['ticker']}] {row['title']}")
            
            # Ekstrak info spesifik
            full_text = f"{row['title']} {news_df.iloc[row['no']-1]['full_content']}"
            actions = extract_corporate_actions(full_text)
            
            if actions:
                print(f"   üìã Corporate Actions ditemukan:")
                for action in actions:
                    print(f"      ‚Ä¢ {action}")
            
            print(f"   üîç Langkah selanjutnya:")
            print(f"      1. Cek keterbukaan informasi di website BEI")
            print(f"      2. Verifikasi data free float di RTI/IDX")
            print(f"      3. Cek track record investor (jika ada)")
            print(f"      4. Analisis fundamental dasar")
    else:
        print("\n‚ö†Ô∏è  Tidak ada berita high potential yang ditemukan.")
        print("   Coba tambah data berita atau adjust keyword.")
    
    return screened_df

# FUNGSI UNTUK FILE JSON
def screen_json_file(file_path):
    """Screening dari file JSON"""
    print(f"\nüìÇ Membaca file: {file_path}")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        news_df = pd.DataFrame(data)
        print(f"‚úì {len(news_df)} berita dimuat")
        
        # Screening
        screened_df = screen_news_for_multi_bagger(news_df)
        screened_df = screened_df.sort_values('total_score', ascending=False)
        
        # Tampilkan top 10
        print("\n" + "="*120)
        print("TOP 10 BERITA POTENSIAL")
        print("="*120)
        
        top_10 = screened_df.head(10)
        display_cols = ['no', 'ticker', 'title', 'total_score', 'category', 'key_findings']
        print(top_10[display_cols].to_string(index=False))
        
        # Simpan hasil
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f'/Users/albert/Documents/Finances/projects/02_alpha_research/06_filtering_news/output/corporate_action_news_screener_{timestamp}.csv'
        screened_df.to_csv(output_file, index=False, encoding='utf-8-sig')
        
        print(f"\nüíæ Hasil disimpan: {output_file}")
        
        # Tampilkan rekomendasi
        high_potential = screened_df[screened_df['category'] == 'üöÄ HIGH POTENTIAL']
        
        if len(high_potential) > 0:
            print(f"\nüéØ TOTAL HIGH POTENTIAL: {len(high_potential)} berita")
            print("   Berita ini perlu investigasi lebih lanjut!")
        
        return screened_df
        
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None


In [4]:
dataset_path = "/Users/albert/Documents/Finances/data/processed/20250901_20260131/merged_detailed_20250901-20260131.json"
if not os.path.exists(dataset_path):
    print("FIle not found")
else:
    results = screen_json_file(dataset_path)


üìÇ Membaca file: /Users/albert/Documents/Finances/data/processed/20250901_20260131/merged_detailed_20250901-20260131.json
‚úì 3304 berita dimuat

TOP 10 BERITA POTENSIAL
  no ticker                                                                               title  total_score         category                              key_findings
1767      -                       PACK Tawarkan Surat Utang Lewat Right Issue: Ini Hak Investor           29 üöÄ HIGH POTENTIAL          rights issue, right issue, HMETD
1852      -                 NINE Targetkan Pendaftaran Rights Issue ke OJK pada Kuartal II-2026           28 üöÄ HIGH POTENTIAL     rights issue, HMETD, hak memesan efek
1690      -                    Tawarkan OWK Lewat Rights Issue, PACK Himpun Dana Rp3,25 Triliun           28 üöÄ HIGH POTENTIAL     rights issue, HMETD, hak memesan efek
2500      - Butuh Modal, Bakrie Grup (BNBR) Siapkan Rights Issue Usai Akuisisi Tol Cimanggis...           28 üöÄ HIGH POTENTIAL     rights issue,

In [5]:
dataset_path = "/Users/albert/Documents/Finances/data/processed/20260121_20260131/merged_news_filtered_20260121_20260131.json"
if not os.path.exists(dataset_path):
    print("FIle not found")
else:
    results = screen_json_file(dataset_path)


üìÇ Membaca file: /Users/albert/Documents/Finances/data/processed/20260121_20260131/merged_news_filtered_20260121_20260131.json
‚úì 603 berita dimuat

TOP 10 BERITA POTENSIAL
 no ticker                                                                               title  total_score         category                               key_findings
511      -    ELPI Umumkan Rencana Rights Issue, Bakal Terbitkan hingga 2,03 Miliar Saham Baru           27 üöÄ HIGH POTENTIAL      rights issue, HMETD, hak memesan efek
 68      -                         Akuisisi Emway, BABY Gelar Rights Issue Pakai Skema Inbreng           27 üöÄ HIGH POTENTIAL      rights issue, HMETD, hak memesan efek
249      -                    TRUE Tunda Private Placement, Buka Opsi Cari Aksi Korporasi Lain           27 üöÄ HIGH POTENTIAL  HMETD, hak memesan efek, penambahan modal
  3      -                Asuransi Digital (YOII) Gelar Rights Issue, Terbitkan 684 Juta Saham           27 üöÄ HIGH POTENTIAL      rights is