In [1]:
import praw
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime
import time

# Load environment variables (python-dotenv; presumably from a local .env file)
load_dotenv()

# Reddit API credentials, read from the environment.
# os.getenv returns None for a variable that is not set.
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
USER_AGENT = os.getenv("USER_AGENT")

# Initialise the Reddit API client
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)

# Subreddits to scrape
subreddits = ['politics']

# Date range (inclusive) used to filter posts and comments, as epoch seconds.
# NOTE(review): these are naive datetimes, so .timestamp() interprets them in the
# machine's local timezone, while they are compared against `created_utc` values.
# The boundaries are therefore local midnight, not UTC midnight - confirm intent.
START_DATE = datetime(2019, 1, 1).timestamp()  # 1 January 2019
END_DATE = datetime(2025, 3, 21).timestamp()  # 21 March 2025

# Fungsi untuk mengumpulkan data
def collect_reddit_data(target_count=10000):
    """Collect up to ``target_count`` comment records from the configured subreddits.

    Walks the all-time "top" listing of every name in the module-level
    ``subreddits`` list, skips submissions whose ``created_utc`` lies outside
    ``START_DATE``..``END_DATE`` (inclusive), and records every comment of the
    remaining submissions whose own ``created_utc`` is inside the same range.

    Collection stops as soon as ``target_count`` records have been gathered:
    no further submission is requested and no rate-limit pause is taken.

    Args:
        target_count: Maximum number of comment records to return. A value
            of zero or less returns an empty list without touching the API.

    Returns:
        A list of dicts with the keys ``Subreddit``, ``Post_Title``,
        ``Username``, ``Comment``, ``Score``, ``Content_Link`` and
        ``Timestamp`` (formatted ``%Y-%m-%d %H:%M:%S``).
    """
    data = []
    posts_processed = 0

    # Nothing to collect: return before issuing any API request.
    if target_count <= 0:
        return data

    for subreddit_name in subreddits:
        subreddit = reddit.subreddit(subreddit_name)

        # Iterate over the "top" posts of all time
        for submission in subreddit.top(time_filter="all", limit=None):
            posts_processed += 1

            # Filter on the submission's own timestamp
            submission_timestamp = submission.created_utc
            if not (START_DATE <= submission_timestamp <= END_DATE):
                print(f"Skipping post {posts_processed} - Outside date range: {datetime.fromtimestamp(submission_timestamp)}")
                continue

            print(f"Processing post {posts_processed} - Current data count: {len(data)}")

            # limit=0 drops the unexpanded "load more comments" placeholders
            # instead of fetching them, so only the comments delivered with
            # the initial request are visited below.
            submission.comments.replace_more(limit=0)

            # Use the Reddit permalink as Content_Link
            content_link = f"https://www.reddit.com{submission.permalink}"

            for comment in submission.comments.list():
                if len(data) >= target_count:
                    break

                # Filter on the comment's timestamp
                comment_timestamp = comment.created_utc
                if not (START_DATE <= comment_timestamp <= END_DATE):
                    continue

                try:
                    comment_data = {
                        'Subreddit': subreddit_name,
                        'Post_Title': submission.title,
                        # A missing author is recorded as '[deleted]'
                        'Username': comment.author.name if comment.author else '[deleted]',
                        'Comment': comment.body,
                        'Score': comment.score,
                        'Content_Link': content_link,
                        # NOTE(review): fromtimestamp() renders local time, not UTC - confirm intent
                        'Timestamp': datetime.fromtimestamp(comment_timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    }
                    data.append(comment_data)

                except AttributeError as e:
                    print(f"Error processing comment: {e}")
                    continue

            # Target reached: stop now rather than sleeping and pulling more
            # submissions from the listing only to discard them.
            if len(data) >= target_count:
                break

            # Pause between submissions to stay under the rate limit
            time.sleep(1)

        if len(data) >= target_count:
            break

    return data

# Fungsi untuk menyimpan ke TXT
def save_to_txt(data, filename="reddit_data.txt"):
    """Write every record as a labelled, human-readable block of text.

    Args:
        data: Iterable of dicts holding the keys ``Subreddit``, ``Post_Title``,
            ``Username``, ``Comment``, ``Score``, ``Content_Link`` and
            ``Timestamp``.
        filename: Destination path; the file is overwritten (UTF-8).
    """
    # (label printed in the file, key looked up in the record), in output order
    labelled_keys = (
        ("Subreddit", 'Subreddit'),
        ("Post Title", 'Post_Title'),
        ("Username", 'Username'),
        ("Comment", 'Comment'),
        ("Score", 'Score'),
        ("Content Link", 'Content_Link'),
        ("Timestamp", 'Timestamp'),
    )
    divider = "-" * 50 + "\n"

    with open(filename, 'w', encoding='utf-8') as out:
        for record in data:
            for label, key in labelled_keys:
                out.write(f"{label}: {record[key]}\n")
            # A dashed rule closes each record
            out.write(divider)

# Fungsi untuk menyimpan ke CSV
def save_to_csv(data, filename="reddit_data.csv"):
    """Dump the records to a UTF-8 CSV file with a header row and no index column.

    Args:
        data: List of dicts; each dict becomes one row of the table.
        filename: Destination path; the file is overwritten.
    """
    pd.DataFrame(data).to_csv(filename, encoding='utf-8', index=False)

# Main execution
def main():
    """Run the scrape, persist the result as TXT and CSV, and report timing."""
    print("Starting Reddit data collection...")
    started_at = time.time()

    # Collect the comment records
    records = collect_reddit_data()

    # Persist them in both output formats
    print("Saving data to files...")
    save_to_txt(records)
    save_to_csv(records)

    # Elapsed wall-clock time, converted from seconds to minutes
    elapsed_minutes = (time.time() - started_at) / 60

    print("Data collection completed!")
    print(f"Total data collected: {len(records)}")
    print(f"Execution time: {elapsed_minutes:.2f} minutes")

if __name__ == "__main__":
    main()

Starting Reddit data collection...
Processing post 1 - Current data count: 0
Processing post 2 - Current data count: 493
Processing post 3 - Current data count: 984
Processing post 4 - Current data count: 1479
Processing post 5 - Current data count: 1973
Processing post 6 - Current data count: 2467
Processing post 7 - Current data count: 2953
Processing post 8 - Current data count: 3445
Processing post 9 - Current data count: 3935
Processing post 10 - Current data count: 4430
Processing post 11 - Current data count: 4920
Processing post 12 - Current data count: 5408
Processing post 13 - Current data count: 5898
Processing post 14 - Current data count: 6386
Processing post 15 - Current data count: 6879
Processing post 16 - Current data count: 7364
Processing post 17 - Current data count: 7856
Processing post 18 - Current data count: 8351
Processing post 19 - Current data count: 8839
Processing post 20 - Current data count: 9332
Processing post 21 - Current data count: 9825
Saving data t

## Deteksi Subreddit Unique

In [3]:
import pandas as pd

# Fungsi untuk mendeteksi data unik di kolom Subreddit
def detect_unique_subreddits(csv_file="reddit_data.csv"):
    """Print and return the distinct values of the 'Subreddit' column of a CSV file.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The unique values in order of first appearance, or ``None`` when the
        file is missing, the column is absent, or any other error occurs
        (a message is printed in each case).
    """
    rule = "-" * 30
    try:
        # Load the CSV and pull the distinct subreddit names
        frame = pd.read_csv(csv_file)
        names = frame['Subreddit'].unique()

        # Report them, one per line, between two dashed rules
        print("Data Unik di Kolom Subreddit:")
        print(rule)
        for name in names:
            print(name)
        print(rule)
        print(f"Total jumlah subreddit unik: {len(names)}")

        return names

    except FileNotFoundError:
        print(f"File {csv_file} tidak ditemukan. Pastikan file sudah ada.")
        return None
    except KeyError:
        print("Kolom 'Subreddit' tidak ditemukan di file CSV.")
        return None
    except Exception as err:
        print(f"Terjadi kesalahan: {err}")
        return None

# Main execution
def main():
    """Run the unique-subreddit detection and report whether it succeeded."""
    print("Memulai deteksi data unik di kolom Subreddit...")
    found = detect_unique_subreddits()

    # None is the failure signal; anything else counts as success
    if found is None:
        print("Proses gagal.")
    else:
        print("Proses selesai!")

if __name__ == "__main__":
    main()

Memulai deteksi data unik di kolom Subreddit...
Data Unik di Kolom Subreddit:
------------------------------
politics
------------------------------
Total jumlah subreddit unik: 1
Proses selesai!


## Rentang Tanggal Unique

In [2]:
import pandas as pd
from datetime import datetime

# Fungsi untuk membaca data dari CSV dan mendapatkan tanggal unik
def get_unique_dates(filename="reddit_data.csv"):
    """Print and return the distinct calendar dates found in the 'Timestamp' column.

    Args:
        filename: Path of the CSV file to read (UTF-8).

    Returns:
        A sorted list of unique dates, or ``None`` when the file is missing,
        the 'Timestamp' column is absent, or any other error occurs
        (a message is printed in each case).
    """
    try:
        # Load the CSV into a DataFrame
        frame = pd.read_csv(filename, encoding='utf-8')

        # The 'Timestamp' column is required
        if 'Timestamp' not in frame.columns:
            print("Error: Kolom 'Timestamp' tidak ditemukan dalam file CSV.")
            return None

        # Parse the timestamps and keep only the date part (YYYY-MM-DD)
        dates = pd.to_datetime(frame['Timestamp']).dt.date

        # Distinct dates in ascending order
        unique_dates = sorted(dates.unique())

        # Report the result
        print(f"Total tanggal unik: {len(unique_dates)}")
        print("Daftar tanggal unik:")
        for day in unique_dates:
            print(day)

        return unique_dates

    except FileNotFoundError:
        print(f"Error: File '{filename}' tidak ditemukan.")
        return None
    except Exception as err:
        print(f"Error: Terjadi kesalahan - {str(err)}")
        return None

# Main execution
def main():
    """Run the unique-date listing and report whether it succeeded.

    ``get_unique_dates()`` returns ``None`` on every failure path, so that is
    the only value treated as a failure. An empty list (a readable file with
    no rows) is a successful run and is reported as such, rather than being
    mistaken for an error by a truthiness check.
    """
    print("Menampilkan tanggal unik dari reddit_data.csv...")
    unique_dates = get_unique_dates()

    # Compare against None explicitly: [] is falsy but still a valid result
    if unique_dates is not None:
        print("\nProses selesai!")
    else:
        print("\nProses gagal.")

if __name__ == "__main__":
    main()

Menampilkan tanggal unik dari reddit_data.csv...
Total tanggal unik: 63
Daftar tanggal unik:
2019-12-19
2020-03-17
2020-03-18
2020-03-30
2020-04-11
2020-04-13
2020-04-14
2020-04-18
2020-06-03
2020-06-04
2020-06-09
2020-06-12
2020-06-13
2020-06-14
2020-06-15
2020-06-16
2020-06-17
2020-06-18
2020-06-19
2020-06-25
2020-06-26
2020-07-08
2020-07-17
2020-07-31
2020-08-01
2020-08-11
2020-08-13
2020-08-15
2020-08-21
2020-10-17
2020-10-18
2020-11-06
2020-11-07
2020-11-08
2020-11-12
2020-11-13
2021-01-06
2021-01-07
2021-01-11
2021-01-12
2021-01-14
2021-01-20
2021-01-21
2021-01-22
2021-01-25
2021-01-26
2021-01-29
2021-01-30
2021-03-26
2021-03-27
2021-05-01
2021-05-02
2021-05-03
2021-05-04
2021-05-13
2021-05-17
2021-05-22
2021-05-25
2021-05-26
2021-06-19
2022-02-28
2022-03-01
2022-03-06

Proses selesai!
