In [2]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
import praw
from dotenv import load_dotenv

# Load environment variables (via python-dotenv) before reading credentials.
load_dotenv()

# Reddit API credentials; each is None when the variable is not set.
CLIENT_ID, CLIENT_SECRET, USER_AGENT = (
    os.getenv(key) for key in ("CLIENT_ID", "CLIENT_SECRET", "USER_AGENT")
)

# Initialise the Reddit API client with those credentials.
reddit = praw.Reddit(
    user_agent=USER_AGENT,
    client_secret=CLIENT_SECRET,
    client_id=CLIENT_ID,
)

# Subreddits to scrape.
subreddits = ["politics"]

# Fungsi untuk mengumpulkan data
def collect_reddit_data(target_count=10000, subreddit_names=None, client=None, delay=1):
    """Collect comments from the "hot" listing of one or more subreddits.

    Args:
        target_count: Maximum number of comment records to gather. Collection
            stops as soon as this many records have been stored.
        subreddit_names: Iterable of subreddit names to scrape. Defaults to
            the module-level ``subreddits`` list.
        client: Object exposing ``subreddit(name)`` (normally the module-level
            ``reddit`` client, which is used when this is None).
        delay: Seconds to pause after each processed post to stay under the
            API rate limit. Pass 0 to disable the pause.

    Returns:
        A list of dicts, one per comment, with the keys ``Subreddit``,
        ``Post_Title``, ``Username``, ``Comment``, ``Score``,
        ``Content_Link`` and ``Timestamp`` (``YYYY-MM-DD HH:MM:SS``, UTC).
    """
    if subreddit_names is None:
        subreddit_names = subreddits
    if client is None:
        client = reddit

    data = []
    posts_processed = 0

    for subreddit_name in subreddit_names:
        if len(data) >= target_count:
            break

        subreddit = client.subreddit(subreddit_name)

        # Walk the posts of the "hot" listing.
        for submission in subreddit.hot(limit=None):
            if len(data) >= target_count:
                break

            posts_processed += 1
            print(f"Processing post {posts_processed} - Current data count: {len(data)}")

            # limit=0 drops the unexpanded "more comments" placeholders without
            # fetching them, so only the comments already loaded are iterated.
            submission.comments.replace_more(limit=0)

            # The Reddit permalink of the post is used as Content_Link.
            content_link = f"https://www.reddit.com{submission.permalink}"

            for comment in submission.comments.list():
                if len(data) >= target_count:
                    break

                try:
                    # created_utc is a UTC epoch; format it in UTC explicitly so
                    # the stored value does not depend on the machine's timezone.
                    created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
                    comment_data = {
                        'Subreddit': subreddit_name,
                        'Post_Title': submission.title,
                        'Username': comment.author.name if comment.author else '[deleted]',
                        'Comment': comment.body,
                        'Score': comment.score,  # upvotes minus downvotes
                        'Content_Link': content_link,
                        'Timestamp': created.strftime('%Y-%m-%d %H:%M:%S'),
                    }
                    data.append(comment_data)

                except AttributeError as e:
                    # Skip objects that lack the expected comment attributes.
                    print(f"Error processing comment: {e}")
                    continue

            # Target reached: stop now instead of sleeping and fetching one
            # more submission just to discover we are done.
            if len(data) >= target_count:
                break

            # Pause between posts to avoid hitting the rate limit.
            if delay:
                time.sleep(delay)

    return data

# Fungsi untuk menyimpan ke TXT
def save_to_txt(data, filename="reddit_data.txt"):
    """Write the records to a UTF-8 text file as labelled blocks.

    Each record becomes seven ``Label: value`` lines followed by a line of
    50 dashes. An existing file of the same name is overwritten.

    Args:
        data: Iterable of dicts holding the keys ``Subreddit``,
            ``Post_Title``, ``Username``, ``Comment``, ``Score``,
            ``Content_Link`` and ``Timestamp``.
        filename: Destination path.
    """
    # (printed label, record key) pairs in output order.
    layout = (
        ("Subreddit", "Subreddit"),
        ("Post Title", "Post_Title"),
        ("Username", "Username"),
        ("Comment", "Comment"),
        ("Score", "Score"),
        ("Content Link", "Content_Link"),
        ("Timestamp", "Timestamp"),
    )
    divider = "-" * 50 + "\n"

    with open(filename, 'w', encoding='utf-8') as out:
        for record in data:
            out.writelines(f"{label}: {record[key]}\n" for label, key in layout)
            out.write(divider)

# Fungsi untuk menyimpan ke CSV
def save_to_csv(data, filename="reddit_data.csv"):
    """Write the records to a UTF-8 CSV file without the index column.

    Args:
        data: Records accepted by ``pd.DataFrame`` (here a list of dicts,
            one per comment); the dict keys become the CSV header.
        filename: Destination path; an existing file is overwritten.
    """
    pd.DataFrame(data).to_csv(filename, encoding='utf-8', index=False)

# Main execution
def main():
    """Run the scrape, write both output files and report totals and runtime."""
    print("Starting Reddit data collection...")
    started_at = time.time()

    # Gather the comments.
    comments = collect_reddit_data()

    # Persist them in both formats (text first, then CSV).
    print("Saving data to files...")
    for writer in (save_to_txt, save_to_csv):
        writer(comments)

    # Wall-clock duration of the whole run, in minutes.
    elapsed_minutes = (time.time() - started_at) / 60

    print("Data collection completed!")
    print(f"Total data collected: {len(comments)}")
    print(f"Execution time: {elapsed_minutes:.2f} minutes")


if __name__ == "__main__":
    main()

Starting Reddit data collection...
Processing post 1 - Current data count: 0
Processing post 2 - Current data count: 1
Processing post 3 - Current data count: 501
Processing post 4 - Current data count: 643
Processing post 5 - Current data count: 1064
Processing post 6 - Current data count: 1508
Processing post 7 - Current data count: 1631
Processing post 8 - Current data count: 2108
Processing post 9 - Current data count: 2197
Processing post 10 - Current data count: 2249
Processing post 11 - Current data count: 2362
Processing post 12 - Current data count: 2857
Processing post 13 - Current data count: 3322
Processing post 14 - Current data count: 3517
Processing post 15 - Current data count: 3575
Processing post 16 - Current data count: 3608
Processing post 17 - Current data count: 4066
Processing post 18 - Current data count: 4142
Processing post 19 - Current data count: 4224
Processing post 20 - Current data count: 4723
Processing post 21 - Current data count: 4768
Processing post 

In [3]:
import pandas as pd

# Fungsi untuk mendeteksi data unik di kolom Subreddit
def detect_unique_subreddits(csv_file="reddit_data.csv"):
    """Print and return the distinct values of the ``Subreddit`` column.

    Args:
        csv_file: Path of the CSV file to inspect.

    Returns:
        The array of unique ``Subreddit`` values, or None when the file is
        missing, the column is absent, or any other error occurs (a message
        is printed in each of those cases).
    """
    divider = "-" * 30
    try:
        # Load the CSV and pull the distinct values of the 'Subreddit' column.
        found = pd.read_csv(csv_file)['Subreddit'].unique()

        # Report the result.
        print("Data Unik di Kolom Subreddit:")
        print(divider)
        for name in found:
            print(name)
        print(divider)
        print(f"Total jumlah subreddit unik: {len(found)}")

        return found

    except FileNotFoundError:
        print(f"File {csv_file} tidak ditemukan. Pastikan file sudah ada.")
    except KeyError:
        print("Kolom 'Subreddit' tidak ditemukan di file CSV.")
    except Exception as e:
        print(f"Terjadi kesalahan: {e}")

    # Only reached after one of the handlers above has run.
    return None

# Main execution
def main():
    """Run the unique-subreddit check and print whether it succeeded."""
    print("Memulai deteksi data unik di kolom Subreddit...")
    result = detect_unique_subreddits()

    # None signals that the detection step failed.
    status = "Proses selesai!" if result is not None else "Proses gagal."
    print(status)


if __name__ == "__main__":
    main()

Memulai deteksi data unik di kolom Subreddit...
Data Unik di Kolom Subreddit:
------------------------------
politics
------------------------------
Total jumlah subreddit unik: 1
Proses selesai!
