## Install Dependencies and Imports

In [None]:
# Install required packages
!pip install requests google-cloud-storage pandas python-dotenv beautifulsoup4

In [None]:
# Imports
import os
import json
import logging
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import requests
import time
import re
from google.cloud import storage
import warnings
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

# Logging setup for the notebook: INFO and above, rendered as
# "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger used by the helper functions and classes defined in later cells.
logger = logging.getLogger(__name__)

# Both banner lines are emitted by one call; output is identical to two prints.
print(
    "✅ All packages imported successfully!",
    "🔥 Ready to collect high-quality tech discourse from Hacker News!",
    sep="\n",
)

## Configuration Setup

In [None]:
# Hacker News API Configuration (no authentication needed!)
HN_API_BASE = "https://hacker-news.firebaseio.com/v0"

# GCP Configuration
# The '...' values are placeholders and must be replaced before the upload
# cells are run.
GCP_CONFIG = {
    'project_id': '...',  # Your project ID
    'bucket_name': '...',  # Your bucket name
    'credentials_path': None  # Set if using service account key file
}

# Collection Parameters
COLLECTION_PARAMS = {
    'max_stories': 200,          # Number of stories to check
    'max_comments_per_story': 20, # Comments per relevant story
    'min_comment_length': 15,     # Minimum comment character length
    'max_comment_depth': 3,       # How deep in comment threads to go
    'stories_lookback_hours': 168 # Look back 1 week (168 hours)
}

# OpenAI-related keywords (expanded for HN tech discourse).
# All entries are lowercase because they are compared against lowercased text.
# NOTE: the phrase is 'machine learning' on its own; the previous entry
# 'machine learning ml' fused two terms into a string that practically never
# occurs in real text, so machine-learning stories were silently missed.
OPENAI_KEYWORDS = [
    'openai', 'chatgpt', 'gpt-4', 'gpt-3', 'gpt', 'dall-e', 'dalle',
    'sam altman', 'artificial general intelligence', 'agi',
    'large language model', 'llm', 'transformer', 'generative ai',
    'artificial intelligence', 'machine learning', 'neural network',
    'deep learning', 'natural language processing', 'nlp',
    'prompt engineering', 'fine-tuning', 'ai safety', 'alignment',
    'microsoft openai', 'github copilot', 'ai assistant'
]

print("⚙️ Configuration complete!")
print(f"🎯 Targeting stories with keywords: {', '.join(OPENAI_KEYWORDS[:5])}...")
print(f"📊 Will collect up to {COLLECTION_PARAMS['max_stories']} stories")

## Hacker News API Client

In [None]:
class HackerNewsAPI:
    """Client for interacting with Hacker News API.

    All public methods are best-effort: any failure is logged through the
    module-level ``logger`` and reported to the caller as ``None`` (single
    item) or an empty list (story listings) instead of raising.
    """

    def __init__(self, base_url: str = HN_API_BASE, timeout: float = 10.0):
        """Create a client.

        Args:
            base_url: Root URL of the API, without a trailing slash.
            timeout: Seconds to wait for each HTTP request. Without a
                timeout a stalled connection would block the notebook
                indefinitely.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'OpenAI-Sentiment-Collector/1.0'
        })

    def _get_json(self, path: str) -> Any:
        """GET ``<base_url>/<path>.json`` and return the decoded JSON body.

        Raises whatever the HTTP layer or JSON decoding raises; the public
        methods are responsible for turning that into a fallback value.
        """
        response = self.session.get(
            f"{self.base_url}/{path}.json", timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _get_story_ids(self, kind: str, limit: int) -> List[int]:
        """Return up to ``limit`` ids from the ``<kind>stories`` listing."""
        try:
            story_ids = self._get_json(f"{kind}stories")
            return story_ids[:limit]
        except Exception as e:
            # Best-effort boundary: log and fall back to an empty listing.
            logger.error(f"Error fetching {kind} stories: {e}")
            return []

    def get_item(self, item_id: int) -> Optional[Dict]:
        """Get a single item (story, comment, etc.) by ID.

        Returns the decoded item, or ``None`` when the request fails.
        """
        try:
            return self._get_json(f"item/{item_id}")
        except Exception as e:
            logger.error(f"Error fetching item {item_id}: {e}")
            return None

    def get_top_stories(self, limit: int = 500) -> List[int]:
        """Get top story IDs (at most ``limit``); ``[]`` on failure."""
        return self._get_story_ids('top', limit)

    def get_new_stories(self, limit: int = 500) -> List[int]:
        """Get new story IDs (at most ``limit``); ``[]`` on failure."""
        return self._get_story_ids('new', limit)

    def get_best_stories(self, limit: int = 200) -> List[int]:
        """Get best story IDs (at most ``limit``); ``[]`` on failure."""
        return self._get_story_ids('best', limit)

In [None]:
# Create the shared Hacker News client with its default base URL. The
# collection functions in later cells call this module-level `hn_api`.
hn_api = HackerNewsAPI()

In [None]:
# Smoke test: request item 1 and report whether the client got a response.
print("🧪 Testing Hacker News API connection...")
test_item = hn_api.get_item(1)  # Get the first HN item ever
if not test_item:
    print("❌ API connection failed")
else:
    first_title = test_item.get('title', 'No title')
    first_author = test_item.get('by', 'unknown')
    print("✅ API connection successful!")
    print(f"   First HN item: '{first_title}' by {first_author}")

print("🔧 Hacker News API client ready!")

## Data Processing Functions

In [None]:
def contains_openai_keywords(text: str, keywords: Optional[List[str]] = None) -> bool:
    """Check if text contains OpenAI-related keywords.

    Matching is case-insensitive and anchored on letter boundaries, so a
    short keyword such as ``'agi'`` no longer fires inside unrelated words
    ("imagine", "magic", "packaging"). A trailing plural "s" is tolerated
    ("LLMs", "transformers") and digits may follow a keyword ("GPT4").

    Args:
        text: Text to inspect; ``None`` or empty yields ``False``.
        keywords: Keywords/phrases to look for. Defaults to the module-level
            ``OPENAI_KEYWORDS`` list.

    Returns:
        ``True`` if at least one keyword occurs in ``text``.
    """
    if not text:
        return False
    if keywords is None:
        keywords = OPENAI_KEYWORDS

    # Empty entries are dropped: an empty alternative would match everywhere.
    alternation = '|'.join(re.escape(keyword.lower()) for keyword in keywords if keyword)
    if not alternation:
        return False

    # (?<![a-z]) / (?![a-z]): the keyword must not be embedded in a longer
    # run of letters; the text is lowercased first so [a-z] is sufficient
    # for the ASCII keywords used here.
    pattern = rf'(?<![a-z])(?:{alternation})s?(?![a-z])'
    return re.search(pattern, text.lower()) is not None

def clean_text(text: str) -> str:
    """Clean and preprocess text.

    Steps, in order: strip HTML tags and decode entities, drop URLs, drop
    the ``[flagged]`` / ``[dead]`` markers, then collapse all whitespace
    runs to single spaces and trim the ends.

    Args:
        text: Raw (possibly HTML) text; ``None`` or empty yields ``""``.

    Returns:
        The cleaned, single-spaced text.
    """
    if not text:
        return ""

    # Use a space separator so text from adjacent elements is not glued
    # together (e.g. "first.<p>Second" would otherwise become "first.Second").
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove common HN artifacts
    text = re.sub(r'\[(?:flagged|dead)\]', '', text)

    # Collapse whitespace last so the removals above cannot leave double
    # spaces behind; split()/join also trims leading and trailing whitespace.
    return ' '.join(text.split())

def extract_story_data(story_item: Dict) -> Optional[Dict[str, Any]]:
    """Build a flat record for a story, or return None if it is not usable.

    A story is rejected when the item is empty, flagged as deleted or dead,
    or when its title plus body text does not pass
    ``contains_openai_keywords``. Any exception raised while building the
    record is logged and also results in ``None``.

    Note: ``created_date`` is produced with ``datetime.fromtimestamp`` and is
    therefore a naive timestamp in the local timezone of the machine.
    """
    try:
        unusable = (
            not story_item
            or story_item.get('deleted')
            or story_item.get('dead')
        )
        if unusable:
            return None

        raw_title = story_item.get('title', '')
        raw_body = story_item.get('text', '')
        combined = f"{raw_title} {raw_body}".strip()

        # Relevance is decided on the raw (uncleaned) title + body.
        if not contains_openai_keywords(combined):
            return None

        posted_at = story_item.get('time', 0)
        combined_clean = clean_text(combined)

        record = {
            'id': story_item['id'],
            'title': clean_text(raw_title),
            'text': clean_text(raw_body),
            'url': story_item.get('url', ''),
            'score': story_item.get('score', 0),
            'descendants': story_item.get('descendants', 0),  # comment count
            'time': posted_at,
            'created_date': datetime.fromtimestamp(posted_at).strftime('%Y-%m-%d %H:%M:%S'),
            'author': story_item.get('by', 'unknown'),
            'content_type': 'story',
            'full_text': combined_clean,
            'text_length': len(combined_clean),
            'kids': story_item.get('kids', []),  # comment IDs
        }
        return record
    except Exception as e:
        logger.error(f"Error extracting story data: {e}")
        return None

def extract_comment_data(comment_item: Dict, story_id: int, depth: int = 0) -> Optional[Dict[str, Any]]:
    """Build a flat record for a comment, or return None if it is not usable.

    A comment is rejected when the item is empty, flagged as deleted or dead,
    has no text, or when either its raw or its cleaned text is shorter than
    ``COLLECTION_PARAMS['min_comment_length']``. Any exception raised while
    building the record is logged and also results in ``None``.

    Args:
        comment_item: Raw item dict as returned by the API.
        story_id: Id of the story the comment belongs to; also used as the
            parent id when the item carries no ``parent`` field.
        depth: Nesting level of the comment, stored verbatim in the record.
    """
    try:
        if not comment_item:
            return None
        if comment_item.get('deleted') or comment_item.get('dead'):
            return None

        raw = comment_item.get('text', '')
        if not raw:
            return None

        min_length = COLLECTION_PARAMS['min_comment_length']
        if len(raw) < min_length:
            return None

        cleaned = clean_text(raw)
        if len(cleaned) < min_length:
            return None

        posted_at = comment_item.get('time', 0)
        record = {
            'id': comment_item['id'],
            'parent_id': comment_item.get('parent', story_id),
            'story_id': story_id,
            'text': cleaned,
            'time': posted_at,
            'created_date': datetime.fromtimestamp(posted_at).strftime('%Y-%m-%d %H:%M:%S'),
            'author': comment_item.get('by', 'unknown'),
            'content_type': 'comment',
            'full_text': cleaned,
            'text_length': len(cleaned),
            'depth': depth,
            'kids': comment_item.get('kids', []),  # reply IDs
        }
        return record
    except Exception as e:
        logger.error(f"Error extracting comment data: {e}")
        return None

print("📝 Data processing functions defined!")

## Story Collection Function

In [None]:
def collect_relevant_stories() -> List[Dict[str, Any]]:
    """Collect OpenAI-related stories from HN.

    Story ids are gathered from the top, new and best listings (in that
    priority order), de-duplicated, and capped at
    ``COLLECTION_PARAMS['max_stories']``. Each id is fetched through the
    module-level ``hn_api`` client, skipped if older than
    ``COLLECTION_PARAMS['stories_lookback_hours']``, and kept only when
    ``extract_story_data`` accepts it.

    Returns:
        The list of extracted story records (possibly empty).
    """
    print("📱 Collecting OpenAI-related stories from Hacker News...")

    # Get story IDs from multiple sources
    print("🔍 Fetching story lists...")
    top_stories = hn_api.get_top_stories(limit=100)
    new_stories = hn_api.get_new_stories(limit=100)
    best_stories = hn_api.get_best_stories(limit=50)

    # De-duplicate while keeping listing order. A plain set() would scramble
    # the ids, making the max_stories cut-off below pick an arbitrary subset.
    all_story_ids = list(dict.fromkeys(top_stories + new_stories + best_stories))
    print(f"   Found {len(all_story_ids)} unique stories to check")

    ids_to_check = all_story_ids[:COLLECTION_PARAMS['max_stories']]
    total_to_check = len(ids_to_check)

    # Stories posted before this Unix timestamp are ignored.
    cutoff_time = time.time() - (COLLECTION_PARAMS['stories_lookback_hours'] * 3600)

    collected_stories = []

    for processed_count, story_id in enumerate(ids_to_check, start=1):
        if processed_count % 20 == 0:
            # Report progress against the number of ids that will actually
            # be processed, not the size of the uncapped list.
            print(f"   📊 Processed {processed_count}/{total_to_check} stories, found {len(collected_stories)} relevant")

        story_item = hn_api.get_item(story_id)

        # Rate limiting: pause after every request, including the ones whose
        # result is discarded below.
        time.sleep(0.1)

        if not story_item:
            continue

        # Check if story is recent enough
        if story_item.get('time', 0) < cutoff_time:
            continue

        # Extract and check relevance
        story_data = extract_story_data(story_item)
        if story_data:
            collected_stories.append(story_data)
            print(f"   ✅ Found: '{story_data['title'][:60]}...' (Score: {story_data['score']})")

    print(f"\n📈 Story collection complete:")
    print(f"   Processed: {total_to_check} stories")
    print(f"   Relevant: {len(collected_stories)} OpenAI-related stories")

    return collected_stories

In [None]:
# Collect stories
# Runs the full story scan (network calls through `hn_api`) and keeps the
# resulting list of story records for the comment-collection cells below.
relevant_stories = collect_relevant_stories()

## Comment Collection Function

In [None]:
def collect_comments_for_story(story_data: Dict, max_comments: int = 20) -> List[Dict[str, Any]]:
    """Collect comments for a specific story.

    Walks the comment tree breadth-first, starting from the first
    ``max_comments`` top-level comment ids, and stops once ``max_comments``
    usable comments have been collected. Replies are followed only below
    comments that were themselves accepted by ``extract_comment_data``, at
    most 5 per comment, and never deeper than
    ``COLLECTION_PARAMS['max_comment_depth']``.

    Args:
        story_data: Story record containing ``'id'`` and ``'kids'``.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of comment records (empty when the story has no comments).
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the front of the queue where list.pop(0) is O(n).
    from collections import deque

    collected_comments: List[Dict[str, Any]] = []

    top_level_ids = story_data.get('kids')
    if not top_level_ids:
        return collected_comments

    max_depth = COLLECTION_PARAMS['max_comment_depth']
    comment_queue = deque((kid_id, 0) for kid_id in top_level_ids[:max_comments])

    while comment_queue and len(collected_comments) < max_comments:
        comment_id, depth = comment_queue.popleft()

        if depth > max_depth:
            continue

        comment_item = hn_api.get_item(comment_id)

        # Rate limiting: pause after every request, including failed ones.
        time.sleep(0.05)

        if not comment_item:
            continue

        comment_data = extract_comment_data(comment_item, story_data['id'], depth)
        if not comment_data:
            continue

        collected_comments.append(comment_data)

        # Add replies to queue if we haven't gone too deep
        if depth < max_depth:
            for kid_id in (comment_item.get('kids') or [])[:5]:  # Limit replies per comment
                comment_queue.append((kid_id, depth + 1))

    return collected_comments

def collect_all_comments(stories: List[Dict]) -> List[Dict[str, Any]]:
    """Gather comments for every story in ``stories`` into one flat list.

    Each story is handed to ``collect_comments_for_story`` with the
    per-story cap from ``COLLECTION_PARAMS['max_comments_per_story']``;
    progress is printed along the way and a short pause follows each story.
    """
    print("💬 Collecting comments for relevant stories...")

    gathered = []
    story_total = len(stories)

    for position, story in enumerate(stories, start=1):
        print(f"   📝 Processing comments for story {position}/{story_total}: '{story['title'][:50]}...'")

        per_story_cap = COLLECTION_PARAMS['max_comments_per_story']
        story_comments = collect_comments_for_story(story, per_story_cap)
        gathered += story_comments

        print(f"      Found {len(story_comments)} comments")

        # Pause between stories to stay gentle on the API.
        time.sleep(0.2)

    print(f"\n💬 Comment collection complete: {len(gathered)} comments collected")
    return gathered

In [None]:
# Gather comments only when the story scan produced something; otherwise
# leave the comment list empty and say so.
if not relevant_stories:
    collected_comments = []
    print("⚠️ No relevant stories found, skipping comment collection")
else:
    collected_comments = collect_all_comments(relevant_stories)

## Combine and Analyze Data

In [None]:
# Stories first, then comments, in one flat list of records.
all_collected_data = relevant_stories + collected_comments

print("📊 Data Analysis:")
print(f"   Total items collected: {len(all_collected_data)}")
print(f"   Stories: {len(relevant_stories)}")
print(f"   Comments: {len(collected_comments)}")

if not all_collected_data:
    print("⚠️ No data collected. Consider adjusting keywords or time range.")
else:
    # Tabular view of every record; `df` and `story_df` are reused by the
    # upload and summary cells further down.
    df = pd.DataFrame(all_collected_data)

    earliest = df['created_date'].min()
    latest = df['created_date'].max()
    print(f"   Date range: {earliest} to {latest}")
    print(f"   Average text length: {df['text_length'].mean():.0f} characters")

    # Stories only, highest score first.
    is_story = df['content_type'] == 'story'
    story_df = df[is_story].sort_values('score', ascending=False)
    print(f"\n🔥 Top stories by score:")
    for _, story in story_df.head(5).iterrows():
        print(f"   • {story['title'][:70]}... (Score: {story['score']}, Comments: {story['descendants']})")

    # Comment statistics are shown only when comments were collected.
    if collected_comments:
        comment_df = df[df['content_type'] == 'comment']
        print(f"\n💬 Comment insights:")
        print(f"   Average comment length: {comment_df['text_length'].mean():.0f} characters")
        print(f"   Comment depth distribution: {comment_df['depth'].value_counts().to_dict()}")

    # Preview of the first three records, truncated to 100 characters.
    print(f"\n🔍 Sample collected content:")
    for _, item in df.head(3).iterrows():
        sample_text = item['full_text']
        if len(sample_text) > 100:
            content_preview = sample_text[:100] + "..."
        else:
            content_preview = sample_text
        print(f"   [{item['content_type']}] {content_preview}")

## GCP Upload Setup

In [None]:
class GCPUploader:
    """Handles uploading data to Google Cloud Storage.

    Objects are written under three prefixes of the configured bucket:
    ``raw_data/`` (JSON), ``processed_data/`` (CSV) and ``metadata/``.
    Every upload method returns the ``gs://`` URI of the written object and
    re-raises upload failures after printing a short message.
    """

    def __init__(self, gcp_config: dict):
        """Connect to the bucket described by ``gcp_config``.

        Args:
            gcp_config: Mapping with ``'project_id'`` and ``'bucket_name'``;
                an optional ``'credentials_path'`` is exported through the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable
                before the storage client is created.
        """
        credentials_path = gcp_config.get('credentials_path')
        if credentials_path:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

        self.client = storage.Client(project=gcp_config['project_id'])
        self.bucket_name = gcp_config['bucket_name']
        self.bucket = self.client.bucket(self.bucket_name)

        print(f"☁️ Connected to GCS bucket: {self.bucket_name}")

    @staticmethod
    def _timestamped_filename(stem: str, extension: str) -> str:
        """Return ``<stem>_<YYYYmmdd_HHMMSS>.<extension>`` for the current time."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"{stem}_{timestamp}.{extension}"

    def _upload_text(self, folder: str, filename: str, payload: str,
                     content_type: str, label: str) -> str:
        """Write ``payload`` to ``<folder>/<filename>`` and return its gs:// URI.

        ``label`` is the human-readable name used in the status messages.
        Upload errors are printed and then re-raised unchanged.
        """
        try:
            blob = self.bucket.blob(f"{folder}/{filename}")
            blob.upload_from_string(payload, content_type=content_type)
        except Exception as e:
            print(f"❌ Failed to upload {label}: {e}")
            raise

        gcs_uri = f"gs://{self.bucket_name}/{folder}/{filename}"
        # The success message starts with the label capitalised.
        print(f"✅ {label[0].upper() + label[1:]} uploaded to: {gcs_uri}")
        return gcs_uri

    def upload_json_data(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> str:
        """Upload collected data as JSON to GCS bucket (``raw_data/``).

        Args:
            data: Records to serialise; values that are not JSON-native are
                converted with ``str``.
            filename: Object name; a timestamped name is generated if omitted.

        Returns:
            The ``gs://`` URI of the uploaded object.
        """
        if not filename:
            filename = self._timestamped_filename("hackernews_openai_data", "json")

        json_data = json.dumps(data, indent=2, default=str)
        return self._upload_text("raw_data", filename, json_data,
                                 'application/json', "JSON data")

    def upload_csv_data(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> str:
        """Convert to DataFrame and upload as CSV (``processed_data/``).

        Args:
            data: Records to tabulate, one row per record.
            filename: Object name; a timestamped name is generated if omitted.

        Returns:
            The ``gs://`` URI of the uploaded object.
        """
        df = pd.DataFrame(data)

        if not filename:
            filename = self._timestamped_filename("hackernews_openai_data", "csv")

        csv_data = df.to_csv(index=False)
        return self._upload_text("processed_data", filename, csv_data,
                                 'text/csv', "CSV data")

    def upload_metadata(self, data_stats: dict, filename: Optional[str] = None) -> str:
        """Upload collection metadata (``metadata/``).

        The uploaded document combines the current timestamp, the data source
        name, ``COLLECTION_PARAMS``, ``OPENAI_KEYWORDS`` and every entry of
        ``data_stats`` (which wins on key collisions).

        Args:
            data_stats: Run statistics to merge into the metadata document.
            filename: Object name; a timestamped name is generated if omitted.

        Returns:
            The ``gs://`` URI of the uploaded object.
        """
        if not filename:
            filename = self._timestamped_filename("hn_metadata", "json")

        metadata = {
            'collection_timestamp': datetime.now().isoformat(),
            'data_source': 'hacker_news',
            'collection_params': COLLECTION_PARAMS,
            'openai_keywords': OPENAI_KEYWORDS,
            **data_stats
        }

        metadata_json = json.dumps(metadata, indent=2, default=str)
        return self._upload_text("metadata", filename, metadata_json,
                                 'application/json', "metadata")

print("☁️ GCP Uploader class defined!")

## Upload to GCS

In [None]:
if not all_collected_data:
    print("⚠️ No data to upload. Try adjusting collection parameters or keywords.")
else:
    # Connect to the bucket described by GCP_CONFIG.
    uploader = GCPUploader(GCP_CONFIG)

    # Guards against computing statistics on empty frames.
    has_rows = not df.empty
    has_stories = not story_df.empty

    # Summary figures stored alongside the data in the metadata document.
    data_stats = {
        'total_records': len(all_collected_data),
        'stories_count': len(relevant_stories),
        'comments_count': len(collected_comments),
        'date_range_start': df['created_date'].min() if has_rows else None,
        'date_range_end': df['created_date'].max() if has_rows else None,
        'avg_score': float(story_df['score'].mean()) if has_stories else 0,
        'avg_text_length': float(df['text_length'].mean()) if has_rows else 0,
        'total_story_score': int(story_df['score'].sum()) if has_stories else 0
    }

    print("📤 Uploading data to GCS...")

    try:
        # Three artefacts per run: raw JSON, tabular CSV, run metadata.
        json_uri = uploader.upload_json_data(all_collected_data)
        csv_uri = uploader.upload_csv_data(all_collected_data)
        metadata_uri = uploader.upload_metadata(data_stats)
    except Exception as e:
        # Best-effort at notebook level: report the failure and carry on.
        print(f"❌ Upload failed: {e}")
    else:
        print(f"\n🎉 Upload completed successfully!")
        print(f"   📁 JSON: {json_uri}")
        print(f"   📊 CSV: {csv_uri}")
        print(f"   📋 Metadata: {metadata_uri}")


## Final Summary and Next Steps

In [None]:
# Closing banner.
banner_rule = "=" * 70
print(banner_rule, "🚀 HACKER NEWS DATA COLLECTION COMPLETED!", banner_rule, sep="\n")

if not all_collected_data:
    print(
        "\n⚠️ No data collected this run. Consider:",
        "   • Expanding keywords list",
        "   • Increasing lookback time window",
        "   • Checking if there are recent OpenAI discussions on HN",
        sep="\n",
    )
else:
    have_stories = not story_df.empty

    print("\n📊 Final Statistics:")
    print(f"   • Total OpenAI-related items: {len(all_collected_data)}")
    print(f"   • Stories: {len(relevant_stories)}")
    print(f"   • Comments: {len(collected_comments)}")
    if have_stories:
        print(f"   • Average story score: {story_df['score'].mean():.1f}")
        print(f"   • Total engagement: {story_df['descendants'].sum()} comments across all stories")
    else:
        print("   • No stories collected")
        print("")
    print(f"   • Average text length: {df['text_length'].mean():.0f} characters")

    print("\n🎯 Data Quality Insights:")
    print("   • High-quality tech discourse from HN community")
    print("   • Mix of technical and business perspectives")
    print(f"   • Recent discussions (last {COLLECTION_PARAMS['stories_lookback_hours']} hours)")
    print(f"   • Engaged community (average story score: {story_df['score'].mean():.1f})")

    print(
        "\n🔮 Next Steps:",
        "   1. ✅ High-quality HN data collected and stored in GCS",
        "   2. 🏷️  Create sentiment labeling pipeline (HN tends to be more analytical)",
        "   3. 🔧 Preprocess data for BERT fine-tuning",
        "   4. 🎯 Fine-tune your existing BERT model on this tech-focused data",
        "   5. 📈 Compare performance: movie reviews → HN tech discourse",
        sep="\n",
    )

    print(
        "\n💡 Pro Tips:",
        "   • HN discussions are often more nuanced than binary sentiment",
        "   • Consider multi-class labels: positive/negative/neutral/analytical",
        "   • Tech jargon and acronyms are common - great for domain adaptation",
        "   • High signal-to-noise ratio compared to other social platforms",
        sep="\n",
    )

    # Recap of the three highest-scoring stories.
    if have_stories:
        print("\n🔥 Top collected stories:")
        for _, top_story in story_df.head(3).iterrows():
            print(f"   • [{top_story['score']} pts] {top_story['title']}")
            print(f"     {top_story['descendants']} comments | {top_story['created_date']}")

print("\n🎊 Ready for the next phase: Transform this high-quality tech discourse into training data!")