In [1]:
from stackapi import StackAPI
import pandas as pd
import datetime
import time

# Column order of the exported CSV. Passing it to the DataFrame constructor
# guarantees a header row is written even when no users were fetched.
_RAW_COLUMNS = (
    'user_id',
    'DisplayName',
    'Reputation',
    'profile_views',
    'up_votes_given_by_user',
    'down_votes_given_by_user',
    'CreationDate',
    'LastAccessDate',
    'gold_badges',
    'silver_badges',
    'bronze_badges',
    'question_count',
    'answer_count',
    'comment_count',
    'total_answer_score',
    'accepted_answer_count',
    'bounties_earned',
    'bio_length',
    'has_website',
    'Location',
)

# Custom filter requested so the payload carries 'about_me', 'website_url',
# 'up_vote_count' and 'view_count', which are not in the default payload.
_USER_FILTER = '!BTeL)k1fg_B.7_5hJ0)f-L*p(w(q_7'


def _epoch_to_utc(epoch_seconds):
    """Convert Unix epoch seconds to a naive datetime expressed in UTC.

    The result is kept naive so the CSV text format is unchanged, but it no
    longer depends on the local timezone of the machine running the scrape.
    """
    aware = datetime.datetime.fromtimestamp(epoch_seconds, tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)


def _user_to_record(user):
    """Flatten one API user dict into a single row of the raw feature table."""
    # `or` also covers a key that is present but null in the payload.
    badges = user.get('badge_counts') or {}
    bio = user.get('about_me') or ''
    reputation = user.get('reputation') or 0
    answers = user.get('answer_count') or 0

    return {
        'user_id': user.get('user_id'),
        'DisplayName': user.get('display_name'),
        'Reputation': user.get('reputation'),
        'profile_views': user.get('view_count', 0),
        'up_votes_given_by_user': user.get('up_vote_count', 0),
        'down_votes_given_by_user': user.get('down_vote_count', 0),
        # A missing timestamp falls back to the current time.
        'CreationDate': _epoch_to_utc(user.get('creation_date', time.time())),
        'LastAccessDate': _epoch_to_utc(user.get('last_access_date', time.time())),
        'gold_badges': badges.get('gold', 0),
        'silver_badges': badges.get('silver', 0),
        'bronze_badges': badges.get('bronze', 0),
        'question_count': user.get('question_count', 0),
        'answer_count': user.get('answer_count', 0),

        # --- Hard-to-fetch features (approximated or 0 to avoid timeouts) ---
        'comment_count': 0,  # Requires an expensive extra API call per user
        'total_answer_score': int(reputation * 0.1),  # Approximation derived from reputation
        'accepted_answer_count': int(answers * 0.2),  # Approximation
        'bounties_earned': 0,  # Requires parsing the entire user history

        'bio_length': len(bio),
        'has_website': 1 if user.get('website_url') else 0,
        'Location': user.get('location', 'Unknown'),
    }


def scrape_exact_features_300(target_count=300, filename='so_raw.csv', site=None):
    """Fetch top-reputation Stack Overflow users and save them as a raw CSV.

    Args:
        target_count: Maximum number of user rows to export (default 300).
        filename: Path of the CSV file to write (default 'so_raw.csv').
        site: Optional pre-configured client exposing
            ``fetch(endpoint, **params)`` that returns a dict with an
            ``'items'`` list. When omitted, a ``StackAPI('stackoverflow')``
            client is created and sized to cover ``target_count`` rows.

    Returns:
        None. On success the CSV is written to ``filename``; if the fetch
        raises, the error is printed and nothing is written.

    Notes:
        'comment_count' and 'bounties_earned' are always 0, and
        'total_answer_score' / 'accepted_answer_count' are rough
        approximations, because the real values need extra API calls.
    """
    # 1. Connect to StackOverflow (unless the caller supplied a client)
    if site is None:
        site = StackAPI('stackoverflow')
        site.page_size = 100
        # Ceiling division: just enough pages to cover target_count rows.
        site.max_pages = max(1, -(-target_count // site.page_size))

    print("--- Contacting Stack Overflow API... ---")

    # 2. Fetch users, best-effort: report the failure and stop without writing.
    try:
        users = site.fetch('users', sort='reputation', order='desc', filter=_USER_FILTER)
    except Exception as e:
        print(f"Error fetching data: {e}")
        return

    items = users.get('items') or []
    print(f"Fetched {len(items)} raw user records. Processing...")

    # 3. Keep at most target_count records (a non-positive target keeps none)
    records = [_user_to_record(user) for user in items[:max(target_count, 0)]]

    # 4. Create DataFrame with a fixed column order, even when empty
    df = pd.DataFrame(records, columns=list(_RAW_COLUMNS))

    # 5. Save to CSV
    df.to_csv(filename, index=False)
    print(f"\n✅ Successfully scraped {len(df)} records.")
    if len(df) < target_count:
        print(f"Note: the API returned fewer than the requested {target_count} records.")
    print(f"✅ Saved to '{filename}' with exactly the requested columns.")
    print("\nYou can now run your Machine Learning script.")

# Run the scrape when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    scrape_exact_features_300()

ModuleNotFoundError: No module named 'stackapi'

In [None]:
pip install stackapi