In [None]:
import re
import os
import json
import datetime
import requests
import time
from collections import deque
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' data package for nltk.
# NOTE(review): presumably needed by the SentimentIntensityAnalyzer imported
# above, which is not used in the cells shown here — confirm it is still required.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/andrewosborne/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Attempt 2: `ScraperPipeline` class for scraping YouTube and Twitter search results

In [11]:
class ScraperPipeline:
    """Scrape one page of search results for a keyword and save them as JSON.

    Supported platforms are ``"youtube"`` (fetched with ``requests``) and
    ``"twitter"`` (rendered with headless Chrome through Selenium). ``run()``
    downloads the search page, stores the raw HTML next to the results for
    debugging, extracts a list of dicts and writes it to ``output_file``
    inside ``output_dir``.

    Args:
        platform: Platform name; compared case-insensitively.
        keyword: Search term.
        output_file: JSON file name. Defaults to
            ``<platform>_<keyword>_<MMDD_HHMMSS>.json``.
        max_results: Upper bound on the number of extracted items.
        output_dir: Directory that receives the raw HTML and the JSON file;
            created on demand.
    """

    # Seconds to wait for the plain-HTTP download before giving up.
    REQUEST_TIMEOUT = 30
    # Seconds to let the headless browser execute the page's JavaScript.
    JS_RENDER_WAIT = 5
    # Multipliers for abbreviated counters such as "1.2K".
    _COUNT_SUFFIXES = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def __init__(self, platform, keyword, output_file=None, max_results=20, output_dir='scraped_data'):
        self.platform = platform.lower()
        self.keyword = keyword
        self.max_results = max_results
        self.output_dir = output_dir
        self.output_file = output_file or f"{self.platform}_{self._safe_keyword()}_{datetime.datetime.now().strftime('%m%d_%H%M%S')}.json"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
        }

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    def _safe_keyword(self):
        """Return the keyword with every character unsafe in a file name replaced by '_'."""
        # Spaces become '_' exactly as before; path separators and other
        # punctuation are replaced too so the keyword cannot escape output_dir.
        return re.sub(r'[^\w\-]', '_', self.keyword)

    def _ensure_output_dir(self):
        """Create ``output_dir`` if it is set and does not exist yet."""
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)

    @classmethod
    def _parse_count(cls, text):
        """Turn a displayed counter ("12", "1,234", "1.2K") into a digit string.

        Returns ``'0'`` when ``text`` contains no number.
        """
        # The suffix must be glued to the number and end a word, so that
        # "12 Bookmarks" is read as 12 and not as 12 billion.
        match = re.search(r'(\d[\d,]*(?:\.\d+)?)([KMB]\b)?', text or '', re.IGNORECASE)
        if not match:
            return '0'
        number = float(match.group(1).replace(',', ''))
        multiplier = cls._COUNT_SUFFIXES.get((match.group(2) or '').upper(), 1)
        return str(int(round(number * multiplier)))

    # ------------------------------------------------------------------
    # Download
    # ------------------------------------------------------------------
    def get_search_url(self):
        """Build the search URL for the configured platform.

        Raises:
            ValueError: If the platform is neither "youtube" nor "twitter".
        """
        if self.platform == "youtube":
            return f"https://www.youtube.com/results?search_query={quote_plus(self.keyword)}"
        elif self.platform == "twitter":
            return f"https://twitter.com/search?q={quote_plus(self.keyword)}&src=typed_query"
        else:
            raise ValueError(f"Unsupported platform: {self.platform}")

    def download_content(self, url):
        """Return the HTML of ``url`` as a string, or ``None`` if the download failed."""
        if self.platform == "twitter":
            # Twitter builds the page with JavaScript, so render it in headless Chrome.
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # Run in headless mode (no visible browser)
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument(f"user-agent={self.headers['User-Agent']}")

            driver = None
            try:
                driver = webdriver.Chrome(options=chrome_options)
                driver.get(url)

                # Wait for JavaScript to load content (adjust JS_RENDER_WAIT as needed)
                time.sleep(self.JS_RENDER_WAIT)

                # Page source after JavaScript has executed
                return driver.page_source
            except Exception as e:
                print(f"Error with Selenium browser: {e}")
                return None
            finally:
                # Always shut the browser down, including when navigation
                # failed; otherwise every error leaks a Chrome process.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception as e:
                        print(f"Error closing Selenium browser: {e}")
        else:
            # Keep using requests for other platforms like YouTube
            try:
                # A timeout keeps a stalled connection from hanging the kernel.
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                print(f"Error downloading content: {e}")
                return None

    # ------------------------------------------------------------------
    # YouTube
    # ------------------------------------------------------------------
    @staticmethod
    def _iter_video_renderers(data_json):
        """Yield every ``videoRenderer`` dict found in the parsed ``ytInitialData``."""
        sections = (
            data_json.get('contents', {})
            .get('twoColumnSearchResultsRenderer', {})
            .get('primaryContents', {})
            .get('sectionListRenderer', {})
            .get('contents', [])
        )
        for section in sections:
            for item in section.get('itemSectionRenderer', {}).get('contents', []):
                if 'videoRenderer' in item:
                    yield item['videoRenderer']

    @staticmethod
    def _build_video_info(video_data):
        """Map one ``videoRenderer`` dict to the flat result dict."""
        video_id = video_data.get('videoId', '')

        # "runs" are consecutive fragments of one string that already carry
        # their own whitespace, so they are concatenated without a separator.
        title_runs = video_data.get('title', {}).get('runs', [])
        title = ''.join(run.get('text', '') for run in title_runs)

        owner_runs = video_data.get('ownerText', {}).get('runs', [])
        channel_name = owner_runs[0].get('text', '') if owner_runs else ''

        # Accept both "1 view" and "1,234 views"; anything else counts as '0'.
        view_count_text = video_data.get('viewCountText', {}).get('simpleText', '')
        view_match = re.search(r'(\d[\d,]*) views?', view_count_text)
        view_count = view_match.group(1).replace(',', '') if view_match else '0'

        published_time = video_data.get('publishedTimeText', {}).get('simpleText', '')

        snippets = video_data.get('detailedMetadataSnippets') or [{}]
        snippet_runs = snippets[0].get('snippetText', {}).get('runs', [])
        description = ''.join(run.get('text', '') for run in snippet_runs)

        return {
            'platform': 'youtube',
            'title': title,
            'channel': channel_name,
            'description': description,
            'views': view_count,
            'published': published_time,
            'url': f"https://www.youtube.com/watch?v={video_id}",
            'video_id': video_id
        }

    def extract_youtube_data(self, html_content):
        """Extract up to ``max_results`` video dicts from a YouTube search page.

        Parsing problems are printed and whatever was collected so far is returned.
        """
        results = []

        # The search results are embedded in the page as a JSON literal.
        initial_data_match = re.search(r'var ytInitialData = (.+?);</script>', html_content)
        if not initial_data_match:
            print("Could not find YouTube initial data")
            return results

        try:
            data_json = json.loads(initial_data_match.group(1))
            for video_data in self._iter_video_renderers(data_json):
                # Checked before appending so max_results <= 0 yields nothing.
                if len(results) >= self.max_results:
                    break
                results.append(self._build_video_info(video_data))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON data: {e}")
        except Exception as e:
            print(f"Error extracting YouTube data: {e}")

        return results

    # ------------------------------------------------------------------
    # Twitter
    # ------------------------------------------------------------------
    def _stat_count(self, tweet_soup, test_id):
        """Return the counter shown in the element with ``data-testid=test_id``, or '0'."""
        element = tweet_soup.select_one(f'[data-testid="{test_id}"]')
        return self._parse_count(element.get_text()) if element else '0'

    def _build_tweet_info(self, tweet_soup):
        """Map one tweet ``<article>`` element to the flat result dict."""
        tweet_html = str(tweet_soup)

        # Display name
        author = ''
        author_element = tweet_soup.select_one('[data-testid="User-Name"]')
        if author_element:
            author_name_element = author_element.select_one('a span')
            if author_name_element:
                author = author_name_element.get_text().strip()

        # Handle: look inside the User-Name element first so an @mention or an
        # attribute elsewhere in the tweet is not mistaken for the author.
        username_pattern = re.compile(r'@([A-Za-z0-9_]+)')
        username_match = None
        if author_element:
            username_match = username_pattern.search(author_element.get_text())
        if not username_match:
            username_match = username_pattern.search(tweet_html)
        username = username_match.group(1) if username_match else ''

        # Tweet text
        text_element = tweet_soup.select_one('[data-testid="tweetText"]')
        text = text_element.get_text().strip() if text_element else ''

        # Timestamp
        time_element = tweet_soup.select_one('time')
        timestamp = time_element.get('datetime', '') if time_element else ''

        # Tweet ID and URL. The link may be absolute or site-relative
        # ("/user/status/123"), so the host part is optional.
        tweet_id = ''
        tweet_url = ''
        link_match = re.search(
            r'(?:https://(?:twitter|x)\.com)?/([A-Za-z0-9_]+)/status/(\d+)', tweet_html
        )
        if link_match:
            link_user, tweet_id = link_match.group(1), link_match.group(2)
            # Build the URL from the link itself so it is valid even when no
            # handle could be extracted above.
            tweet_url = f"https://twitter.com/{link_user}/status/{tweet_id}"
            username = username or link_user

        return {
            'platform': 'twitter',
            'author': author,
            'username': username,
            'text': text,
            'timestamp': timestamp,
            'replies': self._stat_count(tweet_soup, 'reply'),
            'retweets': self._stat_count(tweet_soup, 'retweet'),
            'likes': self._stat_count(tweet_soup, 'like'),
            'tweet_id': tweet_id,
            'url': tweet_url
        }

    def extract_twitter_data(self, html_content):
        """Extract up to ``max_results`` tweet dicts from a rendered Twitter search page.

        A tweet that fails to parse is reported and skipped.
        """
        results = []
        # Parse once and let the HTML parser locate the tweet articles instead
        # of matching tags with a regular expression and re-parsing each one.
        soup = BeautifulSoup(html_content, 'html.parser')

        for tweet_soup in soup.select('article[data-testid="tweet"]'):
            if len(results) >= self.max_results:
                break
            try:
                results.append(self._build_tweet_info(tweet_soup))
            except Exception as e:
                print(f"Error extracting tweet data: {e}")

        return results

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def extract_data(self, html_content):
        """Dispatch to the extractor of the configured platform; ``[]`` if there is none."""
        if self.platform == "youtube":
            return self.extract_youtube_data(html_content)
        elif self.platform == "twitter":
            return self.extract_twitter_data(html_content)
        return []

    def save_results(self, results):
        """Write ``results`` as UTF-8 JSON to ``output_file`` inside ``output_dir``."""
        self._ensure_output_dir()
        with open(os.path.join(self.output_dir, self.output_file), 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"Results saved to {self.output_file}")

    def run(self):
        """Download, extract and save; return the extracted items.

        Returns:
            The list of extracted dicts, or ``[]`` when the download failed or
            nothing could be extracted.
        """
        print(f"Scraping {self.platform} for keyword: {self.keyword}")
        url = self.get_search_url()

        # Download content
        html_content = self.download_content(url)
        if not html_content:
            print("Failed to download content")
            return []

        # Save raw HTML for debugging
        self._ensure_output_dir()
        raw_path = os.path.join(self.output_dir, f"raw_{self.platform}_{self._safe_keyword()}.html")
        with open(raw_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Extract data
        results = self.extract_data(html_content)
        print(f"Extracted {len(results)} items from {self.platform}")

        # Save results
        if results:
            self.save_results(results)

        return results

In [None]:
# Search term and per-run cap handed to the scraper.
# NOTE(review): 'Etherium' looks like a misspelling of 'Ethereum' — confirm it is intentional.
KEYWORDS = 'Etherium'
MAX_RESULTS = 200

scraper = ScraperPipeline('Youtube', KEYWORDS, None, MAX_RESULTS)
# Bind the return value instead of leaving the call as the cell's trailing
# expression, which would make the notebook echo whatever run() returns
# (e.g. a stray `[]` after a failed download). `or []` normalizes a None
# return so the variable is always a list.
scraped_items = scraper.run() or []

# Twitter variant, currently disabled:
# scraper = ScraperPipeline('twitter', KEYWORDS, 'twitter_output.json', MAX_RESULTS)
# scraper.run()

Scraping youtube for keyword: Etherium
Extracted 19 items from youtube
Results saved to youtube_Etherium_0402_183102.json
