# Selenium Web Scraping

In [21]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin

def get_politifact_fact_check_urls(search_term, page_num):
    """Collect fact-check article URLs from one PolitiFact search results page.

    Starts a headless Chrome session, loads the fact-check search page for
    ``search_term`` at ``page_num``, waits (up to 100 seconds) for at least one
    result title link to be present, and returns the ``href`` of every result
    title link as an absolute URL.

    Args:
        search_term: Free-text query. It is URL-encoded before being placed
            in the query string, so spaces, ``&``, ``#`` and similar
            characters are safe.
        page_num: Number of the result page to request.

    Returns:
        A list of absolute URLs (possibly empty). Any exception raised while
        loading or scraping the page is printed and an empty list is returned.
        The browser is always closed before returning.
    """
    # Function-scope import keeps this block independent of the import cell.
    from urllib.parse import quote_plus

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(options=chrome_options)

    base_url = "https://www.politifact.com"
    # Encode the query so reserved characters cannot truncate or alter the URL.
    encoded_term = quote_plus(str(search_term))
    search_url = f"{base_url}/search/factcheck/?page={page_num}&q={encoded_term}"

    # Each search hit renders its title as a link inside this container.
    result_link_selector = "div.c-textgroup__title a"

    try:
        driver.get(search_url)
        WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, result_link_selector))
        )

        link_elements = driver.find_elements(By.CSS_SELECTOR, result_link_selector)
        hrefs = (element.get_attribute("href") for element in link_elements)
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        return [urljoin(base_url, href) for href in hrefs if href]

    except Exception as e:
        print(f"An error occurred on page {page_num}: {e}")
        return []
    finally:
        driver.quit()

if __name__ == '__main__':
    search_query = "video"
    all_urls = []

    # Scrapes result pages 1 through 74 (range's upper bound is exclusive).
    # NOTE(review): the earlier comment mentioned "pages 1225" - presumably the
    # total number of pages available for this query; confirm before raising
    # the bound.
    for page_num in range(1, 75):
        urls = get_politifact_fact_check_urls(search_query, page_num)
        if not urls:
            print(f"No fact-check URLs found on page {page_num} for '{search_query}'.")
            continue

        print(f"Found the following fact-check URLs on page {page_num}:")
        for url in urls:
            print(url)
        all_urls.extend(urls)

    print("\nAll collected URLs:")
    print(all_urls)

    # One URL per line; this file is the input list for the later scraping step.
    filepath = os.path.join(".", "input.txt")
    try:
        with open(filepath, "w") as f:
            f.writelines(f"{url}\n" for url in all_urls)
        print(f"URLs saved to {filepath}")
    except Exception as e:
        print(f"Could not save URLs to file: {e}")

Found the following fact-check URLs on page 1:
https://www.politifact.com/factchecks/2025/apr/01/marjorie-taylor-greene/rep-marjorie-taylor-greene-is-wrong-sesame-street/
https://www.politifact.com/factchecks/2025/apr/01/facebook-posts/video-shows-police-support-ahead-of-us-delegations/
https://www.politifact.com/factchecks/2025/apr/01/tiktok-posts/theres-no-evidence-that-speaker-mike-johnson-is-me/
https://www.politifact.com/factchecks/2025/mar/31/tiktok-posts/apple-isnt-secretly-installing-starlink-into-your/
https://www.politifact.com/factchecks/2025/mar/28/facebook-posts/phoenixs-wildfire-risk-linked-to-climate-change-an/
Found the following fact-check URLs on page 2:
https://www.politifact.com/factchecks/2025/mar/28/facebook-posts/theres-no-20000-hidden-grant-program-for-roof-repa/
https://www.politifact.com/factchecks/2025/mar/28/susan-crawford/in-wisconsin-supreme-court-race-crawford-claims-sc/
https://www.politifact.com/factchecks/2025/mar/28/markwayne-mullin/hillary-clinton-se

# BS4 Scraping + JSON Builder

In [None]:
from bs4 import BeautifulSoup
import requests
import yt_dlp
import os
import subprocess
import json
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import shutil

def check_dependencies():
    """Check that the external command-line tools are available on PATH.

    Returns:
        True when both ``yt-dlp`` and ``ffmpeg`` are located by
        ``shutil.which``. Otherwise prints an error for the first missing
        tool (checked in that order) and returns False.
    """
    required_tools = (
        ("yt-dlp", "Error: yt-dlp command not found. Please install it (pip install yt-dlp) or add it to your PATH."),
        ("ffmpeg", "Error: ffmpeg command not found. Please install it or add it to your PATH."),
    )
    for executable, missing_message in required_tools:
        if not shutil.which(executable):
            print(missing_message)
            return False
    return True


def get_headline(url):
    """Fetch ``url`` and return the page's headline.

    The ``og:title`` meta tag is preferred; when it is missing or empty the
    text of the first ``<h1>`` element is used instead.

    Returns:
        The stripped headline string, or None when no headline is found or
        the request fails (the failure is printed, not raised).
    """
    try:
        page = requests.get(url, timeout=15) 
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        og_title = document.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        # Fallback for pages without Open Graph metadata.
        first_heading = document.find('h1')
        return first_heading.get_text(strip=True) if first_heading else None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching headline for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching headline for URL: {e}")
    except Exception as e:
        print(f"An unexpected error occurred getting headline: {e}")
    return None

def get_subheadline(url):
    """Fetch ``url`` and return the content of its ``og:description`` meta tag.

    Returns:
        The stripped description string, or None when the tag is missing or
        empty, or when the request fails (the failure is printed, not raised).
    """
    try:
        page = requests.get(url, timeout=15)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        og_description = document.find('meta', property='og:description')
        if not (og_description and og_description.get('content')):
            return None
        return og_description['content'].strip()

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching subheadline for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching subheadline for URL: {e}")
    except Exception as e:
        print(f"An unexpected error occurred getting subheadline: {e}")
    return None

def _get_social_media_platform(link):
    """Map a URL to the name of the social-media platform that hosts it.

    The URL's hostname must equal one of the known domains or be a subdomain
    of it (``www.youtube.com``, ``m.facebook.com``). A plain substring test is
    deliberately avoided: ``netflix.com`` contains ``x.com`` but is not
    Twitter/X.

    Args:
        link: URL string; falsy values yield None.

    Returns:
        The platform name (e.g. ``"youtube"``, ``"twitter"``,
        ``"media vault (archive)"``) or None when the host is not recognised.
        URLs without a network location (no scheme) yield None.
    """
    if not link:
        return None

    from urllib.parse import urlparse

    social_media_domains = {
        "tiktok.com": "tiktok",
        "youtube.com": "youtube",
        "youtu.be": "youtube",
        "twitter.com": "twitter",
        "x.com": "twitter",
        "facebook.com": "facebook",
        "fb.watch": "facebook",
        "reddit.com": "reddit",
        "instagram.com": "instagram",
        "mvau.lt": "media vault (archive)"
    }

    link_lower = link.lower()

    # Parse once; .hostname drops any port and user-info from the netloc.
    try:
        host = urlparse(link_lower).hostname or ""
    except ValueError:
        # Unparseable URL (e.g. malformed IPv6 literal): keep the best-effort
        # substring fallback for this rare case.
        host = None

    for domain, platform_name in social_media_domains.items():
        if host is None:
            if domain in link_lower:
                return platform_name
        elif host == domain or host.endswith("." + domain):
            return platform_name
    return None

def get_rating(url):
    """Derive a fact-check rating from the page's Open Graph image file name.

    The ``og:image:secure_url`` meta tag is inspected first, then
    ``og:image``. An image URL ending in ``tom_ruling_pof.png`` maps to
    ``"false"``; a file named ``meter-<rating>.<jpg|png|gif>`` yields
    ``<rating>`` with hyphens replaced by spaces.

    Returns:
        The rating string, or None when neither tag yields one or the request
        fails (the failure is printed, not raised).
    """

    def rating_from_image(image_url):
        # Translate one image URL into a rating, or None if it is not a
        # recognised rating image.
        if image_url.endswith("tom_ruling_pof.png"):
            return "false"
        filename = image_url.split('/')[-1]
        if filename.startswith('meter-') and filename.endswith(('.jpg', '.png', '.gif')):
            stem = filename[len('meter-'):filename.rfind('.')]
            return stem.strip().replace('-', ' ')
        return None

    try:
        page = requests.get(url, timeout=15)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        # Same extraction for both tags; the secure variant takes precedence.
        for og_property in ('og:image:secure_url', 'og:image'):
            image_meta = document.find('meta', property=og_property)
            if image_meta and image_meta.get('content'):
                rating = rating_from_image(image_meta['content'])
                if rating is not None:
                    return rating

        return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching rating for URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for rating: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while getting rating: {e}")
        return None

def get_links_from_article_rail(url):
    """Return the absolute http(s) links found in the article's sources area.

    The search container is, in order of preference: ``<section id="sources">``,
    ``<div class="entry-content">``, then the whole ``<body>``.

    Returns:
        A list of ``href`` values starting with ``http://`` or ``https://``,
        in document order. An empty list is returned when no container exists
        or the request fails (the failure is printed, not raised).
    """
    try:
        page = requests.get(url, timeout=15)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        # Narrowest container first; later lookups only run when needed.
        container = (
            document.find('section', id='sources')
            or document.find('div', class_='entry-content')
            or document.body
        )
        if not container:
            return []

        return [
            anchor['href']
            for anchor in container.find_all('a', href=True)
            if anchor['href'] and anchor['href'].startswith(('http://', 'https://'))
        ]

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching links for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for links: {e}")
    except Exception as e:
        print(f"An error occurred getting links: {e}")
    return []

def get_first_social_link(url):
    """Return the first link in the article's sources that points at a known platform.

    Returns:
        - None when the article yields no links at all;
        - the first link whose platform is recognised;
        - the literal message string below when links exist but none is a
          social-media link.
    """
    recognised_platforms = (
        "tiktok", "youtube", "twitter", "facebook", "reddit", "instagram", "media vault (archive)",
    )

    candidate_links = get_links_from_article_rail(url)
    if not candidate_links:
        return None

    # The default of next() is the "nothing matched" message.
    return next(
        (
            candidate
            for candidate in candidate_links
            if _get_social_media_platform(candidate) in recognised_platforms
        ),
        "No social link associated with article found.",
    )


def download_video(url, output_dir, output_filename="downloaded_video.mp4", verbose=False):
    """Download the video at ``url`` to ``output_dir/output_filename``.

    Flow:
      1. Probe the duration with the yt-dlp library (skipped for Media Vault
         links) and reject videos longer than 600 seconds.
      2. Facebook / Instagram / Media Vault links: run the ``yt-dlp`` command
         line twice (audio as mp3, then video as mp4) and merge both with
         ``ffmpeg``.
         All other links: download through the yt-dlp library into a
         ``temp_<name>.<ext>`` file, then move it into place if it is already
         an ``.mp4`` or convert it with ``ffmpeg`` otherwise.

    Args:
        url: Address of the video/post to download.
        output_dir: Directory that receives the final file and any temp files.
        output_filename: Name of the final file inside ``output_dir``.
        verbose: When True, print progress details and the commands being run.

    Returns:
        A tuple ``(success, message, duration)``: ``success`` is a bool,
        ``message`` describes the outcome or the error, and ``duration`` is
        the value reported by yt-dlp (presumably seconds) or None if it could
        not be determined. Failures are reported through the tuple rather
        than raised.
    """
    output_path = os.path.join(output_dir, output_filename)
    duration = None
    platform = _get_social_media_platform(url)
    # These platforms take the two-step CLI download + ffmpeg merge path below.
    is_fb_insta_mediavault = platform in ["facebook", "instagram","media vault (archive)"]

    # --- 1. Get Duration ---
    # Best-effort probe: any error here only produces a warning and the
    # download is still attempted.
    if platform != "media vault (archive)": 
        try:
            ydl_opts_info = {'quiet': not verbose, 'no_warnings': True, 'extract_flat': True, 'skip_download': True}
            with yt_dlp.YoutubeDL(ydl_opts_info) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                # Results carrying an 'entries' list: use the first entry's duration.
                if 'entries' in info_dict and info_dict['entries']:
                    duration = info_dict['entries'][0].get('duration')
                else:
                     duration = info_dict.get('duration')


            if duration is None:
                 print(f"Warning: Could not determine video duration for {url} via initial check.")
            elif duration > 600:
                # Hard limit: refuse before any download work is done.
                return False, f"Video exceeds 10-minute limit ({duration}s).", duration
            if verbose and duration is not None:
                print(f"Video duration ({duration}s) is within the limit.")

        except yt_dlp.utils.DownloadError as e:
            print(f"Warning: yt-dlp error getting video info/duration for {url}: {e}. Proceeding with download attempt.")
        except Exception as e:
            print(f"Warning: Unexpected error getting video info for {url}: {e}. Proceeding with download attempt.")
    else: 
        if verbose:
            print("Skipping initial duration check for Media Vault link.")

    # --- 2. Download Logic ---
    if is_fb_insta_mediavault:
        # --- Special Handling for Facebook/Instagram using specific commands ---
        if verbose:
            print(f"Detected {platform} URL, using specific download/merge commands.")

        # NOTE(review): fixed temp names inside output_dir - concurrent calls
        # sharing the same output_dir would presumably clash; confirm callers
        # run sequentially.
        temp_audio_path = os.path.join(output_dir, "audio_temp.mp3")
        temp_video_path = os.path.join(output_dir, "video_temp.mp4")

        # --- Pre-cleanup of potential leftover temp files ---
        try:
            if os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
                if verbose: print(f"Removed existing temp file: {temp_audio_path}")
            if os.path.exists(temp_video_path):
                os.remove(temp_video_path)
                if verbose: print(f"Removed existing temp file: {temp_video_path}")
        except OSError as e:
            print(f"Warning: Could not remove old temp file: {e}")

        try:
            # --- 1. Download Audio ---
            audio_command = ['yt-dlp', '-x', '--audio-format', 'mp3', '-o', temp_audio_path, url]
            if verbose: print(f"Running yt-dlp audio command: {' '.join(audio_command)}")

            # check=False: the exit code is inspected manually so the captured
            # stderr/stdout can be returned in the error message.
            result_audio = subprocess.run(audio_command, capture_output=True, text=True, check=False, encoding='utf-8', errors='ignore')

            if result_audio.returncode != 0:
                error_message = f"yt-dlp audio download failed (Code {result_audio.returncode}):\n{result_audio.stderr or result_audio.stdout}"
                if verbose: print(error_message)
                if os.path.exists(temp_audio_path): os.remove(temp_audio_path)
                return False, error_message, duration

            # A zero exit code is not trusted on its own; the file must exist.
            if not os.path.exists(temp_audio_path):
                error_message = "yt-dlp audio download command succeeded but output file not found."
                if verbose: print(error_message)
                return False, error_message, duration

            if verbose: print("Audio download successful.")

            # --- 2. Download Video ---
            video_command = ['yt-dlp', '-v', '--no-audio', '-f', '[ext=mp4]', '-o', temp_video_path, url]
            if verbose: print(f"Running yt-dlp video command: {' '.join(video_command)}")

            result_video = subprocess.run(video_command, capture_output=True, text=True, check=False, encoding='utf-8', errors='ignore')

            if result_video.returncode != 0:
                error_message = f"yt-dlp video download failed (Code {result_video.returncode}):\n{result_video.stderr or result_video.stdout}"
                if verbose: print(error_message)
                # Remove both temp files so a later call starts clean.
                if os.path.exists(temp_audio_path): os.remove(temp_audio_path)
                if os.path.exists(temp_video_path): os.remove(temp_video_path)
                return False, error_message, duration

            if not os.path.exists(temp_video_path):
                error_message = "yt-dlp video download command succeeded but output file not found."
                if verbose: print(error_message)
                if os.path.exists(temp_audio_path): os.remove(temp_audio_path)
                return False, error_message, duration

            if verbose: print("Video download successful.")

            # --- 3. Merge Audio and Video ---
            # Stream copy ('-c copy'): first video stream of input 0 plus first
            # audio stream of input 1; '-y' overwrites an existing output file.
            merge_command = [
                'ffmpeg', '-i', temp_video_path, '-i', temp_audio_path,
                '-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-y',
                output_path 
            ]
            if not verbose:
                merge_command.extend(['-loglevel', 'error'])


            if verbose: print(f"Running ffmpeg merge command: {' '.join(merge_command)}")
            result_merge = subprocess.run(merge_command, capture_output=True, text=True, check=False, encoding='utf-8', errors='ignore')

            # Temp files are removed whether or not the merge succeeded.
            if os.path.exists(temp_audio_path):
                try: os.remove(temp_audio_path)
                except OSError as e: print(f"Warning: Could not remove temp file {temp_audio_path}: {e}")
            if os.path.exists(temp_video_path):
                try: os.remove(temp_video_path)
                except OSError as e: print(f"Warning: Could not remove temp file {temp_video_path}: {e}")

            if result_merge.returncode != 0:
                error_message = f"FFmpeg merge failed (Code {result_merge.returncode}):\nStderr: {result_merge.stderr}"
                if verbose: print(error_message)
                # Do not leave a partial/corrupt output behind.
                if os.path.exists(output_path):
                    try: os.remove(output_path)
                    except OSError as e: print(f"Warning: Could not remove failed output file {output_path}: {e}")
                return False, error_message, duration
            else:
                if verbose: print("FFmpeg merge successful.")
                if not os.path.exists(output_path):
                    return False, "FFmpeg merge command succeeded but final output file not found.", duration
                return True, f"Successfully downloaded and merged video for {url}.", duration

        except FileNotFoundError as e:
            # Raised by subprocess.run when the executable itself is missing.
            error_msg = f"{e.filename} command not found. Please ensure yt-dlp and ffmpeg are installed and in PATH."
            print(error_msg)
            if os.path.exists(temp_audio_path):
                try: os.remove(temp_audio_path)
                except OSError: pass
            if os.path.exists(temp_video_path):
                try: os.remove(temp_video_path)
                except OSError: pass
            return False, error_msg, duration
        except Exception as e:
            # Catch-all so the caller always receives the result tuple.
            error_msg = f"An unexpected error occurred during {platform} download: {e}"
            print(error_msg)
            if os.path.exists(temp_audio_path):
                try: os.remove(temp_audio_path)
                except OSError: pass
            if os.path.exists(temp_video_path):
                try: os.remove(temp_video_path)
                except OSError: pass
            return False, error_msg, duration
    else:
        # --- Standard Logic using yt-dlp library (for other platforms) ---
        if verbose:
            print(f"Using standard yt-dlp library download for {platform or 'unknown platform'}.")
        try:
            # Download under a 'temp_' prefix first; the file is moved or
            # converted to output_path afterwards.
            base_name = os.path.splitext(output_filename)[0]
            temp_output_template = os.path.join(output_dir, f'temp_{base_name}.%(ext)s')

            ydl_opts = {
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best', 
                'outtmpl': temp_output_template,
                'quiet': not verbose,
                'no_warnings': True,
                'merge_output_format': 'mp4', 
            }

            downloaded_filepath = None
            final_info_dict = None
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                # Second chance at the duration limit when the initial probe
                # could not determine it.
                if duration is None:
                    try:
                         pre_download_info = ydl.extract_info(url, download=False)
                         if 'entries' in pre_download_info and pre_download_info['entries']:
                             duration = pre_download_info['entries'][0].get('duration')
                         else:
                             duration = pre_download_info.get('duration')

                         if duration is not None:
                             if verbose: print(f"Determined duration just before download: {duration}s")
                             if duration > 600:
                                 return False, f"Video exceeds 10-minute limit ({duration}s) just before download.", duration
                         else:
                              print(f"Warning: Still could not determine duration for {url} before download.")
                    except Exception as info_err:
                         print(f"Warning: Error getting duration just before download: {info_err}")

                try:
                    final_info_dict = ydl.extract_info(url, download=True)
                except yt_dlp.utils.DownloadError as download_err:
                     return False, f"yt-dlp download error: {download_err}", duration

                if final_info_dict:
                     downloaded_filepath = ydl.prepare_filename(final_info_dict)
                else:
                     # No info dict: pick the most recently modified file in
                     # output_dir whose name starts with the temp prefix.
                     print("Warning: final_info_dict not available, attempting to find temp file by pattern.")
                     base_temp = os.path.join(output_dir, f'temp_{base_name}')
                     possible_files = [f for f in os.listdir(output_dir) if f.startswith(os.path.basename(base_temp))]
                     if possible_files:
                         possible_files.sort(key=lambda f: os.path.getmtime(os.path.join(output_dir, f)), reverse=True)
                         downloaded_filepath = os.path.join(output_dir, possible_files[0])
                         if verbose: print(f"Found temp file by pattern: {downloaded_filepath}")
                     else:
                          # Not inside the inner try, so this is handled by the
                          # generic 'except Exception' at the bottom.
                          raise yt_dlp.utils.DownloadError(f"Could not determine downloaded file path based on template '{temp_output_template}'.")


            if not downloaded_filepath or not os.path.exists(downloaded_filepath):
                 return False, f"Download seemed to finish but output file not found: {downloaded_filepath}", duration

            # Already MP4: just move it to the requested path.
            _, ext = os.path.splitext(downloaded_filepath)
            if ext.lower() == '.mp4':
                if downloaded_filepath != output_path:
                    if verbose: print(f"Renaming {downloaded_filepath} to {output_path}")
                    shutil.move(downloaded_filepath, output_path)
                else:
                    if verbose: print(f"Downloaded file already at target path: {output_path}")
                return True, "Successfully downloaded (MP4).", duration

            # Any other container: re-encode to H.264 video / AAC audio.
            if verbose: print(f"Downloaded file is not MP4 ({ext}), converting with ffmpeg...")
            ffmpeg_flags = ['-c:v', 'libx264', '-c:a', 'aac', '-strict', 'experimental', '-y']
            if not verbose:
                 ffmpeg_flags.extend(['-loglevel', 'error'])

            convert_command = [
                'ffmpeg', '-i', downloaded_filepath
            ] + ffmpeg_flags + [output_path]

            if verbose: print(f"Running ffmpeg conversion command: {' '.join(convert_command)}")
            result_convert = subprocess.run(convert_command, capture_output=True, text=True, check=False, encoding='utf-8', errors='ignore')

            # The source download is deleted even when the conversion failed.
            os.remove(downloaded_filepath)

            if result_convert.returncode != 0:
                error_message = f"FFmpeg conversion failed (Code {result_convert.returncode}):\nStderr: {result_convert.stderr}"
                if verbose: print(error_message)
                if os.path.exists(output_path): os.remove(output_path)
                return False, error_message, duration
            else:
                if verbose: print("FFmpeg conversion successful.")
                if not os.path.exists(output_path):
                    return False, "FFmpeg conversion command succeeded but final output file not found.", duration
                return True, "Successfully downloaded and converted to MP4.", duration

        except FileNotFoundError as e:
             error_msg = f"{e.filename} not found. Please ensure ffmpeg (and yt-dlp library) is installed and accessible."
             print(error_msg)
             # Remove the known download plus any other file carrying the temp prefix.
             if downloaded_filepath and os.path.exists(downloaded_filepath): os.remove(downloaded_filepath)
             base_temp = os.path.join(output_dir, f'temp_{base_name}')
             for f in os.listdir(output_dir):
                 if f.startswith(os.path.basename(base_temp)):
                     try: os.remove(os.path.join(output_dir, f))
                     except OSError: pass
             return False, error_msg, duration
        except Exception as e:
             # Catch-all so the caller always receives the result tuple.
             error_msg = f"An unexpected error occurred during standard download/conversion: {e}"
             print(error_msg)
             if downloaded_filepath and os.path.exists(downloaded_filepath): os.remove(downloaded_filepath)
             base_temp = os.path.join(output_dir, f'temp_{base_name}')
             for f in os.listdir(output_dir):
                  if f.startswith(os.path.basename(base_temp)):
                      try: os.remove(os.path.join(output_dir, f))
                      except OSError: pass
             return False, error_msg, duration


def download_progress_hook(d):
    if d['status'] == 'downloading':
        filename = d.get('filename', 'N/A')
        percent = d.get('_percent_str', 'N/A')
        speed = d.get('_speed_str', 'N/A')
        eta = d.get('_eta_str', 'N/A')
        print(f"Downloading {os.path.basename(filename)}: {percent} at {speed}, ETA: {eta}      ", end='\r')
    elif d['status'] == 'finished':
        filename = d.get('filename', 'N/A')
        print(f"\nFinished downloading {os.path.basename(filename)}.")
    elif d['status'] == 'error':
        print("\nError during download hook.")


def get_youtube_description(url):
    """Return the description text of a YouTube video.

    Strategy, in order:
      1. Ask the yt-dlp library for the video's ``description`` field.
      2. If that fails or is empty, fetch the watch page with ``requests``
         and look at ``<meta itemprop="description">``, then the embedded
         ``ytInitialPlayerResponse`` JSON, then ``<meta name="description">``.

    Returns:
        The stripped description string, or None when nothing could be
        extracted (problems are printed, not raised).
    """
    # --- Attempt 1: yt-dlp metadata ---
    try:
         probe_opts = {'quiet': True, 'no_warnings': True, 'skip_download': True}
         with yt_dlp.YoutubeDL(probe_opts) as ydl:
             video_info = ydl.extract_info(url, download=False)
             text = video_info.get('description')
             if text:
                 return text.strip()
             print(f"Could not get YouTube description via yt-dlp for {url}")

    except Exception as ydl_err:
        print(f"yt-dlp failed to get YouTube description: {ydl_err}. Falling back to requests/BS4.")

    # --- Attempt 2: scrape the watch page ---
    try:
        id_match = re.search(r"(?:v=|/|embed/|shorts/|youtu\.be/)([\w-]{11})", url)
        if not id_match:
            print(f"Could not extract YouTube video ID from URL: {url}")
            return None
        watch_url = f"https://www.youtube.com/watch?v={id_match.group(1)}"

        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        page = requests.get(watch_url, headers=request_headers, timeout=15)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        itemprop_meta = document.find('meta', itemprop='description')
        if itemprop_meta and itemprop_meta.get('content'):
            return itemprop_meta['content'].strip()

        for script in document.find_all('script'):
            payload = script.string
            if not payload or 'ytInitialPlayerResponse' not in payload:
                continue
            try:
                # Crude extraction: everything between the first '{' and the last '}'.
                json_str = payload[payload.find('{'):payload.rfind('}')+1]
                player_data = json.loads(json_str)
                renderer = player_data.get('microformat', {}).get('playerMicroformatRenderer', {})
                text = renderer.get('description', {}).get('simpleText')
                if text:
                    return text.strip()
            except (json.JSONDecodeError, KeyError, AttributeError):
                # Not the script we hoped for; try the next one.
                continue

        name_meta = document.find('meta', attrs={'name': 'description'})
        if name_meta and name_meta.get('content'):
            return name_meta.get('content').strip()

        print(f"Description not found using various methods for {watch_url}.")
        return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching YouTube description for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching YouTube URL for description: {e}")
    except Exception as e:
        print(f"An unexpected error occurred getting YouTube description: {e}")
    return None


def get_tweet_text(url):
    """Load a Twitter/X post in headless Chrome and return its text.

    Waits up to 25 seconds for the tweet ``<article>`` to be present, pauses
    two more seconds, then reads the ``tweetText`` element inside it.

    Returns:
        The stripped tweet text, or None when the text is empty, the expected
        elements are missing, the page times out, or any other Selenium error
        occurs (problems are printed, not raised). The browser is always
        closed before returning.
    """
    print("Attempting to get tweet text using Selenium")

    tweet_selector = 'article[data-testid="tweet"]'
    text_selector = 'div[data-testid="tweetText"]'

    chrome_options = Options()
    for argument in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        # Present a regular desktop browser user agent.
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ):
        chrome_options.add_argument(argument)

    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.get(url)
        WebDriverWait(driver, 25).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, tweet_selector))
        )
        # Short fixed pause after the article appears, before reading its text.
        time.sleep(2)

        try:
            article = driver.find_element(By.CSS_SELECTOR, tweet_selector)
            content = article.find_element(By.CSS_SELECTOR, text_selector).text
            if content:
                return content.strip()
            return None

        except NoSuchElementException:
            print(f"Error: Tweet text element structure not found for URL: {url}. Page source might reveal the issue.")
            return None
        except TimeoutException:
             print(f"Timeout: Tweet content area did not load within the time limit for URL: {url}")
             return None

    except TimeoutException:
        print(f"Timeout: Page load timed out for Twitter/X URL: {url}")
        return None
    except Exception as e:
        print(f"An error occurred with Selenium for Twitter/X: {e}")
        return None
    finally:
        if driver:
            driver.quit()

def get_tiktok_description(tiktok_url):
    """Return the caption/description of a TikTok video.

    Strategy, in order:
      1. yt-dlp metadata: ``description``, else ``title``.
      2. Fetch the page (canonical URL when one is found) and read the
         ``__UNIVERSAL_DATA_FOR_REHYDRATION__`` JSON: the item's ``desc``, or
         a synthesized "Video by ... using sound ..." string from the author
         nickname / music title.
      3. The ``og:description`` meta tag.

    Returns:
        The stripped description string, or None when nothing could be
        extracted (problems are printed, not raised).
    """
    # --- Attempt 1: yt-dlp metadata ---
    try:
         probe_opts = {'quiet': True, 'no_warnings': True, 'skip_download': True}
         with yt_dlp.YoutubeDL(probe_opts) as ydl:
             video_info = ydl.extract_info(tiktok_url, download=False)
             caption = video_info.get('description') or video_info.get('title')
             if caption:
                 return caption.strip()
             print(f"Could not get TikTok description via yt-dlp info for {tiktok_url}")

    except Exception as ydl_err:
        print(f"yt-dlp failed to get TikTok description: {ydl_err}. Falling back to requests/BS4.")

    # --- Attempt 2: scrape the page ---
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.tiktok.com/',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        canonical_url, _ = get_seo_canonical_url(tiktok_url)
        url_to_fetch = canonical_url or tiktok_url

        if not url_to_fetch:
             print("Error: Could not determine a valid TikTok URL to fetch.")
             return None

        print(f"Fetching TikTok page: {url_to_fetch}")
        page = requests.get(url_to_fetch, headers=request_headers, timeout=20)
        page.raise_for_status()
        document = BeautifulSoup(page.text, 'html.parser')

        hydration_script = document.find('script', {'id': '__UNIVERSAL_DATA_FOR_REHYDRATION__'})
        if hydration_script and hydration_script.string:
            try:
                default_scope = json.loads(hydration_script.string).get('__DEFAULT_SCOPE__', {})
                item_struct = default_scope.get('webapp.video-detail', {}).get('itemInfo', {}).get('itemStruct', {})

                caption = item_struct.get('desc')
                if caption:
                    return caption.strip()

                # No caption: describe the clip by its author and sound instead.
                author_name = item_struct.get('author', {}).get('nickname')
                music_title = item_struct.get('music', {}).get('title')
                if author_name or music_title:
                     return f"Video by {author_name or 'unknown'} using sound {music_title or 'unknown'}".strip()

            except (json.JSONDecodeError, KeyError, AttributeError) as e:
                print(f"Error parsing TikTok JSON data: {e}. Structure might have changed.")

        og_description = document.find('meta', property='og:description')
        if og_description and og_description.get('content'):
            return og_description['content'].strip()

        print(f"Could not find TikTok description using various methods for {url_to_fetch}.")
        return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching TikTok description for URL: {tiktok_url}")
    except requests.exceptions.RequestException as e:
        print(f"Error: Request failed fetching TikTok description: {e}")
    except Exception as e:
        print(f"An unexpected error occurred getting TikTok description: {e}")
    return None


def get_tiktok_video_id(url):
    """Return the numeric video ID embedded in a TikTok URL, or None.

    Two patterns are tried in order: a generic one that needs at least ten
    digits after ``/video/``, ``/v/``, ``item_id=`` or ``share/video/``, and
    then the ``@user/video/<digits>`` form, which accepts any digit count.
    A falsy ``url`` yields None.
    """
    if not url:
        return None

    id_patterns = (
        r'(?:/video/|/v/|item_id=|share/video/)(\d{10,})',
        r'@[\w.-]+/video/(\d+)',
    )
    for pattern in id_patterns:
        found = re.search(pattern, url)
        if found is not None:
            return found.group(1)
    return None


def get_seo_canonical_url(tiktok_url: str):
    """Resolve a TikTok link to its canonical URL and numeric video ID.

    The page is fetched (redirects followed). The canonical URL is read from
    ``<link rel="canonical">`` and, failing that, from the
    ``__UNIVERSAL_DATA_FOR_REHYDRATION__`` JSON script. The video ID comes
    from the first of: the post-redirect URL, the canonical URL, the URL that
    was passed in.

    Returns:
        A ``(canonical_url, video_id)`` tuple. ``canonical_url`` falls back to
        the post-redirect URL when the page declares none. When the request
        or the processing raises, the result is
        ``(tiktok_url, get_tiktok_video_id(tiktok_url))``.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9'
    }

    try:
        page = requests.get(tiktok_url, headers=browser_headers, allow_redirects=True, timeout=20)
        page.raise_for_status()
        final_url = page.url
        video_id = get_tiktok_video_id(final_url)
        canonical_url = None

        document = BeautifulSoup(page.content, 'html.parser')

        # Preferred source: the <link rel="canonical"> element.
        canonical_link = document.find('link', rel='canonical')
        if canonical_link and canonical_link.get('href'):
            canonical_url = canonical_link['href']
            video_id = video_id or get_tiktok_video_id(canonical_url)

        # Secondary source: the JSON blob embedded in the page.
        if not canonical_url:
            data_script = document.find('script', {'id': '__UNIVERSAL_DATA_FOR_REHYDRATION__'})
            if data_script and data_script.string:
                try:
                    page_state = json.loads(data_script.string)
                    canonical_url = page_state.get('__DEFAULT_SCOPE__', {}).get('seo.abtest', {}).get('canonical')
                    if not canonical_url:
                        video_detail = page_state.get('__DEFAULT_SCOPE__', {}).get('webapp.video-detail', {})
                        canonical_url = video_detail.get('shareMeta', {}).get('canonical')

                    if canonical_url and not video_id:
                        video_id = get_tiktok_video_id(canonical_url)

                except (json.JSONDecodeError, KeyError, AttributeError) as e:
                    print(f"Warning: Error parsing TikTok JSON for canonical URL: {e}")

        # Last resort for the ID: the URL exactly as the caller supplied it.
        if not video_id:
            video_id = get_tiktok_video_id(tiktok_url)

        return canonical_url or final_url, video_id

    except requests.exceptions.Timeout:
        print(f"Timeout resolving TikTok URL/getting canonical: {tiktok_url}")
        return tiktok_url, get_tiktok_video_id(tiktok_url)
    except requests.exceptions.RequestException as e:
        print(f"Request failed resolving TikTok URL/getting canonical: {e}")
        return tiktok_url, get_tiktok_video_id(tiktok_url)
    except Exception as e:
        print(f"An unexpected error occurred getting TikTok canonical URL: {e}")
        return tiktok_url, get_tiktok_video_id(tiktok_url)


def get_reddit_post_title(url):
    """Return the title of a Reddit post, or None when it cannot be found.

    yt-dlp metadata is tried first. If that yields no title or raises, the
    page HTML is fetched (old.reddit.com first, then the URL as given) and
    searched, in order, for the old-layout title link, the new-layout
    ``post-title-*`` element and the ``og:title`` meta tag. A leading
    ``r/<subreddit> - `` prefix is removed from yt-dlp and ``og:title``
    titles unless removing it would leave nothing.
    """
    subreddit_prefix = r'^r/\w+\s*-\s*'

    # Attempt 1: read the post metadata through yt-dlp without downloading.
    try:
        with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True, 'skip_download': True}) as ydl:
            metadata = ydl.extract_info(url, download=False)
            raw_title = metadata.get('title')
            if not raw_title:
                print(f"Could not get Reddit title via yt-dlp for {url}")
            else:
                # Keep the raw title if stripping the prefix empties it.
                return re.sub(subreddit_prefix, '', raw_title).strip() or raw_title.strip()

    except Exception as ydl_err:
        print(f"yt-dlp failed to get Reddit title: {ydl_err}. Falling back to requests/BS4.")

    # Attempt 2: scrape the HTML.
    try:
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        legacy_url = url.replace("www.reddit.com", "old.reddit.com")
        try:
            page = requests.get(legacy_url, headers=browser_headers, timeout=15)
            page.raise_for_status()
            print(f"Using old.reddit.com for: {url}")
        except requests.exceptions.RequestException:
            print(f"Failed to fetch old.reddit.com, trying www.reddit.com for: {url}")
            page = requests.get(url, headers=browser_headers, timeout=15)
            page.raise_for_status()

        document = BeautifulSoup(page.text, 'html.parser')

        # Old layout first, then the new layout's post-title element.
        for selector in ('p.title a.title', '[id^="post-title-"]'):
            title_node = document.select_one(selector)
            if title_node:
                return title_node.get_text(strip=True)

        og_title = document.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            og_text = og_title['content'].strip()
            return re.sub(subreddit_prefix, '', og_text).strip() or og_text

        print(f"Could not find Reddit post title using various methods for {url}")
        return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching Reddit title for URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Reddit URL for title: {e}")
        return None
    except Exception as e:
        print(f"An error occurred getting Reddit title: {e}")
        return None


def get_content_from_url(url):
    """Fetch the description/text/title behind a supported social media URL.

    The platform reported by ``_get_social_media_platform`` selects the
    extractor: dedicated helpers for YouTube, Twitter, TikTok and Reddit,
    yt-dlp metadata for Facebook and Instagram (and as a Twitter fallback),
    and a fixed placeholder for Media Vault archive links.

    Returns:
        None when ``url`` is falsy; otherwise a dict with keys ``site_type``
        and ``content`` (``content`` is None when nothing was extracted).
    """
    if not url:
        print("Error: No URL provided to get_content_from_url.")
        return None

    platform = _get_social_media_platform(url)

    def _text_via_ytdlp(field_order, failure_label):
        # Ask yt-dlp for metadata only and return the first truthy field in
        # ``field_order``, stripped. Failures are reported, never raised.
        text = None
        try:
            options = {'quiet': True, 'no_warnings': True, 'skip_download': True}
            with yt_dlp.YoutubeDL(options) as ydl:
                info = ydl.extract_info(url, download=False)
                for field in field_order:
                    text = info.get(field)
                    if text:
                        break
                if text:
                    text = text.strip()
        except Exception as ydl_err:
            print(f"{failure_label}: {ydl_err}")
        return text

    site_type = platform
    content = None

    try:
        if platform == "youtube":
            content = get_youtube_description(url)
        elif platform == "twitter":
            content = get_tweet_text(url)
            if not content:
                print("Falling back to yt-dlp for Twitter/X metadata (might get username/status)...")
                content = _text_via_ytdlp(
                    ('title', 'description'),
                    "yt-dlp fallback for Twitter also failed",
                )
        elif platform == "tiktok":
            content = get_tiktok_description(url)
        elif platform == "reddit":
            content = get_reddit_post_title(url)
        elif platform == "facebook":
            print("Attempting to get Facebook content via yt-dlp (description/title)...")
            content = _text_via_ytdlp(
                ('description', 'title'),
                "yt-dlp failed to get Facebook content",
            )
        elif platform == "instagram":
            print("Attempting to get Instagram content via yt-dlp (caption)...")
            content = _text_via_ytdlp(
                ('description',),
                "yt-dlp failed to get Instagram content",
            )
        elif platform == "media vault (archive)":
            # Archive links carry no extractable text; record a placeholder.
            print("Skipping content extraction for Media Vault link.")
            content = "[Media Vault Archive Link]"
            site_type = "media_vault"
        else:
            print(f"Unsupported URL for content extraction: {url}")
            site_type = "unsupported"

        if not content:
            print(f"Content extraction failed for {platform} URL: {url}")
            return {"site_type": site_type, "content": None}
        return {"site_type": site_type, "content": content}

    except Exception as e:
        print(f"An unexpected error occurred in get_content_from_url for {url}: {e}")
        import traceback
        traceback.print_exc()
        return {"site_type": platform or "error", "content": None}


def extract_sentences_with_links(url):
    """Map each linked paragraph on a page to the absolute links it contains.

    Every ``<p>`` and ``<blockquote>`` inside ``<body>`` that has visible text
    and at least one ``<a href>`` is considered. Only ``http://`` and
    ``https://`` targets are kept.

    Returns:
        A dict keyed by the whitespace-normalised paragraph text; each value
        is a dict of ``{link text (or the href when the text is empty): href}``.
        An empty dict is returned when the page has no ``<body>`` or when
        fetching/parsing fails.
    """
    try:
        page = requests.get(url, timeout=15)
        page.raise_for_status()
        body = BeautifulSoup(page.content, 'html.parser').body
        if not body:
            print(f"Warning: Could not find <body> tag in {url}")
            return {}

        sentence_links = {}
        for block in body.find_all(['p', 'blockquote']):
            anchors = block.find_all('a', href=True)
            if not anchors or not block.get_text(strip=True):
                continue

            absolute_links = {}
            for anchor in anchors:
                label = ' '.join(anchor.get_text(separator=' ', strip=True).split())
                target = anchor.get('href')
                if target and target.startswith(('http://', 'https://')):
                    absolute_links[label or target] = target

            if absolute_links:
                paragraph = ' '.join(block.get_text(separator=' ', strip=True).split())
                sentence_links[paragraph] = absolute_links
        return sentence_links
    except requests.exceptions.Timeout:
        print(f"Timeout error fetching sentences/links for URL: {url}")
        return {}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url} for sentences/links: {e}")
        return {}
    except Exception as e:
        print(f"An error occurred extracting sentences/links from {url}: {e}")
        return {}



if __name__ == '__main__':
    # Driver: for every PolitiFact URL listed in INPUT_FILE, collect the
    # article metadata, locate the first social media link, fetch its text,
    # try to download its video, and append one record to the output JSON.
    # The JSON is rewritten after every URL so an interrupted run can resume.

    # --- Run configuration ---
    # NOTE: START_INDEX is not consulted below; the resume position is derived
    # from the URLs already recorded in the output JSON.
    START_INDEX = 0  # Set to previous video_count if resuming
    INPUT_FILE = "input.txt"
    DOWNLOAD_DIR_BASE = "/fs/clip-projects/rlab/atrey/ooc-misinformation/scraping"
    OUTPUT_DIR_NAME = "downloaded_videos_politifact"
    OUTPUT_JSON_NAME = "video_data_politifact.json"
    VERBOSE_DOWNLOAD = False # Set to True for detailed yt-dlp/ffmpeg output

    if not check_dependencies():
        # SystemExit is raised directly: the `exit` name is a helper the
        # `site` module provides for interactive shells, not for programs.
        raise SystemExit(1)

    download_dir = os.path.join(DOWNLOAD_DIR_BASE, OUTPUT_DIR_NAME)
    output_json_path = os.path.join(DOWNLOAD_DIR_BASE, OUTPUT_JSON_NAME)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir, exist_ok=True)
        print(f"Created download directory: {download_dir}")

    # --- Load records from a previous run, if any ---
    results = []
    if os.path.exists(output_json_path):
        try:
            with open(output_json_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {output_json_path}. Starting fresh.")
        except Exception as e:
            print(f"Error loading JSON file: {e}. Starting fresh.")
        else:
            # Validate the type before reporting a count: len() on a
            # non-list payload would be misleading or raise.
            if isinstance(loaded, list):
                results = loaded
                print(f"Loaded {len(results)} existing records from {output_json_path}")
            else:
                print("Error: JSON file does not contain a list. Starting fresh.")

    # Determine starting ID based on existing results
    existing_ids_in_json = set()
    max_existing_id = 0
    for item in results:
        # Entries that are not dicts cannot carry an id; ignore them here.
        if not isinstance(item, dict) or item.get('id') is None:
            continue
        try:
            item_id = int(item['id'])
        except (ValueError, TypeError):
            print(f"Warning: Found non-integer or invalid id in existing data: {item.get('id')}")
            continue
        existing_ids_in_json.add(item_id)
        max_existing_id = max(max_existing_id, item_id)

    current_processing_id = max_existing_id + 1

    print(f"Starting next processing ID at: {current_processing_id}")
    previously_successful = sum(
        1 for item in results
        if isinstance(item, dict) and item.get('download_success')
    )
    print(f"Found {previously_successful} successful downloads in existing data.")

    # --- Read the list of article URLs ---
    try:
        with open(INPUT_FILE, "r", encoding="utf-8") as f:
            all_urls = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        raise SystemExit(1)

    total_urls = len(all_urls)
    print(f"Found {total_urls} URLs in {INPUT_FILE}.")

    # Counters for this run only (existing records are not included).
    processed_count = 0
    skipped_count = 0
    download_errors = 0
    successful_downloads = 0

    # Determine the actual starting index in the input list if resuming
    processed_urls = {
        item['politifact_url'] for item in results
        if isinstance(item, dict) and item.get('politifact_url')
    }
    start_processing_index = 0
    while start_processing_index < total_urls and all_urls[start_processing_index] in processed_urls:
        start_processing_index += 1
    # The leading run of already-recorded URLs counts as skipped.
    skipped_count = start_processing_index

    if start_processing_index == 0:
        print("Starting processing from the beginning of the input file.")
    elif start_processing_index < total_urls:
        print(f"Resuming processing from URL index {start_processing_index} (URL: {all_urls[start_processing_index]})")
    else:
        # Every URL is already recorded; indexing all_urls here would raise
        # IndexError, so report the situation instead.
        print("All URLs in the input file are already present in the JSON; nothing to process.")

    for i, url in enumerate(all_urls[start_processing_index:], start=start_processing_index):
        print("-" * 40)

        # Skip URLs that already have a record, wherever they appear in the
        # list, so a resumed run never creates duplicate entries.
        if url in processed_urls:
            skipped_count += 1
            print(f"Skipping URL {i+1}/{total_urls} (already in JSON): {url}")
            continue

        print(f"Processing URL {i+1}/{total_urls} (ID: {current_processing_id}): {url}")

        headline = get_headline(url)
        subheadline = get_subheadline(url)
        rating = get_rating(url)
        external_links_data = extract_sentences_with_links(url)

        print(f"  Headline: {headline}")
        print(f"  Subheadline: {subheadline}")
        print(f"  Rating: {rating}")
        print(f"  External Links Found: {len(external_links_data)}")

        # --- Find and Process Social Link ---
        social_link = get_first_social_link(url)
        social_platform = _get_social_media_platform(social_link)
        destination_path = None
        download_success = False
        download_message = "No social media link found in sources."
        social_text = None
        video_duration = None
        content_result = None

        # The literal below is the value this script treats as "no link found".
        if social_link and social_link != "No social link associated with article found.":
            print(f"  Found Social Link: {social_link} (Platform: {social_platform})")
            print("  Fetching social media content text...")
            content_result = get_content_from_url(social_link)
            if content_result:
                social_text = content_result.get('content')
                social_platform = content_result.get('site_type', social_platform)
                # Keep the label in both cases; only the value varies.
                if social_text:
                    print(f"    Social Text: {' '.join(social_text.split())[:100]}...")
                else:
                    print("    Social Text: None")
            else:
                print("    Failed to fetch social media content text.")

            # --- Download Video ---
            current_video_filename = f"video_{current_processing_id}.mp4"
            print(f"  Attempting download to {current_video_filename}...")
            download_success, download_message, duration = download_video(
                social_link,
                download_dir,
                output_filename=current_video_filename,
                verbose=VERBOSE_DOWNLOAD
            )

            video_duration = duration # Store duration regardless of success

            if download_success:
                destination_path = os.path.join(download_dir, current_video_filename)
                print(f"  Video download successful: {destination_path}")
                print(f"  Message: {download_message}")
                successful_downloads += 1
            else:
                print("  Video download failed.")
                print(f"  Message: {download_message}")
                download_errors += 1

        else:
            print(f"  No usable social media link found in sources for {url}.")
            download_message = "No valid social media link found in article sources."

        video_data = {
            'id': current_processing_id,
            'politifact_url': url,
            'politifact_headline': headline,
            'politifact_subheadline': subheadline,
            'rating': rating,
            'social_link': social_link if social_link != "No social link associated with article found." else None,
            'social_platform': social_platform,
            'social_duration': video_duration,
            'social_text': social_text,
            'external_links_info': external_links_data,
            'download_success': download_success,
            'download_message': download_message,
            'drive_path': destination_path,
            'processing_timestamp': time.strftime("%Y-%m-%d %H:%M:%S %Z")
        }
        results.append(video_data)
        # Remember the URL so a repeated line later in the input is skipped.
        processed_urls.add(url)
        processed_count += 1
        current_processing_id += 1

        # --- Save Progress Periodically (e.g., after each URL) ---
        # Write to a temp file and swap it in, so an interrupted write does
        # not leave a truncated JSON behind.
        temp_json_path = output_json_path + ".tmp"
        try:
            with open(temp_json_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=4)
            os.replace(temp_json_path, output_json_path)
        except Exception as e:
            print(f"Error writing progress to JSON file: {e}")
            if os.path.exists(temp_json_path):
                try:
                    os.remove(temp_json_path)
                except OSError:
                    pass

        print(f"Finished processing URL {i+1}. Successful Downloads: {successful_downloads}, Failures: {download_errors}, Skipped: {skipped_count}")

    print("\n" + "="*50)
    print("Processing Complete.")
    print(f"Total URLs in input: {total_urls}")
    print(f"URLs processed in this run: {processed_count}")
    print(f"URLs skipped (already in JSON): {skipped_count}")
    print(f"Successful video downloads: {successful_downloads}")
    print(f"Video download errors/skips: {download_errors}")
    print(f"Total records in JSON: {len(results)}")
    print(f"Final data saved to: {output_json_path}")
    print("="*50)

Error decoding JSON from /fs/clip-projects/rlab/atrey/ooc-misinformation/scraping/video_data_politifact.json. Starting fresh.
Starting next processing ID at: 1
Found 0 successful downloads in existing data.
Found 1 URLs in input.txt.
Starting processing from the beginning of the input file.
----------------------------------------
Processing URL 1/1 (ID: 1): https://www.politifact.com/factchecks/2025/mar/31/tiktok-posts/apple-isnt-secretly-installing-starlink-into-your/
  Headline: Apple isn’t secretly installing Starlink into your iPhone
  Rating: false
  External Links Found: 27
  Found Social Link: https://mvau.lt/media/8e1f214a-f83f-4db1-8c0e-76e17bafe18b (Platform: media vault (archive))
  Fetching social media content text...
Skipping content extraction for Media Vault link.
    Social Text: [Media Vault Archive Link]...
  Attempting download to video_1.mp4...
  Video download successful: /fs/clip-projects/rlab/atrey/ooc-misinformation/scraping/downloaded_videos_politifact/video_