# Testing Options Scraper with Automated Login - FIXED VERSION

This notebook fixes the table reading issues in the original scraper:
1. Better detection between active trades vs closed trade perspectives
2. Improved table finding logic for Bull Put strategies
3. Enhanced debugging output to identify which tables are being scraped

## Setup Requirements

1. Create a credentials file `credentials.txt` in the same directory with format:
   ```
   username:your_username_or_email
   password:your_password
   ```

2. Add `credentials.txt` to your `.gitignore` file to keep credentials secure.

3. The browser session will remain logged in after the script completes.

In [1]:
import sqlite3
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re
import time
from datetime import datetime

In [2]:
def load_credentials(credentials_file='credentials.txt'):
    """
    Load username and password from credentials file
    
    Parameters:
    credentials_file (str): Path to credentials file
    
    Returns:
    tuple: (username, password) or (None, None) if file not found
    """
    try:
        if not os.path.exists(credentials_file):
            print(f"Credentials file '{credentials_file}' not found.")
            print("Please create a file with format:")
            print("username:your_username_or_email")
            print("password:your_password")
            return None, None
        
        username = None
        password = None
        
        with open(credentials_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('username:'):
                    username = line.split('username:', 1)[1].strip()
                elif line.startswith('password:'):
                    password = line.split('password:', 1)[1].strip()
        
        if not username or not password:
            print("Invalid credentials file format. Expected:")
            print("username:your_username_or_email")
            print("password:your_password")
            return None, None
        
        return username, password
        
    except Exception as e:
        print(f"Error reading credentials file: {str(e)}")
        return None, None

In [3]:
def automated_login(driver, username, password, max_retries=3):
    """
    Perform automated login to optionrecom.com
    
    Parameters:
    driver: Selenium WebDriver instance
    username (str): Username or email
    password (str): Password
    max_retries (int): Maximum number of login attempts
    
    Returns:
    bool: True if login successful, False otherwise
    """
    for attempt in range(max_retries):
        try:
            print(f"Login attempt {attempt + 1} of {max_retries}...")
            
            # Navigate to login page
            driver.get("https://optionrecom.com/my-account-2/")
            time.sleep(3)
            
            # Wait for and find username field
            username_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, "username"))
            )
            
            # Clear and enter username
            username_field.clear()
            username_field.send_keys(username)
            print("Username entered successfully")
            
            # Find and enter password
            password_field = driver.find_element(By.NAME, "password")
            password_field.clear()
            password_field.send_keys(password)
            print("Password entered successfully")
            
            # Find and click login button
            login_button = driver.find_element(By.XPATH, "//button[@name='login']")
            login_button.click()
            print("Login button clicked")
            
            # Wait for page to load and check if login was successful
            time.sleep(5)
            
            # Check if we're still on the login page (indicates failed login)
            current_url = driver.current_url
            if "my-account" in current_url and "login" not in current_url.lower():
                print("Login successful!")
                return True
            
            # Check for error messages
            error_elements = driver.find_elements(By.CSS_SELECTOR, ".woocommerce-error, .error, [class*='error']")
            if error_elements:
                error_text = error_elements[0].text
                print(f"Login failed: {error_text}")
            else:
                print("Login may have failed - still on login page")
            
        except TimeoutException:
            print(f"Timeout on attempt {attempt + 1} - page took too long to load")
        except NoSuchElementException as e:
            print(f"Could not find login element on attempt {attempt + 1}: {str(e)}")
        except Exception as e:
            print(f"Unexpected error on attempt {attempt + 1}: {str(e)}")
        
        if attempt < max_retries - 1:
            print(f"Retrying in 3 seconds...")
            time.sleep(3)
    
    print(f"Login failed after {max_retries} attempts")
    return False

In [4]:
def connect_to_database(db_path='../database/option_strategies.db'):
    """Connect to the SQLite database"""
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        
        # Verify the database has the required table
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='option_strategies'")
        if not cursor.fetchone():
            print(f"Error: Database {db_path} does not contain the option_strategies table.")
            print("Please run the database setup script first.")
            conn.close()
            return None, None
            
        return conn, cursor
    except Exception as e:
        print(f"Error connecting to database: {str(e)}")
        return None, None

In [5]:
def extract_date(driver):
    """Extract the date from the page"""
    try:
        # Search the page text
        page_text = driver.find_element(By.TAG_NAME, "body").text
        date_pattern = r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d+\w*,\s+\d{4}'
        matches = re.findall(date_pattern, page_text)
        
        if matches:
            return matches[0]
        
        return "Date not found"
        
    except Exception as e:
        print(f"Error extracting date: {str(e)}")
        return "Date extraction error"

In [6]:
def extract_options_expiry_date(driver, tab_content=None):
    """Extract the Options Expiry Date"""
    try:
        # Search entire page
        page_text = driver.find_element(By.TAG_NAME, "body").text
        date_match = re.search(r'Options Expiry Date:?\s*(\d{4}-\d{2}-\d{2})', page_text)
        if date_match:
            return date_match.group(1)
        
        # More general search
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', page_text)
        if date_match:
            return date_match.group(1)
        
        return "Expiry date not found"
        
    except Exception as e:
        print(f"Error extracting options expiry date: {str(e)}")
        return "Expiry date extraction error"

In [7]:
def is_active_trades_table(table):
    """Check if a table contains active trades (not closed trade perspectives)"""
    try:
        # Get table text to check for closed trade indicators
        table_text = table.text.lower()
        
        # Look for indicators of closed trades
        closed_indicators = [
            'closed trade perspectives',
            'historical',
            'past performance',
            'expired',
            'completed'
        ]
        
        # Check if any closed indicators are present
        for indicator in closed_indicators:
            if indicator in table_text:
                return False
        
        # Check the parent container for closed trade indicators
        parent_elements = []
        current = table
        for _ in range(3):  # Check up to 3 levels up
            try:
                current = current.find_element(By.XPATH, '..')
                parent_elements.append(current)
            except:
                break
        
        for parent in parent_elements:
            try:
                parent_text = parent.text.lower()
                for indicator in closed_indicators:
                    if indicator in parent_text:
                        return False
            except:
                continue
        
        return True
        
    except Exception as e:
        print(f"Error checking if table is active trades: {str(e)}")
        return True  # Default to True if we can't determine

In [8]:
def find_best_table_in_tab(driver, tab_content):
    """Find the best table within a tab, prioritizing active trades"""
    tables = []
    
    # First try to find tables within the tab content
    if tab_content:
        try:
            tables_in_tab = tab_content.find_elements(By.TAG_NAME, "table")
            tables.extend(tables_in_tab)
        except:
            pass
    
    # If no tables in tab content, look for visible tables on the page
    if not tables:
        all_tables = driver.find_elements(By.TAG_NAME, "table")
        visible_tables = [t for t in all_tables if t.is_displayed()]
        tables = visible_tables
    
    if not tables:
        return None
    
    # Filter for active trades tables and tables with data
    best_table = None
    best_score = -1
    
    for table in tables:
        try:
            # Check if table has data rows
            rows = table.find_elements(By.TAG_NAME, "tr")
            data_rows = [r for r in rows[1:] if r.find_elements(By.TAG_NAME, "td")]  # Skip header
            
            if not data_rows:
                continue
            
            score = len(data_rows)  # Base score on number of data rows
            
            # Prioritize active trades tables
            if is_active_trades_table(table):
                score += 1000  # Big bonus for active trades
            
            # Check if table has expected columns
            headers = table.find_elements(By.TAG_NAME, "th")
            header_texts = [h.text.upper() for h in headers]
            
            expected_columns = ['TICKER', 'SYMBOL', 'TRIGGER', 'STRIKE', 'PREMIUM']
            column_matches = sum(1 for col in expected_columns if any(col in h for h in header_texts))
            score += column_matches * 10
            
            if score > best_score:
                best_score = score
                best_table = table
                
        except Exception as e:
            print(f"Error evaluating table: {str(e)}")
            continue
    
    return best_table

In [9]:
def extract_table_data(driver, tab, tab_index, date_info, strategy_type, conn, cursor):
    """Extract data from the table in the current tab and save to database - FIXED VERSION"""
    try:
        tab_name = tab.text.strip().replace('\n', ' ')
        print(f"\nProcessing Tab {tab_index+1}: '{tab_name}'")
        
        # Click the tab
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
        time.sleep(1)
        
        try:
            tab.click()
        except:
            driver.execute_script("arguments[0].click();", tab)
        
        time.sleep(3)  # Give more time for content to load
        
        # Find tab content
        tab_href = tab.get_attribute("href")
        tab_content = None
        
        if tab_href and "#" in tab_href:
            tab_id = tab_href.split("#")[1]
            try:
                tab_content = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, tab_id))
                )
                print(f"Found tab content for ID: {tab_id}")
            except Exception as e:
                print(f"Could not find tab content for ID {tab_id}: {str(e)}")
        
        # Extract options expiry date
        options_expiry_date = extract_options_expiry_date(driver, tab_content)
        
        # Find the best table for this tab
        table = find_best_table_in_tab(driver, tab_content)
        
        if not table:
            print(f"No suitable tables found in tab #{tab_index+1}")
            return 0
        
        # Check if this is an active trades table
        is_active = is_active_trades_table(table)
        print(f"Table type: {'Active trades' if is_active else 'Closed/Historical trades'}")
        
        if not is_active:
            print(f"Skipping closed trade perspectives table in tab #{tab_index+1}")
            return 0
        
        # Extract headers
        headers = table.find_elements(By.TAG_NAME, "th")
        header_texts = [header.text.strip() for header in headers]
        print(f"Table headers: {header_texts}")
        
        # Find column indices with improved matching
        column_map = {
            'ID': -1,
            'Ticker': -1,
            'Trigger Price': -1,
            'Strike Price': -1,
            'Estimated Premium': -1
        }
        
        for i, header in enumerate(header_texts):
            h_upper = header.upper()
            if 'ID' in h_upper and column_map['ID'] == -1:
                column_map['ID'] = i
            elif ('TICKER' in h_upper or 'SYMBOL' in h_upper) and column_map['Ticker'] == -1:
                column_map['Ticker'] = i
            elif ('TRIGGER' in h_upper and 'PRICE' in h_upper) or 'ENTRY' in h_upper and column_map['Trigger Price'] == -1:
                column_map['Trigger Price'] = i
            elif 'STRIKE' in h_upper and 'PRICE' in h_upper and column_map['Strike Price'] == -1:
                column_map['Strike Price'] = i
            elif ('PREMIUM' in h_upper or 'ESTIMATED' in h_upper) and column_map['Estimated Premium'] == -1:
                column_map['Estimated Premium'] = i
        
        print(f"Column mapping: {column_map}")
        
        # Extract rows
        rows = table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header
        records_count = 0
        
        print(f"Found {len(rows)} data rows")
        
        for row_idx, row in enumerate(rows):
            try:
                cells = row.find_elements(By.TAG_NAME, "td")
                if not cells:
                    continue
                
                print(f"Row {row_idx + 1}: {len(cells)} cells")
                
                # Extract data from cells with bounds checking
                item_id = cells[column_map['ID']].text.strip() if column_map['ID'] != -1 and column_map['ID'] < len(cells) else f"AUTO_{row_idx+1}"
                ticker_raw = cells[column_map['Ticker']].text.strip() if column_map['Ticker'] != -1 and column_map['Ticker'] < len(cells) else 'N/A'

                # Check if ticker contains (ER) and process accordingly
                er_value = 0
                if "(ER)" in ticker_raw:
                    ticker = ticker_raw.replace("(ER)", "").strip()
                    er_value = 1
                else:
                    ticker = ticker_raw

                # Skip records with invalid ticker values
                if not ticker or ticker.lower() in ['n/a', 'none', '', 'null']:
                    print(f"Skipping row {row_idx + 1} - invalid ticker: '{ticker}'")
                    continue

                trigger_price = cells[column_map['Trigger Price']].text.strip() if column_map['Trigger Price'] != -1 and column_map['Trigger Price'] < len(cells) else 'N/A'
                strike_price = cells[column_map['Strike Price']].text.strip() if column_map['Strike Price'] != -1 and column_map['Strike Price'] < len(cells) else 'N/A'
                estimated_premium = cells[column_map['Estimated Premium']].text.strip() if column_map['Estimated Premium'] != -1 and column_map['Estimated Premium'] < len(cells) else 'N/A'

                # Parse 'strike_price' to extract 'buy' and 'sell' values
                strike_buy_value, strike_sell_value = 0.0, 0.0
                if " - " in strike_price:
                    parts = strike_price.split(" - ")
                    if len(parts) == 2:
                        try:
                            strike_sell_part = parts[0].strip()
                            strike_buy_part = parts[1].strip()

                            # Extract numerical values more robustly
                            sell_match = re.search(r'(\d+\.?\d*)', strike_sell_part)
                            buy_match = re.search(r'(\d+\.?\d*)', strike_buy_part)
                            
                            if sell_match:
                                strike_sell_value = float(sell_match.group(1))
                            if buy_match:
                                strike_buy_value = float(buy_match.group(1))
                        except ValueError as e:
                            print(f"Error parsing strike prices: {str(e)}")

                print(f"Processing: {ticker} | Trigger: {trigger_price} | Strike: {strike_price} | Premium: {estimated_premium}")
                
                # Database insert
                cursor.execute('''
                INSERT INTO option_strategies (
                    scrape_date, strategy_type, tab_name, ticker, trigger_price, 
                    strike_price, strike_buy, strike_sell, estimated_premium, item_id, options_expiry_date, date_info, er
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    datetime.now().isoformat(), strategy_type, tab_name, ticker, trigger_price, 
                    strike_price, strike_buy_value, strike_sell_value, estimated_premium, item_id, options_expiry_date, date_info, er_value
                ))

                conn.commit()
                records_count += 1
                
            except Exception as e:
                print(f"Error processing row {row_idx + 1}: {str(e)}")
                continue
        
        print(f"Successfully saved {records_count} records from tab #{tab_index+1}")
        return records_count
        
    except Exception as e:
        print(f"Error processing tab #{tab_index+1}: {str(e)}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")
        return 0

In [10]:
def process_strategy_page(driver, strategy_url, strategy_type, conn, cursor):
    """Process a single strategy page and extract data from all tabs"""
    try:
        print(f"\n===== Processing {strategy_type} Strategy Page =====")
        driver.get(strategy_url)
        time.sleep(5)  # Give more time for page to load
        
        # Extract the date
        date_info = extract_date(driver)
        print(f"Page date: {date_info}")
        
        # Find all tabs using multiple methods
        tabs = []
        tab_selectors = [
            "//div[contains(@class, 'ep_tabs_header')]//a[contains(@class, 'ep_label_main')]",
            "//a[contains(@class, 'ep_label_main')]",
            "//div[contains(@class, 'tabs')]//a",
            "//ul[contains(@class, 'tabs')]//a",
            "//div[contains(@class, 'tab')]//a"
        ]
        
        for selector in tab_selectors:
            try:
                found_tabs = driver.find_elements(By.XPATH, selector)
                if found_tabs:
                    tabs = found_tabs
                    print(f"Found {len(tabs)} tabs using selector: {selector}")
                    break
            except Exception as e:
                print(f"Selector {selector} failed: {str(e)}")
                continue
        
        if not tabs:
            print(f"No tab elements found on the {strategy_type} page")
            # Try to process any tables found on the page
            tables = driver.find_elements(By.TAG_NAME, "table")
            if tables:
                print(f"Found {len(tables)} tables without tabs, attempting to process...")
                # Create a dummy tab for processing
                class DummyTab:
                    def __init__(self, index):
                        self.index = index
                    def text(self):
                        return f"Table {self.index + 1}"
                    def get_attribute(self, attr):
                        return None
                    def click(self):
                        pass
                
                dummy_tab = DummyTab(0)
                return extract_table_data(driver, dummy_tab, 0, date_info, strategy_type, conn, cursor)
            return 0
        
        # Print tab information
        for i, tab in enumerate(tabs):
            try:
                tab_text = tab.text.strip().replace('\n', ' ')
                tab_href = tab.get_attribute('href')
                print(f"Tab {i+1}: '{tab_text}' -> {tab_href}")
            except:
                print(f"Tab {i+1}: Unable to get text/href")
        
        # Process only the first 4 tabs
        num_tabs_to_process = min(4, len(tabs))
        total_records = 0
        
        for i, tab in enumerate(tabs[:num_tabs_to_process]):
            records = extract_table_data(driver, tab, i, date_info, strategy_type, conn, cursor)
            total_records += records
            time.sleep(2)  # Small delay between tabs
        
        return total_records
            
    except Exception as e:
        print(f"Error processing {strategy_type} strategy page: {str(e)}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")
        return 0

In [11]:
def scrape_option_strategies_automated(db_path='../database/option_strategies.db', 
                                     browser_type="chrome", 
                                     credentials_file='credentials.txt',
                                     keep_browser_open=True):
    """
    FIXED VERSION: Scrape data from option strategy pages with improved table detection
    
    Parameters:
    db_path (str): Path to the SQLite database file
    browser_type (str): 'chrome' or 'edge'
    credentials_file (str): Path to credentials file
    keep_browser_open (bool): Keep browser open after scraping to maintain session
    
    Returns:
    int: Number of records added to the database
    """
    
    # Load credentials
    username, password = load_credentials(credentials_file)
    if not username or not password:
        return 0
    
    # Connect to the database
    conn, cursor = connect_to_database(db_path)
    if not conn or not cursor:
        return 0
    
    # Initialize browser with options to keep it open
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    if keep_browser_open:
        # Add detach option to keep browser open
        chrome_options.add_experimental_option("detach", True)
    
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
    
    try:
        # Perform automated login
        print("Starting automated login process...")
        if not automated_login(driver, username, password):
            print("Login failed. Cannot proceed with scraping.")
            return 0
        
        print("Login successful! Proceeding with data scraping...")
        
        # Define strategies to scrape
        strategies = [
            {
                "url": "https://optionrecom.com/bear-call-spread-strategy/",
                "type": "Bear Call"
            },
            {
                "url": "https://optionrecom.com/bull-put-spread-strategy/",
                "type": "Bull Put"
            }
        ]
        
        # Process each strategy page
        total_records = 0
        
        for strategy in strategies:
            records = process_strategy_page(driver, strategy["url"], strategy["type"], conn, cursor)
            total_records += records
            time.sleep(3)  # Delay between strategy pages
        
        print(f"\nTotal records saved to database: {total_records}")
        
        # Query to show what was saved
        cursor.execute("SELECT strategy_type, tab_name, COUNT(*) as count FROM option_strategies WHERE scrape_date >= datetime('now', '-1 hour') GROUP BY strategy_type, tab_name")
        results = cursor.fetchall()
        
        print("\nRecords by strategy and tab (last hour):")
        for strategy, tab, count in results:
            print(f"  {strategy} - {tab}: {count} records")
        
        if keep_browser_open:
            print("\nBrowser session maintained. You can manually navigate to other pages.")
            print("Close the browser window when finished.")
        
        return total_records
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")
        return 0
    finally:
        if not keep_browser_open:
            driver.quit()
        conn.close()

## Run the Fixed Scraper

Execute the cell below to run the fixed automated scraper with improved table detection.

In [12]:
# Run the fixed automated scraper
if __name__ == "__main__":
    result = scrape_option_strategies_automated(
        db_path='../database/option_strategies.db',
        credentials_file='credentials.txt',
        keep_browser_open=True  # Set to False if you want the browser to close automatically
    )
    
    print(f"\nTesting scraper completed. Total records processed: {result}")

Starting automated login process...
Login attempt 1 of 3...
Username entered successfully
Password entered successfully
Login button clicked
Login successful!
Login successful! Proceeding with data scraping...

===== Processing Bear Call Strategy Page =====
Page date: August
Found 8 tabs using selector: //div[contains(@class, 'ep_tabs_header')]//a[contains(@class, 'ep_label_main')]
Tab 1: 'Mild Risk 95-97% accuracy > shorter expiry' -> https://optionrecom.com/bear-call-spread-strategy/#ep_tab_wrapper__9f43ceed-3419-409f-bf35-d667ce0f5453
Tab 2: 'Minimal Risk 97-99% accuracy > shorter expiry' -> https://optionrecom.com/bear-call-spread-strategy/#ep_tab_wrapper__80047de2-1d95-4611-8895-f8f4215f316d
Tab 3: 'Mild Risk 95-97% accuracy > longer expiry' -> https://optionrecom.com/bear-call-spread-strategy/#ep_tab_wrapper__2fbffe4a-dd50-41e9-989f-b68347220373
Tab 4: 'Minimal Risk 97-99% accuracy > longer expiry' -> https://optionrecom.com/bear-call-spread-strategy/#ep_tab_wrapper__99187b37-b03