In [None]:
import re

import numpy as np
import pandas as pd
import requests


def load_nasdaq_tickers(file_path):
    """
    Load NASDAQ tickers and security names into a dictionary for lookup.

    Args:
        file_path: Path (or file-like object) of a CSV file that has
            'Symbol' and 'Security Name' columns.

    Returns:
        dict mapping each lower-cased ticker symbol to the lower-cased first
        word of its security name. Rows whose symbol or security name is
        empty or whitespace-only are skipped.

    Raises:
        KeyError: If the 'Symbol' or 'Security Name' column is missing.
    """
    # Read everything as text and disable the default NA markers: otherwise
    # real tickers such as "NA" are parsed as missing values and dropped.
    nasdaq_data = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    nasdaq_dict = {}
    for symbol, security_name in zip(nasdaq_data['Symbol'], nasdaq_data['Security Name']):
        ticker = symbol.strip().lower()
        name_words = security_name.split()
        if not ticker or not name_words:
            # Nothing usable to look up or to match against.
            continue
        # Map ticker to the first word in the Security Name, normalized.
        nasdaq_dict[ticker] = name_words[0].lower()
    return nasdaq_dict


def check_title(article, stock_symbol, nasdaq_dict):
    """
    Ensure the article title contains the stock ticker or the first word of the Security Name.

    Matching is case-insensitive and on whole tokens: the ticker or name must
    not be embedded in a longer word, so ticker "A" does not match every
    title containing the letter "a" and "apple" does not match "pineapple".

    Args:
        article: The article title. Missing values (None/NaN) never match.
        stock_symbol: Ticker symbol of the row; anything that is not a string
            yields False.
        nasdaq_dict: Mapping of lower-cased ticker to lower-cased first word
            of the security name.

    Returns:
        True if the ticker is known and the title mentions the ticker or the
        first word of its security name, otherwise False.
    """
    if not isinstance(stock_symbol, str):
        return False
    if not isinstance(article, str):
        if pd.isna(article):
            # str(NaN) would become the matchable text "nan".
            return False
        article = str(article)

    ticker = stock_symbol.lower()
    if ticker not in nasdaq_dict:
        return False

    title = article.lower()
    for term in (nasdaq_dict[ticker], ticker):
        # Lookarounds instead of \b so terms that start or end with
        # punctuation (e.g. "amazon.com,") still get sensible boundaries.
        if term and re.search(rf"(?<!\w){re.escape(term)}(?!\w)", title):
            return True
    return False


def check_url(url):
    """
    Report whether the URL is reachable.

    Sends a HEAD request (following redirects, 5 second timeout) and returns
    True only when the final response status is 200. Any requests-level
    failure is treated as "not reachable" and returns False.
    """
    try:
        status = requests.head(url, allow_redirects=True, timeout=5).status_code
    except requests.RequestException:
        return False
    return status == 200


def check_english(article, fns_pid):
    """
    Ensure the news article is in English.

    This is a lightweight heuristic: the article counts as English when it
    contains at least one very common English word as a whole word. Matching
    whole words (rather than substrings) matters because fragments such as
    "is", "at", "on" and "in" occur inside words of many other languages.

    Args:
        article: Article text; it is converted with str() before checking.
        fns_pid: Identifier of the row; IDs listed as non-English are
            rejected without inspecting the text.

    Returns:
        True if the article looks English, otherwise False.
    """
    non_english_fns_pids = (12345, 67890)  # Replace with actual IDs if needed
    if fns_pid in non_english_fns_pids:
        return False

    common_english_words = {'the', 'is', 'at', 'on', 'and', 'of', 'in', 'to'}
    # Runs of letters only, so punctuation and digits never glue onto a word.
    words = re.findall(r"[^\W\d_]+", str(article).lower())
    return not common_english_words.isdisjoint(words)


def get_cleaned_data(raw_data, nasdaq_dict, my_id=903875382, sample_size=1000, n_articles=15):
    """
    Process and filter news articles dataset, then sample up to ``n_articles`` articles.
    Allows for testing on a smaller sample size of the dataset for URL checks.

    Args:
        raw_data: DataFrame with 'Article', 'Url', 'Article_title' and
            'Stock_symbol' columns; an 'FNSPID' column is used when present.
        nasdaq_dict: Ticker lookup passed through to check_title.
        my_id: Seed used for both sampling steps so results are reproducible.
        sample_size: Number of rows to keep before running the checks; a
            falsy value, or one not smaller than the cleaned row count,
            keeps every row.
        n_articles: Maximum number of articles to return (default 15).

    Returns:
        DataFrame holding at most ``n_articles`` rows that passed the title,
        URL and English checks, including the three boolean check columns.
        ``raw_data`` itself is not modified.
    """
    # Initial cleaning: Remove rows with missing or empty essential columns.
    # astype(str) keeps the emptiness test working even when a column is not
    # of object dtype (e.g. entirely missing); notna() still excludes NaN.
    article_text = raw_data['Article']
    url_text = raw_data['Url']
    has_content = (
        article_text.notna() &
        url_text.notna() &
        (article_text.astype(str).str.strip() != '') &
        (url_text.astype(str).str.strip() != '')
    )
    clean_data = raw_data[has_content]

    # Sample a subset of the data before applying filters.
    # The global seed is kept for backward compatibility; the sampling calls
    # themselves are made reproducible through random_state.
    np.random.seed(my_id)
    if sample_size and sample_size < len(clean_data):
        clean_data = clean_data.sample(n=sample_size, random_state=my_id)

    # Work on an explicit copy so adding the check columns neither touches
    # raw_data nor triggers pandas' SettingWithCopyWarning.
    clean_data = clean_data.copy()

    if 'FNSPID' in clean_data.columns:
        fns_pids = clean_data['FNSPID']
    else:
        fns_pids = [None] * len(clean_data)

    # Apply filters. Explicit boolean arrays keep the columns boolean even
    # when no rows are left, which row-wise DataFrame.apply does not ensure.
    clean_data['title_check'] = np.array(
        [
            check_title(title, symbol, nasdaq_dict)
            for title, symbol in zip(clean_data['Article_title'], clean_data['Stock_symbol'])
        ],
        dtype=bool,
    )
    clean_data['url_check'] = np.array(
        [check_url(url) for url in clean_data['Url']], dtype=bool)
    clean_data['english_check'] = np.array(
        [check_english(text, pid) for text, pid in zip(clean_data['Article'], fns_pids)],
        dtype=bool,
    )

    # Log debugging info
    print(f"Rows passing title check: {clean_data['title_check'].sum()}")
    print(f"Rows passing URL check: {clean_data['url_check'].sum()}")
    print(f"Rows passing English check: {clean_data['english_check'].sum()}")

    # Combine filters
    filtered_data = clean_data[
        clean_data['title_check'] &
        clean_data['url_check'] &
        clean_data['english_check']
    ]

    # Sample the requested number of articles (fewer if not enough passed)
    sampled_data = filtered_data.sample(
        n=min(n_articles, len(filtered_data)), random_state=my_id)

    return sampled_data


if __name__ == "__main__":
    # Script entry point: load the ticker lookup and the news dataset, run
    # the filtering pipeline, and print the sampled articles.

    # Load the NASDAQ data
    nasdaq_file = '/Users/arsheyagourav/Desktop/VIP/nasdaqNames.csv'  # Replace with the actual file path
    nasdaq_dict = load_nasdaq_tickers(nasdaq_file)

    # Load the news articles dataset
    file_path = '/Users/arsheyagourav/Desktop/VIP/nasdaq_exteral_data.csv'
    df = pd.read_csv(file_path)

    # Check if dataset loaded correctly
    print(f"Loaded dataset with {len(df)} rows")

    # Filter the data
    results = get_cleaned_data(df, nasdaq_dict, sample_size=1000)

    # Print the results
    print(f"Found {len(results)} articles:")
    # Number the articles 1..N by position: after sampling, the DataFrame
    # index still holds the original row labels, not a running count.
    for position, (_, row) in enumerate(results.iterrows(), start=1):
        print(f"\nArticle {position}:")
        print(f"Title: {row['Article_title']}")
        print(f"Stock: {row['Stock_symbol']}")
        print(f"URL: {row['Url']}")

Loaded dataset with 100000 rows
