# CSV URL Filtering Notebook

This notebook reads two CSV files:
1. First CSV: Contains a single column 'URL' with website URLs
2. Second CSV: Contains columns 'run_date', 'pagepath', 'product_name', 'product_group', 'asset'

The goal is to keep only the records from the second CSV whose `pagepath` value is exactly equal to one of the URLs listed in the first CSV.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path

## 1. Read the CSV Files

In [None]:
# Locations of the two input files -- edit these to point at your own CSVs.
urls_csv_path = 'data/urls.csv'  # Update this path
data_csv_path = 'data/data.csv'  # Update this path

# Load the URL list and print a short preview. Any problem is reported on
# stdout rather than raised, so the cell itself always finishes.
try:
    urls_df = pd.read_csv(urls_csv_path)
    print(f"URLs CSV loaded successfully. Shape: {urls_df.shape}")
    print(f"Columns: {urls_df.columns.tolist()}")
    print("\nFirst 5 rows:", urls_df.head(5), sep="\n")
except FileNotFoundError:
    print(f"Error: Could not find {urls_csv_path}. Please update the file path.")
except Exception as exc:
    print(f"Error reading URLs CSV: {exc}")

In [None]:
# Load the records file (the one carrying 'pagepath') and print a short
# preview. Any problem is reported on stdout rather than raised.
try:
    data_df = pd.read_csv(data_csv_path)
    print(f"Data CSV loaded successfully. Shape: {data_df.shape}")
    print(f"Columns: {data_df.columns.tolist()}")
    print("\nFirst 5 rows:", data_df.head(5), sep="\n")
except FileNotFoundError:
    print(f"Error: Could not find {data_csv_path}. Please update the file path.")
except Exception as exc:
    print(f"Error reading data CSV: {exc}")

## 2. Data Exploration and Validation

In [None]:
# Check data types and basic info for both DataFrames.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would add a stray
# "None" line after each report.
print("URLs DataFrame Info:")
urls_df.info()
print("\n" + "="*50 + "\n")
print("Data DataFrame Info:")
data_df.info()

In [None]:
# Count the missing values in every column of each DataFrame.
print("Null values in URLs DataFrame:")
print(urls_df.isna().sum())
print("\nNull values in Data DataFrame:")
print(data_df.isna().sum())

In [None]:
# Show the first ten raw values from each side of the comparison so that
# formatting differences between URLs and pagepaths are easy to spot.
print("Sample URLs:")
print(urls_df['URL'].iloc[:10].tolist())
print("\nSample pagepaths:")
print(data_df['pagepath'].iloc[:10].tolist())

## 3. Filter Records Where URL Matches Pagepath

In [None]:
# Gather the distinct, non-null URLs into a set for use in the membership
# filter, then report the sizes of both inputs.
url_set = set(urls_df['URL'].dropna().tolist())
print(f"Number of unique URLs: {len(url_set)}")
print(f"Total records in data CSV: {data_df.shape[0]}")

In [None]:
# Keep only the rows whose pagepath is exactly equal to one of the URLs.
filtered_df = data_df[data_df['pagepath'].isin(url_set)]

# An empty data CSV would make the ratio divide by zero, so report 0% then.
total_records = len(data_df)
match_pct = len(filtered_df) / total_records * 100 if total_records else 0.0

print(f"Number of matching records: {len(filtered_df)}")
print(f"Percentage of records that match: {match_pct:.2f}%")

## 4. Display Results

In [None]:
# Preview the matching rows (at most ten).
print("Filtered DataFrame (first 10 rows):")
print(filtered_df.iloc[:10])

In [None]:
# Report the shape of the filtered data and, for each column, how many
# distinct values it holds.
print("Summary of filtered data:")
print(f"Shape: {filtered_df.shape}")
print("\nUnique values per column:")
for col in filtered_df.columns:
    distinct_count = filtered_df[col].nunique()
    print(f"{col}: {distinct_count} unique values")

In [None]:
# Break the matches down by product group, when that column is present.
if 'product_group' in filtered_df:
    print("Distribution by product_group:")
    print(filtered_df['product_group'].value_counts())

## 5. Save Filtered Results (Optional)

In [None]:
# Write the matching rows to a new CSV (without the index column).
# A failed write is reported on stdout rather than raised.
output_path = 'data/filtered_results.csv'

try:
    filtered_df.to_csv(output_path, index=False)
except Exception as exc:
    print(f"Error saving file: {exc}")
else:
    print(f"Filtered results saved to: {output_path}")

## 6. Additional Analysis (Optional)

In [None]:
# Work out which of the requested URLs never appear as a pagepath.
matched_urls = set(filtered_df['pagepath'].unique())
unmatched_urls = url_set - matched_urls

print(f"URLs that didn't match any pagepath: {len(unmatched_urls)}")
if unmatched_urls:
    # Cap the listing at 10 entries so that a large set still yields a
    # readable sample instead of no listing at all. Sorting by string form
    # keeps the sample stable between runs (set order is arbitrary) and
    # cannot fail on mixed value types.
    preview_limit = 10
    preview = sorted(unmatched_urls, key=str)[:preview_limit]
    if len(unmatched_urls) > preview_limit:
        print(f"Unmatched URLs (first {preview_limit}):")
    else:
        print("Unmatched URLs:")
    for url in preview:
        print(f"  - {url}")

In [None]:
# Look for near-misses: URL/pagepath pairs where one string contains the
# other without the two being identical.
print("Checking for potential partial matches...")

# Every sampled URL is compared with every sampled pagepath, so both
# samples are kept small to bound the work.
sample_urls = list(url_set)[:5]  # at most 5 URLs
sample_pagepaths = data_df['pagepath'].dropna().unique()[:100]  # at most 100 distinct pagepaths

partial_matches = []
for url in sample_urls:
    for pagepath in sample_pagepaths:
        path_text = str(pagepath)
        contains_either_way = url in path_text or path_text in url
        # Identical pairs are exact matches, not partial ones.
        if contains_either_way and url != pagepath:
            partial_matches.append((url, pagepath))

if not partial_matches:
    print("No partial matches found in the sample.")
else:
    print(f"Found {len(partial_matches)} potential partial matches (sample):")
    for url, pagepath in partial_matches[:5]:
        print(f"  URL: {url}", f"  Pagepath: {pagepath}", "  ---", sep="\n")