# Web Scrape SQL Analysis

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load variables from a .env file (if one is found) into the environment so
# credentials are never written into the notebook itself.
load_dotenv()

# Store the raw, unencoded values in the environment; encoding happens below.
USER = os.getenv("PG_USER")
PASS = os.getenv("PG_PASSWORD")
HOST = os.getenv("PG_HOST")
DB   = os.getenv("PG_DB")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which an f-string would otherwise render as the literal text
# "None" inside the connection URL.
_missing = [
    name
    for name, value in (("PG_USER", USER), ("PG_HOST", HOST), ("PG_DB", DB))
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

# Percent-encode the credentials so reserved characters such as "@", ":" or
# "/" in the user name or password cannot be misread as URL delimiters.
# safe="" makes quote() encode "/" as well, which it leaves alone by default.
_credentials = quote(USER, safe="")
if PASS:
    # The password is optional (e.g. trust/peer authentication).
    _credentials += ":" + quote(PASS, safe="")

conn_str = f"postgresql+psycopg2://{_credentials}@{HOST}/{DB}"
engine   = create_engine(conn_str)

# Show every row of a displayed DataFrame instead of pandas' truncated view.
pd.set_option("display.max_rows", None)


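**Business Question:**  
Which F1 drivers generate the most discussion on r/formula1? Discussion is measured here by how often a driver’s name appears in the titles of the scraped posts. Fan chatter can reveal who’s on everyone’s mind, informing broadcast focus and sponsor value.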


In [2]:
# Cell 2: Descriptive – Top 10 Most‑Mentioned Drivers on Reddit
#
# How the query works:
#   * `drivers`  – the distinct driver values in raw.qualifying_results_2023.
#   * `mentions` – joins each driver to every raw.web_scrape_formula1 row whose
#     title contains that driver value (ILIKE = case-insensitive), then counts
#     the matching rows per driver. It is an inner join, so a driver with no
#     matching title does not appear at all.
#   * final SELECT – ranks drivers by mention_count; RANK() gives tied drivers
#     the same rank and leaves gaps after them (1, 2, 2, 4, ...).
#
# The LIKE wildcard is written as '%%'. Presumably this escapes '%' for the
# DB driver's format-style parameters; if it reaches PostgreSQL undoubled,
# '%%' still matches exactly like a single '%'.
#
# NOTE(review): ORDER BY mention_count has no tie-breaker, so the order of
# tied drivers, and which tied drivers survive LIMIT 10, is not guaranteed to
# be stable between runs.

sql_query = """
WITH 
drivers AS (
  SELECT DISTINCT driver
    FROM raw.qualifying_results_2023
),
mentions AS (
  SELECT
    d.driver,
    COUNT(*) AS mention_count
  FROM drivers d
  JOIN raw.web_scrape_formula1 w
    ON w.title ILIKE '%%' || d.driver || '%%'
  GROUP BY d.driver
)
SELECT
  driver,
  mention_count,
  RANK() OVER (ORDER BY mention_count DESC) AS mention_rank
FROM mentions
ORDER BY mention_count DESC
LIMIT 10;
"""

# Pass the connection by keyword ('con=') so pandas knows this is the SQL
# connection, and nothing ends up in the wrong slot:
df_mentions = pd.read_sql(sql_query, con=engine)
df_mentions

Unnamed: 0,driver,mention_count,mention_rank
0,Norris,6,1
1,Verstappen,4,2
2,Piastri,4,2
3,Russell,3,4
4,Leclerc,3,4
5,Hamilton,2,6
6,Alonso,2,6
7,Albon,1,8
8,Sainz,1,8
9,Tsunoda,1,8
