# Daily Scrape QC
This quick notebook loads the most recently modified GasBuddy scrape CSV and surfaces basic quality checks (row counts, station and ZIP coverage, missingness, the price distribution, and the most frequent posted times).

In [None]:

import os, glob, pandas as pd
from datetime import datetime

# Gather every scrape CSV, then order the list oldest -> newest by file
# modification time so the final entry is the most recent scrape.
paths = glob.glob('scraping_*/*gas_price_zip_*_*.csv')
paths.sort(key=os.path.getmtime)
# Stop here with a readable message if the scraper has not produced any output.
assert paths, "No CSVs found under scraping_*/. Run the scraper first."
latest = paths[-1]
latest


In [None]:

import pandas as pd
# Read the selected scrape file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=latest)
df.head(n=5)


In [None]:

# Headline counts: total rows, then distinct stations and ZIP codes.
row_count = df.shape[0]
print("Rows:", row_count)
station_ids = df['station_id']
print("Unique stations:", station_ids.nunique())
zip_codes = df['zipcode']
print("Unique ZIPs:", zip_codes.nunique())
# Fraction of missing values per column, shown as the cell's output.
missing_rate = df.isna().mean().rename("missing_rate")
missing_rate


In [None]:

import re
def to_price(x):
    """Extract a numeric price from a single raw ``price`` cell.

    Parameters
    ----------
    x : object
        Strings are searched, after commas are removed, for the first run of
        digits with an optional decimal part. Values that are already numeric
        are converted with ``float``.

    Returns
    -------
    float or None
        The parsed price, or ``None`` when no number can be recovered: a
        string without digits, ``None``, NaN, a bool, or a value that
        ``float()`` cannot convert.
    """
    if isinstance(x, str):
        # Commas are stripped first so "1,234.50" parses as 1234.5 rather than 1.0.
        m = re.search(r'(\d+(?:\.\d+)?)', x.replace(',', ''))
        return float(m.group(1)) if m else None
    # bool is a subclass of int; True/False are never prices.
    if x is None or isinstance(x, bool):
        return None
    # Numeric cells (e.g. when the CSV reader inferred a numeric dtype for the
    # column) used to fall through to None, wiping out every price.
    try:
        value = float(x)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself; report it as missing.
    return None if value != value else value

# Parse each raw price cell into a float (or a missing value) and store the
# result alongside the original column.
parsed_prices = df['price'].apply(to_price)
df['price_num'] = parsed_prices
# Show raw and parsed values side by side to eyeball the parsing.
preview_columns = ['price', 'price_num']
df[preview_columns].head(10)


In [None]:

import matplotlib.pyplot as plt
# Histogram of the parsed prices; missing price_num values are dropped first.
clean = df['price_num'].dropna()
fig, ax = plt.subplots()
ax.hist(clean, bins=40)
ax.set_title('Price Distribution (price_num)')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()


In [None]:

# Frequency of each posted_time value, most common first; show the top 20.
posted_time_counts = df['posted_time'].value_counts()
posted_time_counts.head(20)
