In [None]:
# !pip install -r ../requirements.txt
import os, json, math, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from datetime import datetime

# Project paths (adjust DATA_DIR to your folder if desired)
BASE = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))
DATA_DIR = "/Users/andrewrodriguez/Desktop/compsci1050/privacy_policies/policy_texts"
FIG_DIR = "/Users/andrewrodriguez/Desktop/compsci1050/privacy_policies/figures"
REPORTS_DIR = "/Users/andrewrodriguez/Desktop/compsci1050/privacy_policies/reports"

from helpers import (
    analyze_text, load_local_texts, fetch_policy_text, polite_sleep,
    get_wayback_snapshots, fetch_wayback_content, html_to_text, domain_from_url
)

print('BASE:', BASE)
print('DATA_DIR:', DATA_DIR)
print('FIG_DIR:', FIG_DIR)
print('REPORTS_DIR:', REPORTS_DIR)

BASE: /Users/andrewrodriguez/Desktop/compsci1050
DATA_DIR: /Users/andrewrodriguez/Desktop/compsci1050/privacy_policies/policy_texts
FIG_DIR: /Users/andrewrodriguez/Desktop/compsci1050/figures
REPORTS_DIR: /Users/andrewrodriguez/Desktop/compsci1050/reports


In [5]:
texts = {}
if os.path.isdir(DATA_DIR):
    texts.update(load_local_texts(DATA_DIR))
print(f'Loaded {len(texts)} local files')

Loaded 7 local files


In [7]:
# Compute metrics into a DataFrame
rows = []
for name, text in texts.items():
    rows.append(analyze_text(name, text))
df = pd.DataFrame(rows).set_index('name').sort_index()
display(df.head())
df.to_csv(os.path.join(REPORTS_DIR, 'metrics.csv'), index=True)
print('Saved metrics to reports/metrics.csv')

Unnamed: 0_level_0,n_chars,n_words,n_sents,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,dale_chall,coleman_liau,ari,avg_sentence_length,ttr,yule_k,entropy,avg_zipf,rare_frac,legalese_frac
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
amazon,27478,4164,160,25.075821,16.227089,19.747176,17.489301,11.730236,14.539193,17.644276,26.025,0.219292,93.476508,8.219079,5.607517,0.009266,0.004277
apple,29703,4637,200,29.172227,14.950188,17.874388,16.162427,11.225745,13.362993,15.207038,23.185,0.202908,99.28382,8.113057,5.679806,0.009692,0.009482
google,64446,10254,401,37.59499,14.368148,17.472431,15.6555,11.583315,12.816228,15.852249,25.571072,0.139813,97.857111,8.273859,5.675013,0.012535,0.004821
meta,35133,5546,149,23.316218,19.254035,21.596128,17.712588,11.769075,13.470429,21.978787,37.221477,0.129702,130.592344,7.616436,5.770186,0.007452,0.004968
palantir,59335,8974,300,18.604502,18.095678,21.267763,18.613347,12.332505,14.800891,19.869173,29.913333,0.141534,102.235467,8.185523,5.646391,0.023589,0.011795


OSError: Cannot save file into a non-existent directory: '/Users/andrewrodriguez/Desktop/compsci1050/reports'

## 3) Visualizations

In [None]:
# Figure 1: Words vs Flesch-Kincaid Grade (length vs grade level)
plt.figure()
plt.scatter(df['n_words'], df['flesch_kincaid_grade'])
for i, name in enumerate(df.index):
    plt.annotate(name, (df['n_words'].iloc[i], df['flesch_kincaid_grade'].iloc[i]))
plt.xlabel('Words')
plt.ylabel('Flesch–Kincaid Grade')
plt.title('Figure 1: Length vs Grade Level')
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'length_vs_grade.png'), dpi=200)
plt.show()

In [None]:
# Figure 2: PCA (2D) on standardized features with k-means overlay
features = ['n_words','n_sents','avg_sentence_length','flesch_kincaid_grade',
            'gunning_fog','smog_index','dale_chall','coleman_liau','ari',
            'ttr','yule_k','entropy','avg_zipf','rare_frac','legalese_frac']

# Handle missing columns gracefully
missing = [c for c in features if c not in df.columns]
if missing:
    print('Missing features (will be dropped):', missing)
    features = [c for c in features if c in df.columns]

X = df[features].replace([np.inf,-np.inf], np.nan).fillna(df[features].median())
scaler = StandardScaler()
Z = scaler.fit_transform(X)

pca = PCA(n_components=2, random_state=0)
P = pca.fit_transform(Z)

best_k, best_sil = None, -1
labels_by_k = {}
for k in [2,3,4]:
    km = KMeans(n_clusters=k, random_state=0, n_init='auto')
    lbl = km.fit_predict(P)
    sil = silhouette_score(P, lbl)
    labels_by_k[k] = lbl
    if sil > best_sil:
        best_sil, best_k = sil, k

labels = labels_by_k[best_k]

plt.figure()
plt.scatter(P[:,0], P[:,1])
for i, name in enumerate(df.index):
    plt.annotate(f"{name} (c{labels[i]})", (P[i,0], P[i,1]))
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title(f'Figure 2: PCA + KMeans (k={best_k}, silhouette={best_sil:.3f})')
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'pca_kmeans.png'), dpi=200)
plt.show()

df['cluster_k'] = best_k
df['cluster_label'] = labels


In [None]:
# Figure 3: Metric bars (standardized) per policy
cols = ['flesch_kincaid_grade','gunning_fog','smog_index','dale_chall','coleman_liau','ari',
        'avg_sentence_length','ttr','yule_k','entropy','avg_zipf','rare_frac','legalese_frac']
cols = [c for c in cols if c in df.columns]

M = df[cols].replace([np.inf,-np.inf], np.nan).fillna(df[cols].median())
M = (M - M.mean())/M.std(ddof=0)

plt.figure(figsize=(10,6))
M.plot(kind='bar', rot=45, legend=False)
plt.xlabel('Policy')
plt.ylabel('Z-score')
plt.title('Figure 3: Selected Metrics (standardized)')
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, 'metric_bars.png'), dpi=200)
plt.show()

## 4) Historical evolution for one site
Pick a URL (ideally one of the above). We pull a few snapshots per year from the Internet Archive and recompute metrics.

In [None]:
HIST_URL = ''  # e.g., 'https://policies.google.com/privacy'
if not HIST_URL:
    # If not set, try to default to the first URL you fetched; otherwise skip
    try:
        from urllib.parse import urlparse
        # reconstruct a plausible URL from the index name if needed
        HIST_URL = 'https://policies.google.com/privacy'  # change me
    except Exception:
        HIST_URL = ''

print('Historical target:', HIST_URL)

In [None]:
# Fetch snapshots and compute metrics
from src.helpers import analyze_text, get_wayback_snapshots, fetch_wayback_content, html_to_text, domain_from_url, polite_sleep
hist_rows = []
hist_df = pd.DataFrame()

if HIST_URL:
    try:
        snaps = get_wayback_snapshots(HIST_URL, from_year=2015, to_year=datetime.now().year, limit_per_year=2)
        print('Snapshots:', snaps[:6], '... (showing up to 6)')
        for year, ts in snaps:
            try:
                html = fetch_wayback_content(HIST_URL, ts)
                text = html_to_text(html)
                rec = analyze_text(f"{domain_from_url(HIST_URL)}_{ts}", text)
                rec['year'] = year
                rec['timestamp'] = ts
                hist_rows.append(rec)
                polite_sleep(1.0)
            except Exception as e:
                print('Snapshot failed:', ts, e)
        hist_df = pd.DataFrame(hist_rows).sort_values('timestamp')
        if not hist_df.empty:
            display(hist_df[['name','year','n_words','flesch_kincaid_grade']].head())
            hist_df.to_csv(os.path.join(REPORTS_DIR, 'history_metrics.csv'), index=False)
            print('Saved history to reports/history_metrics.csv')
    except Exception as e:
        print('Wayback query failed:', e)
else:
    print('HIST_URL not set; skipping history analysis.')

In [None]:
# Figure 4: Words over time
if 'hist_df' in globals() and not hist_df.empty:
    plt.figure()
    plt.plot(hist_df['year'], hist_df['n_words'], marker='o')
    plt.xlabel('Year')
    plt.ylabel('Words')
    plt.title('Figure 4: Policy Length Over Time')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, 'history_words.png'), dpi=200)
    plt.show()

In [None]:
# Figure 5: Grade level over time
if 'hist_df' in globals() and not hist_df.empty:
    plt.figure()
    plt.plot(hist_df['year'], hist_df['flesch_kincaid_grade'], marker='o')
    plt.xlabel('Year')
    plt.ylabel('Flesch–Kincaid Grade')
    plt.title('Figure 5: Policy Grade Level Over Time')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, 'history_grade.png'), dpi=200)
    plt.show()

## 5) Export a 2–4 page report
We render a Markdown report with key figures and summary values. Convert to PDF/Docx as needed.

In [None]:
from jinja2 import Template

def snapshots_md(df):
    if df is None or df.empty:
        return "No snapshots found."
    cols = ['year','timestamp','n_words','flesch_kincaid_grade']
    df2 = df[cols].copy()
    return df2.to_markdown(index=False)

site_list = list(df.index) if 'df' in globals() and not df.empty else []
historical_site = domain_from_url(HIST_URL) if HIST_URL else (site_list[0] if site_list else "unknown")

summary_cols = ['n_words','n_sents','flesch_kincaid_grade','gunning_fog','smog_index','dale_chall']
summary_cols = [c for c in summary_cols if c in (df.columns if 'df' in globals() else [])]
summary_table_md = (df[summary_cols].to_markdown() if summary_cols else "No metrics computed.")

best_k = int(df['cluster_k'].iloc[0]) if 'df' in globals() and 'cluster_k' in df.columns and not df.empty else 0

template_path = os.path.join(REPORTS_DIR, 'report_template.md')
with open(template_path, 'r') as f:
    tpl = Template(f.read())

report_text = tpl.render(
    run_date=datetime.now().strftime('%Y-%m-%d'),
    historical_site=historical_site,
    key_findings_summary="(Fill after reviewing the figures)",
    site_list=site_list,
    retrieval_method="from local text files and/or direct fetch of policy URLs on the same day",
    summary_table_md=summary_table_md,
    best_k=best_k,
    best_silhouette=float('nan'),
    cluster_interpretation="(e.g., Cluster 0 = shorter/easier; Cluster 1 = longer/harder)",
    hist_len_trend="(e.g., generally increasing)",
    hist_grade_trend="(e.g., stable around grade 12)",
    hist_readability_trend="(e.g., modest decrease in readability after 2018)",
    interpretation="that policy readability varies substantially, and historical revisions often add length without proportional clarity gains",
    repo_link="https://github.com/yourname/privacy-policy-lab",
    snapshot_table_md=snapshots_md(hist_df if 'hist_df' in globals() else None)
)

out_path = os.path.join(REPORTS_DIR, 'lab_report.md')
with open(out_path, 'w') as f:
    f.write(report_text)

print('Wrote report to', out_path)
print('Convert to PDF with pandoc (optional):')
print('  pandoc reports/lab_report.md -o reports/lab_report.pdf')