# Caso 3 - Navegación web

## Importación de librerías

In [1]:
import glob
import hashlib
import os
import zipfile
from urllib.parse import urlparse

import kagglehub
import numpy as np
import pandas as pd

## Carga del conjunto de datos

In [2]:
# Download the latest version of the dataset; the call returns the local
# path to the downloaded files.
path = kagglehub.dataset_download("shawon10/browser-history")
print("Path to dataset files:", path)

# If the returned path is a directory, search it directly; otherwise treat it
# as a zip archive and extract it next to the archive before searching.
if os.path.isdir(path):
    data_dir = path
else:
    data_dir = os.path.dirname(path)
    with zipfile.ZipFile(path) as archive:
        archive.extractall(data_dir)

# glob returns matches in filesystem-dependent order; sort them so that
# csv_files[0] is the same file on every machine and on every run.
csv_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
print("Found CSV files:", csv_files)

# Fail with an explicit message instead of an IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir!r}")

# Load the first CSV into a DataFrame and give its columns descriptive names.
df = pd.read_csv(csv_files[0])
df.columns = [
    "url",
    "first_visit_time",
    "last_visit_time",
    "click_counts",
    "frequency",
]
df.head()

Path to dataset files: C:\Users\PC\.cache\kagglehub\datasets\shawon10\browser-history\versions\1
Found CSV files: ['C:\\Users\\PC\\.cache\\kagglehub\\datasets\\shawon10\\browser-history\\versions\\1\\TestingHistory.csv', 'C:\\Users\\PC\\.cache\\kagglehub\\datasets\\shawon10\\browser-history\\versions\\1\\TrainingHistory.csv']


Unnamed: 0,url,first_visit_time,last_visit_time,click_counts,frequency
0,https://drive.google.com/drive/my-drive,1511916240,1520808300,28,30912
1,http://localhost/phpmyadmin/,1519419120,1520778600,16,13232
2,https://www.youtube.com/,1515287460,1520799060,22,12375
3,https://console.starter-ca-central-1.openshift...,1511746560,1520777880,22,9830
4,https://console.starter-ca-central-1.openshift...,1509489720,1520043900,20,9570


### Proceso de anonimización

In [3]:
# --- 1. AGGREGATION AND SUPPRESSION  /  2. PERTURBATION ---
# Built as a single chain starting from `df`; assign() and drop() each return
# a new DataFrame, so `df` itself is not modified.
navhis_df = df.assign(
    # Temporal aggregation: keep only the difference between the last and
    # the first visit timestamps.
    time_on_page=df["last_visit_time"].sub(df["first_visit_time"]),
    # Perturbation by precision reduction: floor each value down to a
    # multiple of 10.
    frequency=df["frequency"].floordiv(10).mul(10),
).drop(
    # Suppression: the raw timestamps are removed once the difference exists.
    columns=["first_visit_time", "last_visit_time"]
)

# --- 3. GENERALIZATION ---
# URL generalization: keep only the scheme and the domain.
def extract_domain(url):
    """Reduce a URL to ``scheme://host[:port]``, discarding everything else.

    The path, query string, fragment and any ``user:password@`` credentials
    embedded in the URL are removed.

    Args:
        url: The URL to generalize. Non-string values (e.g. ``None`` or the
            NaN pandas uses for missing cells) are returned unchanged.

    Returns:
        ``"scheme://host[:port]"`` when the URL has a scheme. For scheme-less
        input such as ``"www.example.com/path"`` only ``"host[:port]"`` is
        returned, since there is no scheme to report. An empty string yields
        an empty string.
    """
    # Missing values cannot be parsed; leave them as they are.
    if not isinstance(url, str):
        return url

    parsed = urlparse(url)
    scheme, netloc = parsed.scheme, parsed.netloc

    if not scheme and not netloc:
        # Without a leading "//" urlparse puts the host into the path
        # component; re-parse as a network-path reference to recover it.
        netloc = urlparse(f"//{url}").netloc
        return netloc.rpartition("@")[2]

    # Drop any "user:password@" prefix so credentials never leak through.
    return f"{scheme}://{netloc.rpartition('@')[2]}"

# Run extract_domain over every entry of the url column, element by element.
navhis_df["url"] = navhis_df["url"].map(extract_domain)

navhis_df.head()

Unnamed: 0,url,click_counts,frequency,time_on_page
0,https://drive.google.com,28,30910,8892060
1,http://localhost,16,13230,1359480
2,https://www.youtube.com,22,12370,5511600
3,https://console.starter-ca-central-1.openshift...,22,9830,9031320
4,https://console.starter-ca-central-1.openshift...,20,9570,10554180


### Comparación de ambos df 

In [4]:
# First 10 rows of the original frame, shown for comparison.
df.head(10)

Unnamed: 0,url,first_visit_time,last_visit_time,click_counts,frequency
0,https://drive.google.com/drive/my-drive,1511916240,1520808300,28,30912
1,http://localhost/phpmyadmin/,1519419120,1520778600,16,13232
2,https://www.youtube.com/,1515287460,1520799060,22,12375
3,https://console.starter-ca-central-1.openshift...,1511746560,1520777880,22,9830
4,https://console.starter-ca-central-1.openshift...,1509489720,1520043900,20,9570
5,http://daler.org/,1520027280,1520780760,5,6270
6,http://ruetoj-ruetoj.193b.starter-ca-central-1...,1517408760,1520783100,11,5955
7,http://ruetoc-ruetoc.193b.starter-ca-central-1...,1508093880,1520340120,22,5898
8,https://github.com/shawon100,1516319760,1520544900,24,5539
9,https://myaccount.laxmicoin.com/register?ref=l...,1520364660,1520422500,4,5513


In [5]:
# First 10 rows of navhis_df, shown for comparison.
navhis_df.head(10)

Unnamed: 0,url,click_counts,frequency,time_on_page
0,https://drive.google.com,28,30910,8892060
1,http://localhost,16,13230,1359480
2,https://www.youtube.com,22,12370,5511600
3,https://console.starter-ca-central-1.openshift...,22,9830,9031320
4,https://console.starter-ca-central-1.openshift...,20,9570,10554180
5,http://daler.org,5,6270,753480
6,http://ruetoj-ruetoj.193b.starter-ca-central-1...,11,5950,3374340
7,http://ruetoc-ruetoc.193b.starter-ca-central-1...,22,5890,12246240
8,https://github.com,24,5530,4225140
9,https://myaccount.laxmicoin.com,4,5510,57840
