In [22]:
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
from datetime import datetime
import os
from tqdm import tqdm
import shutil


### Function to extract pixel IDs from a given page

In [4]:
def process_snapshot(snapshot_path):
    pixel_ids = set()

    with open(snapshot_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Pattern 1: Extract IDs from script src pattern
    script_pattern = r'<script src="https://connect\.facebook\.net/signals/config/(\d+)'
    ids_from_script = re.findall(script_pattern, content)
    pixel_ids.update(ids_from_script)

    # Pattern 2: Extract IDs from fbq("init", ...) pattern
    fbq_pattern = r'fbq\("init","(\d+)"\);'
    ids_from_fbq = re.findall(fbq_pattern, content)
    pixel_ids.update(ids_from_fbq)

    # Return the list of unique IDs
    return list(pixel_ids)


### Function to create a dataframe where columns months from 2024-09 till 2019-09, and each row corresponds to a website while an entry stores a list of pixel IDs for that month in the website. 

Note: An entry being None means the snapshot for that month didn't exist, and the entry being an empty list means the snapshot did exist but there was no pixel ID present.

In [30]:
def create_dataframe(base_folder):
    # Generate all months from Sept 2024 to Sept 2019
    months = pd.date_range(start="2019-09-01", end="2024-09-01", freq='MS').strftime("%Y%m").tolist()[::-1]
    data = []
    websites = [website for website in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, website))]
    for website in tqdm(websites, desc="Processing websites", unit="website"):
        website_path = os.path.join(base_folder, website)
        
        # Create a dictionary to store the results for this website
        results = {"website": website}
        
        # Initialize each month with None for this website
        for month in months:
            results[month] = None
        
        for snapshot in os.listdir(website_path):
            snapshot_timestamp = snapshot[:6]  # Extract the year and month from the filename
            snapshot_path = os.path.join(website_path, snapshot)

            # If the snapshot timestamp matches one of the months, process it
            if snapshot_timestamp in results:
                result = process_snapshot(snapshot_path)
                results[snapshot_timestamp] = result if result else []

        data.append(results)

    df = pd.DataFrame(data)
    df.to_csv("pixelHistory.csv")

    return df

base_folder = "/home/abdullah/dump/Documents/LUMSU/Other/drp2.0/Tracking-Project/Results/final/websites" #configure this to where all downloaded snapshots are
df = create_dataframe(base_folder)
print(df)


Processing websites:   0%|          | 0/1563 [00:00<?, ?website/s]

Processing websites: 100%|██████████| 1563/1563 [02:51<00:00,  9.10website/s]

            website              202409              202408  \
0         unpkg.com                  []                  []   
1      facebook.net                None                None   
2     amazonaws.com                  []                  []   
3     google.com.my                  []                  []   
4      nicovideo.jp   [351824429553763]                  []   
...             ...                 ...                 ...   
1558       daum.net                  []                  []   
1559    spamcop.net                  []                  []   
1560    knowbe4.com                  []                  []   
1561     carfax.com  [1797990817173863]  [1797990817173863]   
1562  storygize.net                  []                None   

                  202407              202406              202405  \
0                     []                  []                  []   
1                   None                None                None   
2                     []               




In [24]:
folder1 = '/home/abdullah/dump/Documents/LUMSU/Other/drp2.0/Tracking-Project/Results/top10k-snapshots/2-noori'
folder2 = '/home/abdullah/dump/Documents/LUMSU/Other/drp2.0/Tracking-Project/Results/new/top10k-snapshots/2-noori'
output_folder = '/home/abdullah/dump/Documents/LUMSU/Other/drp2.0/Tracking-Project/Results/final/1-maula'

In [25]:
def merge_folders(folder1, folder2, output_folder):
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Traverse both folders
    for folder in [folder1, folder2]:
        for website in os.listdir(folder):
            website_folder = os.path.join(folder, website)
            if os.path.isdir(website_folder):
                # Create the corresponding website folder in the output if it doesn't exist
                output_website_folder = os.path.join(output_folder, website)
                if not os.path.exists(output_website_folder):
                    os.makedirs(output_website_folder)
                
                # Copy all unique snapshots from both folders to the output folder
                for snapshot in os.listdir(website_folder):
                    snapshot_path = os.path.join(website_folder, snapshot)
                    output_snapshot_path = os.path.join(output_website_folder, snapshot)
                    if not os.path.exists(output_snapshot_path):
                        shutil.copy2(snapshot_path, output_snapshot_path)

merge_folders(folder1, folder2, output_folder)


In [29]:
base_folder = "/home/abdullah/dump/Documents/LUMSU/Other/drp2.0/Tracking-Project/Results/final/websites" #configure this to where all downloaded snapshots are

1563

In [31]:
pd.read_csv('pixelHistory.csv')

Unnamed: 0.1,Unnamed: 0,website,202409,202408,202407,202406,202405,202404,202403,202402,...,202006,202005,202004,202003,202002,202001,201912,201911,201910,201909
0,0,unpkg.com,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1,1,facebook.net,,,,,,,,,...,,,,,,,,,,
2,2,amazonaws.com,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
3,3,google.com.my,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
4,4,nicovideo.jp,['351824429553763'],[],[],['351824429553763'],['351824429553763'],['351824429553763'],['351824429553763'],['351824429553763'],...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558,1558,daum.net,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1559,1559,spamcop.net,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
1560,1560,knowbe4.com,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],
1561,1561,carfax.com,['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],[],[],['1797990817173863'],...,['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863'],['1797990817173863']
