In [1]:
import regex as re
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loading Base Code

In [2]:
def readCode(file_name):
    """Return the text of ``file_name`` with every space character removed.

    Only the space character (' ') is stripped; tabs and line breaks are kept
    as they appear in the file.

    Parameters
    ----------
    file_name : str
        Path of the file to read (opened in text mode).

    Returns
    -------
    str
        The file contents without any spaces.
    """
    with open(file_name, 'r') as source:
        # Strip spaces line by line and stitch the pieces back together.
        return ''.join(raw_line.replace(' ', '') for raw_line in source)

### Loading and Cleaning pixel code

In [3]:
def extractConfigurationCode(file_name):
    """Extract the first ``fbq.registerPlugin`` section from a file.

    The returned text starts at the first occurrence of
    ``fbq.registerPlugin`` and runs up to (not including) whichever comes
    first: the next ``fbq.registerPlugin``, the next ``/*``, or the end of
    the file.

    Parameters
    ----------
    file_name : str
        Path of the file to read (opened in text mode).

    Returns
    -------
    str
        The extracted section, beginning with ``fbq.registerPlugin``.

    Raises
    ------
    IndexError
        If the file does not contain ``fbq.registerPlugin`` at all.
    """
    marker = "fbq.registerPlugin"

    with open(file_name, 'r') as source:
        pixel_code = source.read()

    # Piece [1] is the text between the first and second marker occurrences;
    # indexing raises IndexError when the marker never appears.
    after_marker = pixel_code.split(marker)[1]

    # Keep only what precedes the first block-comment opener, if there is one.
    section_body = after_marker.partition('/*')[0]
    return marker + section_body

### Script to visualize differences between any two files

In [6]:
import difflib

# Compare the configuration sections of two files: write a side-by-side HTML
# report and print the lines that were added / deleted.
code1 = extractConfigurationCode('temp1.js')
code2 = extractConfigurationCode('temp3.js')

# To diff whole files (with spaces stripped) instead, load them with
# readCode(...) in place of extractConfigurationCode(...).

# Split the code into lines
code1_lines = code1.strip().splitlines()
code2_lines = code2.strip().splitlines()

# Create an instance of HtmlDiff
html_diff = difflib.HtmlDiff(wrapcolumn=100)

# Generate a complete HTML page containing the side-by-side diff table
html_result = html_diff.make_file(
    code1_lines,
    code2_lines,
    context=True,  # Show only the differences with context
    numlines=2     # Number of context lines to show
)

# Custom HTML styling to enhance visual appeal
custom_css = """
<style>
    body {
        font-family: Arial, sans-serif;
        line-height: 1.6;
        background-color: #f4f4f4;
        margin: 0;
        padding: 20px;
    }
    table.diff {
        width: 100%;
        border-collapse: collapse;
        margin: 20px 0;
        font-size: 16px;
    }
    table.diff th {
        background-color: #2c3e50;
        color: white;
        padding: 10px;
        text-align: left;
    }
    table.diff td {
        padding: 10px;
        vertical-align: top;
        border-bottom: 1px solid #ddd;
    }
    .diff_header {
        background-color: #34495e;
        color: white;
    }
    .diff_next {
        background-color: #f39c12;
    }
    .diff_add {
        background-color: #2ecc71;
        color: white;
    }
    .diff_sub {
        background-color: #e74c3c;
        color: white;
    }
    .diff_chg {
        background-color: #3498db;
        color: white;
    }
</style>
"""

# Embed the custom CSS into the HTML result.
# custom_css carries its own <style>...</style> wrapper, so it must be added
# as a separate element: splicing it inside difflib's own <style> tag nests
# the tags and the inner </style> ends the outer element early. Placing it
# right before </head> also puts it after the default styles, so it wins.
html_result = html_result.replace('</head>', custom_css + '</head>', 1)

# Save the result to an HTML file.
# make_file() declares charset=utf-8 in the page by default, so write the
# file with that same encoding rather than the platform default.
output_path = 'diff_1_3.html'
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(html_result)

# Report the file that was actually written.
print("Differences saved to", output_path)

# Collect added and deleted parts
diff = difflib.ndiff(code1_lines, code2_lines)
added_parts = []
deleted_parts = []

for line in diff:
    # ndiff prefixes: '+ ' = only in code2, '- ' = only in code1;
    # unchanged ('  ') and hint ('? ') lines are ignored.
    if line.startswith('+ '):
        added_parts.append(line[2:])
    elif line.startswith('- '):
        deleted_parts.append(line[2:])

print("Added parts:")
print("\n".join(added_parts))

print("\nDeleted parts:")
print("\n".join(deleted_parts))


Differences saved to diff_successive.html
Added parts:
fbq.registerPlugin("1752529721656045", {__fbEventsPlugin: 1, plugin: function(fbq, instance, config) { fbq.loadPlugin("iwlbootstrapper");
config.set("1752529721656045", "cookie", {"fbcParamsConfig":{"params":[{"prefix":"","query":"fbclid","ebp_path":"clickID"},{"prefix":"aem","query":"aem","ebp_path":"aem"}]},"enableFbcParamSplit":false});
config.set(null, "batching", {"batchWaitTimeMs":20,"maxBatchSize":10});
config.set("1752529721656045", "unwantedData", {"blacklisted_keys":{"PageView":{"cd":[],"url":["query","W39DNdOb","DObMSCu","7VcXUDOB","cuXlDOB","odQbdOb","euWI2dob","reF2dOB"]}},"sensitive_keys":{"PageView":{"cd":["2ca69efd4ea5af91a637f19ba0bab8b081d2c03773c4a72fcbf8817c856b33ef","de5bcbf26f0c337a356869fc94ab56cd6ce51162a961e14277f75925c1c8ad2a"],"url":["d6d198dd68bbff3e3fef3ae8aa7c4d9608c0b13cc99e15077b82dfa3233be364","3ff6c05723bb069d19953340320fa9512f0be584742703e60226ded28bb43861","7dfc00598060b4f5264a9219bdbe5126309c87e

## Analyzing Changes Over Time (i.e., parts added and deleted between successive versions)

In [21]:
import pandas as pd
import difflib
import os

# Compare each archived snapshot with the next one and record, per pair, the
# extracted configuration code and the lines added / deleted between them.

# Directory containing the HTML files
folder_path = 'riteaid_archived_versions'

# All .html files in the directory, sorted by file name. This is chronological
# only if the names are fixed-width timestamps — TODO confirm.
file_list = sorted([f for f in os.listdir(folder_path) if f.endswith('.html')])

# Column order of the resulting table / CSV
columns = ['Timestamp1', 'Timestamp2', 'Code1', 'Code2', 'Added Parts', 'Deleted Parts']

# List to accumulate rows before creating DataFrame
rows = []

# Iterate over the files successively (each file paired with its successor)
for file1, file2 in zip(file_list, file_list[1:]):
    # Get timestamps: the part of the file name before the first '.'
    timestamp1 = file1.split('.')[0]
    timestamp2 = file2.split('.')[0]

    # Extract code
    code1 = extractConfigurationCode(os.path.join(folder_path, file1))
    code2 = extractConfigurationCode(os.path.join(folder_path, file2))

    # Split the code into lines
    code1_lines = code1.strip().splitlines()
    code2_lines = code2.strip().splitlines()

    # Collect added and deleted parts
    diff = difflib.ndiff(code1_lines, code2_lines)
    added_parts = []
    deleted_parts = []

    for line in diff:
        # ndiff prefixes: '+ ' = only in code2, '- ' = only in code1
        if line.startswith('+ '):
            added_parts.append(line[2:])
        elif line.startswith('- '):
            deleted_parts.append(line[2:])

    # Store results in rows list
    rows.append({
        'Timestamp1': timestamp1,
        'Timestamp2': timestamp2,
        'Code1': code1,
        'Code2': code2,
        'Added Parts': '\n'.join(added_parts),
        'Deleted Parts': '\n'.join(deleted_parts)
    })

# Convert list of rows to DataFrame (built once, after the loop)
df = pd.DataFrame(rows, columns=columns)

# Save the DataFrame to a CSV file
df.to_csv('code_comparisons.csv', index=False)

print("Comparison results saved to code_comparisons.csv")


Comparison results saved to code_comparisons.csv
