In [1]:
try:
    import psycopg2
    import psycopg2.extras
    import pandas as pd
    import opendp
    from opendp.transformations import *
    from opendp.measurements import *
    from opendp.mod import enable_features
    enable_features("contrib")
    from opendp.mod import binary_search
    from opendp.accuracy import discrete_laplacian_scale_to_accuracy
    # Import time module, logging, os
    import time
    import logging
    import os
    print("libraries imported")
except ImportError as e:
    print(f"Error importing libraries: {e}")

libraries imported


In [2]:
def postgres_DB(dbname, user, host, password):
    """Open a PostgreSQL connection and a dictionary-style cursor on it.

    Args:
        dbname: Name of the database to connect to.
        user: Database user name.
        host: Database host.
        password: Password for ``user``.

    Returns:
        ``(connection, cursor)`` on success, or ``(None, None)`` when either
        the connection or the cursor could not be created (the error is
        printed, not raised).
    """
    try:
        connection = psycopg2.connect(dbname=dbname, user=user, host=host, password=password)
        try:
            dict_cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
        except psycopg2.Error as cursor_error:
            print(f"Error creating cursor: {cursor_error}")
            # Do not leak the connection when the cursor cannot be created.
            connection.close()
            outcome = (None, None)
        else:
            print("Connected to database, cursor defined")
            outcome = (connection, dict_cursor)
        return outcome
    except psycopg2.Error as connect_error:
        print(f"Error connecting to database: {connect_error}")
        return None, None

In [3]:
def csv_scan(first_file):
    """Read a CSV source into a DataFrame.

    Args:
        first_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame`` (default ``read_csv`` options).
    """
    return pd.read_csv(first_file)

In [4]:
def postgres_scan(table_name, cursor):
    """Load every row of a table into a DataFrame.

    Args:
        table_name: Name of the table to read.
        cursor: An open DB cursor exposing ``execute``, ``description`` and
            ``fetchall``.

    Returns:
        A ``pandas.DataFrame`` whose columns are named after the cursor's
        result description.
    """
    # NOTE(review): table_name is interpolated directly into the SQL text,
    # so only trusted identifiers should be passed here.
    cursor.execute(f"SELECT * FROM {table_name}")

    # The first item of each description entry is the column name.
    headers = [column[0] for column in cursor.description]
    rows = cursor.fetchall()

    return pd.DataFrame(rows, columns=headers)

In [5]:
def filter_df(df, condition):
    """Return the rows of ``df`` matching a ``DataFrame.query`` expression.

    Args:
        df: Source DataFrame (left unmodified).
        condition: Query string, e.g. ``"Age < 26"``.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    return df.query(condition).reset_index(drop=True)

In [6]:
def pad_data(df, filtered_df, track_column):
    """Pad ``filtered_df`` with all-zero dummy rows up to the size of ``df``.

    Args:
        df: The original (unfiltered) DataFrame; only its length is used.
        filtered_df: The filtered DataFrame to pad.
        track_column: Name of the indicator column to add. It is ``1`` for
            rows that came from ``filtered_df`` (real data) and ``0`` for the
            appended dummy rows.

    Returns:
        A new DataFrame with the rows of ``filtered_df`` followed by the
        dummy rows, plus the ``track_column`` indicator.
    """
    real_rows = len(filtered_df)
    # Nothing is appended when filtered_df is not shorter than df.
    dummy_rows = max(len(df) - real_rows, 0)

    zeros_df = pd.DataFrame(0, index=range(dummy_rows), columns=filtered_df.columns)
    df_padded = pd.concat([filtered_df, zeros_df], ignore_index=True)

    # Flag rows by position rather than by content: a genuine row whose
    # values all happen to be 0 must still be counted as real data.
    df_padded[track_column] = [1] * real_rows + [0] * dummy_rows

    return df_padded

In [7]:
#Partially pad so that the data contains k*100 % dummy values
def DP_pad_data(filtered_df, original_df, k, track_column):
    """Pad ``filtered_df`` so that roughly a fraction ``k`` of its rows are dummies.

    The number of dummy rows ``d`` solves ``d / (m + d) = k`` for
    ``m = len(filtered_df)``, i.e. ``d = m * k / (1 - k)`` (truncated to an
    integer). The padded frame never grows beyond ``len(original_df)``; when
    the requested fraction would exceed that, a notice is printed and the
    frame is padded to the original size instead.

    Args:
        filtered_df: The filtered DataFrame to pad.
        original_df: The unfiltered DataFrame; only its length is used.
        k: Target fraction of dummy rows, strictly between 0 and 1.
        track_column: Name of the indicator column to add (``1`` = real row
            from ``filtered_df``, ``0`` = appended dummy row).

    Returns:
        A new DataFrame with the real rows followed by all-zero dummy rows,
        plus the ``track_column`` indicator.

    Raises:
        ValueError: If ``filtered_df`` is not shorter than ``original_df``,
            or if ``k`` is not strictly between 0 and 1.
    """
    min_size = len(filtered_df)
    max_size = len(original_df)
    if min_size >= max_size:
        raise ValueError("filtered_df should have fewer rows than original_df.")
    size_diff = max_size - min_size

    if not 0 < k < 1:
        raise ValueError("The value of k should be between 0 and 1.")

    # Rows needed so that dummies make up k of the padded total.
    n_rows = int((min_size * k) / (1 - k))

    # Largest k that can be honoured without exceeding the original size.
    max_k = size_diff / (min_size + size_diff)

    if n_rows > size_diff:
        print(f"Sorry, padding up to {k*100:.2f}% dummy in this dataset is not applicable, as it will exceed the size of input data.\n"
              f"Your data will be padded to the maximum input size of {max_size}.")
        print(f"Maximum effective k: {max_k:.3f}, beyond this value the data will be padded to the maximum size.\n"
              f"Runtime is now equal to fully private count.")

    # Never pad past the size of the original data.
    pad_rows = min(n_rows, size_diff)

    zeros_df = pd.DataFrame(0, index=range(pad_rows), columns=filtered_df.columns)
    df_padded = pd.concat([filtered_df, zeros_df], ignore_index=True)

    # Flag rows by position rather than by content: a genuine row whose
    # values all happen to be 0 must still be counted as real data.
    df_padded[track_column] = [1] * min_size + [0] * pad_rows

    return df_padded

In [8]:
def join(df, df2, merge_type, column_compare):
    """Merge two DataFrames on a shared key.

    Args:
        df: Left DataFrame.
        df2: Right DataFrame.
        merge_type: Value for the ``how`` argument of ``pd.merge``
            (e.g. ``"inner"``, ``"outer"``).
        column_compare: Column name(s) to join on.

    Returns:
        The merged DataFrame.
    """
    return pd.merge(df, df2, how=merge_type, on=column_compare)

In [9]:
#count the number of real data (not padded) in a padded dataframe -> count that hides execution time
def count_real(padded_df, track_column):
    """Count the rows flagged as real data in a padded DataFrame.

    Args:
        padded_df: DataFrame containing an indicator column.
        track_column: Name of the indicator column; rows equal to ``1`` are
            counted.

    Returns:
        The number of flagged rows as a plain Python ``int``.
    """
    is_real = padded_df[track_column] == 1
    # .item() unwraps the NumPy scalar produced by summing the boolean mask.
    return is_real.sum().item()

In [10]:
def mean_real(df, column):
    """Return the mean of one column of a DataFrame.

    Args:
        df: DataFrame to read from.
        column: Name of the column to average.

    Returns:
        The result of ``df[column].mean()``.

    Raises:
        ValueError: If ``column`` is not one of ``df.columns``.
    """
    if column not in df.columns:
        # The message must use `column`; the previous code referenced an
        # undefined name and raised NameError instead of this ValueError.
        raise ValueError("Column '{}' does not exist in the DataFrame.".format(column))
    return df[column].mean()

In differential privacy, sensitivity, epsilon, and scale are important concepts used in privacy-preserving mechanisms to control the amount of noise added to query results and ensure privacy protection.

Sensitivity:
Sensitivity refers to the maximum possible change in the query result when a single individual's data is added or removed from the dataset. It measures how much impact a single individual can have on the query result. The sensitivity value is specific to each query and dataset.

Epsilon (ε):
Epsilon is a privacy parameter that controls the level of privacy protection provided by a differentially private mechanism. A smaller value of epsilon provides stronger privacy guarantees but may result in noisier query results. On the other hand, a larger value of epsilon provides weaker privacy guarantees but yields less noisy query results.

In theory, epsilon (ε) can be any value greater than 0. In practice, it is common to use values greater than 0 and at most 1; ε = 0 would require the output to be completely independent of the data, so it is never used.

A smaller epsilon value provides stronger privacy guarantees, meaning that the released data is more protected and less likely to reveal sensitive information about individuals in the dataset. On the other hand, a larger epsilon value provides weaker privacy guarantees, meaning that the released data may be more accurate but less private.

The choice of epsilon depends on the specific privacy requirements and the level of privacy protection needed for the application. In general, smaller epsilon values are preferred for scenarios where strong privacy protection is essential, such as medical or financial data. However, larger epsilon values can be used for scenarios where a balance between privacy and data accuracy is acceptable.

It's important to note that as epsilon increases, the amount of noise added to the query result decreases, which means the released data becomes less private. Therefore, the selection of an appropriate epsilon value requires a careful trade-off between privacy and utility (accuracy of the released data).

Scale:
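Scale is a parameter used in the Laplace and Gaussian mechanisms to determine the amount of noise added to the query result. For the Laplace mechanism, the scale is $b = \Delta / \varepsilon$ (sensitivity/epsilon). For the Gaussian mechanism, the scale is the standard deviation $\sigma$, which also depends on the additional privacy parameter $\delta$: the classic calibration (valid for $\varepsilon < 1$) is $\sigma = \Delta_2 \sqrt{2 \ln(1.25/\delta)} / \varepsilon$, where $\Delta_2$ is the L2 sensitivity. A higher scale value means more noise is added, which results in more privacy protection.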

In [11]:
def mechanism_selection():
    """Interactively ask the user which noise mechanism to use.

    Prints a numbered menu and keeps prompting until the input is exactly
    ``"1"``, ``"2"`` or ``"3"``.

    Returns:
        ``"laplace"``, ``"gaussian"`` or ``"random"``.
    """
    menu_lines = (
        "Select a mechanism for adding noise:",
        "1. Laplace Noise",
        "2. Gaussian Noise",
        "3. Random Noise",
    )
    for menu_line in menu_lines:
        print(menu_line)

    mechanism_by_choice = {"1": "laplace", "2": "gaussian", "3": "random"}
    while True:
        picked = input("Enter your choice (1, 2, or 3): ")
        if picked in mechanism_by_choice:
            return mechanism_by_choice[picked]
        print("Invalid choice. Please enter 1, 2, or 3.")

In [12]:
def privacy_parameters():
    """Interactively read the sensitivity and epsilon and derive the noise scale.

    Returns:
        ``(sensitivity, epsilon, scale)`` as floats, where
        ``scale = sensitivity / epsilon``.

    Raises:
        ValueError: If either input is not a number, or if epsilon is not
            strictly positive (a zero epsilon cannot be divided by and a
            negative one would produce a negative scale).
    """
    print("In order to add noise using Laplace, Gaussian or Random mechanism, first determine privacy parameters.")
    print("Enter the sensitivity (e.g., 1): ")
    sensitivity = float(input())
    print("Enter the epsilon (e.g., 0.1): ")
    epsilon = float(input())

    # `not epsilon > 0` also rejects NaN.
    if not epsilon > 0:
        raise ValueError(f"Epsilon must be greater than 0, got {epsilon}.")

    scale = sensitivity / epsilon
    return sensitivity, epsilon, scale

In [13]:
def add_noise(result, mechanism, scale):
    """Add noise to ``result`` using the named mechanism.

    Args:
        result: The true query result to perturb.
        mechanism: ``"laplace"``, ``"gaussian"`` or ``"random"``.
        scale: Noise scale forwarded to the selected mechanism.

    Returns:
        The noisy result produced by ``laplace_noise``, ``gaussian_noise``
        or ``random_noise``.

    Raises:
        ValueError: If ``mechanism`` is not one of the supported names.
    """
    if mechanism == "laplace":
        return laplace_noise(result, scale)
    elif mechanism == "gaussian":
        return gaussian_noise(result, scale)
    elif mechanism == "random":
        return random_noise(result, scale)
    else:
        # List every supported mechanism, including 'random'.
        raise ValueError(
            "Invalid mechanism choice. Supported mechanisms are 'laplace', 'gaussian' and 'random'."
        )

The result is a random draw from the discrete Laplace/Gaussian distribution, centered at the true count of the number of records in the underlying dataset.

In [14]:
# Add Laplace noise for differential privacy
def laplace_noise(result, scale):
    """Add Laplace noise to ``result`` using OpenDP measurements.

    An ``int`` result goes through ``make_base_discrete_laplace`` and a
    ``float`` result through ``make_base_laplace``; the constructed
    measurement is then invoked on ``result``.

    Args:
        result: The true value to perturb; must be an ``int`` or ``float``
            instance.
        scale: Noise scale passed to the OpenDP constructor.

    Returns:
        The value returned by the OpenDP measurement (the noisy result).

    Raises:
        TypeError: If ``result`` is neither an ``int`` nor a ``float``.
    """
    if isinstance(result, int):
        measurement = make_base_discrete_laplace(scale)
    elif isinstance(result, float):
        measurement = make_base_laplace(scale)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise TypeError(
            f"result must be an int or a float, got {type(result).__name__}"
        )
    return measurement(result)

In [15]:
def gaussian_noise(result, scale):
    """Add Gaussian noise to ``result`` using OpenDP measurements.

    An ``int`` result goes through ``make_base_discrete_gaussian`` and a
    ``float`` result through ``make_base_gaussian``; the constructed
    measurement is then invoked on ``result``.

    Args:
        result: The true value to perturb; must be an ``int`` or ``float``
            instance.
        scale: Noise scale passed to the OpenDP constructor.

    Returns:
        The value returned by the OpenDP measurement (the noisy result).

    Raises:
        TypeError: If ``result`` is neither an ``int`` nor a ``float``.
    """
    if isinstance(result, int):
        measurement = make_base_discrete_gaussian(scale)
    elif isinstance(result, float):
        measurement = make_base_gaussian(scale)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise TypeError(
            f"result must be an int or a float, got {type(result).__name__}"
        )
    return measurement(result)

In [16]:
import numpy as np

def random_noise(result, scale):
    """Add NumPy-generated Laplace noise to ``result``.

    Despite the name, the noise is drawn from ``np.random.laplace`` centred
    at 0 with the given ``scale``.

    Args:
        result: The true value to perturb; must be an ``int`` or ``float``
            instance.
        scale: Scale of the Laplace distribution.

    Returns:
        ``int`` input: the noisy value truncated toward zero by ``int()``.
        ``float`` input: the noisy value as a ``float``.

    Raises:
        TypeError: If ``result`` is neither an ``int`` nor a ``float``.
    """
    if isinstance(result, int):
        return int(result + np.random.laplace(loc=0, scale=scale))
    if isinstance(result, float):
        return float(result + np.random.laplace(loc=0, scale=scale))
    # Fail with a clear message instead of an unbound-variable error.
    raise TypeError(
        f"result must be an int or a float, got {type(result).__name__}"
    )

In [17]:
def max_deviation(scale, alpha=None):
    """Convert a discrete-Laplace scale into an accuracy bound.

    Args:
        scale: Noise scale passed to ``discrete_laplacian_scale_to_accuracy``.
        alpha: Value passed as ``alpha``; when ``None`` the user is prompted
            for it on standard input.

    Returns:
        Whatever ``discrete_laplacian_scale_to_accuracy`` returns for the
        given ``scale`` and ``alpha``.
    """
    chosen_alpha = alpha
    if chosen_alpha is None:
        chosen_alpha = float(input("Enter the alpha (e.g., 0.05): "))

    return discrete_laplacian_scale_to_accuracy(scale=scale, alpha=chosen_alpha)

In [18]:
def read_log_file(log_file_path):
    """Return the text of a log file, or a message describing why it could not be read.

    Args:
        log_file_path: Path of the log file.

    Returns:
        The file contents on success; otherwise a human-readable string
        saying the file was not found or that reading failed. No exception
        is propagated.
    """
    print("Trying to read log file:", log_file_path)
    try:
        if not os.path.exists(log_file_path):
            return f"Log file '{log_file_path}' not found."
        with open(log_file_path, 'r') as handle:
            return handle.read()
    except Exception as read_error:
        return f"Error occurred while reading the log file: {read_error}"

In [19]:
import gspread
from gspread_dataframe import set_with_dataframe

def parse_log_content(log_content):
    """Parse timing log text into a list of unique records.

    Each usable line looks like
    ``<timestamp> - <event> <Datasize=<size>,k=<k>>: <value> ms`` (or
    ``<value> seconds``, which is converted to milliseconds). Blank lines
    are ignored; lines that do not fit the format and repeated entries are
    skipped with a printed warning.

    Args:
        log_content: The whole log as a single string.

    Returns:
        A list of dicts with keys ``"Timestamp"``, ``"Event"``, ``"k"``,
        ``"Duration"`` (milliseconds) and ``"Datasize"``, in order of first
        appearance.
    """
    records = []
    seen_keys = set()

    for line in log_content.split("\n"):
        # Blank lines are dropped silently.
        if not line.strip():
            continue
        # A line without any dash cannot contain the timestamp separator.
        if '-' not in line:
            print(f"Warning: Skipping line due to invalid format: {line}")
            continue

        try:
            # Timestamp and the remainder are separated by the first " - ".
            raw_timestamp, dash, remainder = line.partition(" - ")
            if not dash:
                raise ValueError(f"Invalid log format in line: {line}")
            timestamp = raw_timestamp.strip()

            # The event description ends at the first ": "; the rest is the duration.
            event, _, raw_duration = remainder.partition(": ")
            duration_str = raw_duration.strip()
            if not duration_str:
                raise ValueError(f"Invalid log format in line: {line}", "missing duration string")

            # Durations are normalised to milliseconds.
            if 'ms' in duration_str:
                duration = float(duration_str.split(" ms")[0])
            elif 'seconds' in duration_str:
                duration = float(duration_str.split(" seconds")[0]) * 1000
            else:
                raise ValueError(f"Invalid duration format in line: {line}", "missing time")

            # k is the text between "k=" and the next ">".
            if "k=" not in event:
                raise ValueError(f"Invalid 'k' value in line: {line}")
            k = float(event.split("k=")[1].split(">")[0])

            # The data size is the text between "<Datasize=" and the next ",".
            if "<Datasize=" not in event:
                raise ValueError(f"Invalid 'Data Size' value in line: {line}")
            data_size = event.split("<Datasize=")[1].split(",")[0]

            entry_key = (timestamp, event, k, duration, data_size)
            if entry_key in seen_keys:
                print(f"Warning: Skipping duplicate entry: {line}")
            else:
                seen_keys.add(entry_key)
                records.append({"Timestamp": timestamp, "Event": event, "k": k, "Duration": duration, "Datasize": data_size})

        except ValueError:
            # Reached for malformed lines, including the dashed separator lines.
            print(f"Warning for parse_log_content: skip line invalid form: {line}")

    return records


In [20]:
import json
import warnings
import re
def write_log_GS(parsed_data):
    """Replace the contents of the timing Google Sheet with ``parsed_data``.

    The worksheet ``Sheet1`` of the spreadsheet ``Code Execution Time Log``
    is cleared, a header row built from the keys of the first record is
    appended, and all record values are written starting at cell A2.
    Failures are printed with a traceback rather than raised.

    Args:
        parsed_data: List of record dicts (as produced by
            ``parse_log_content``). When empty, a message is printed and
            nothing else happens.

    Returns:
        None.
    """
    if not parsed_data:
        print("No valid log lines were found.")
        return
    try:
        # NOTE(review): the service-account key file name is hard-coded and
        # resolved relative to the current working directory.
        client = gspread.service_account(filename='verse-project-timesheet-f4792514dd22.json')
        spreadsheet = client.open('Code Execution Time Log')
        target_sheet = spreadsheet.worksheet('Sheet1')

        # Header row comes from the first record's keys.
        header = list(parsed_data[0].keys())

        # Start from an empty sheet, then put the header on the first row.
        target_sheet.clear()
        target_sheet.append_row(header)

        # One list of cell values per record.
        rows = [list(record.values()) for record in parsed_data]

        # Data rows go below the header; warnings raised by the update call are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            target_sheet.update('A2', rows, value_input_option='USER_ENTERED')

        print("Log data has been written to the Google Sheet.")
    except Exception as sheet_error:
        print(f"An error occurred while writing data to Google Sheet: {sheet_error}")
        print("Error details:")
        import traceback
        traceback.print_exc()


In [21]:
def log_execution_time(event_data, data_size):
    """Log one INFO line per timed event.

    Args:
        event_data: Iterable of ``(event, k, time_ms)`` tuples.
        data_size: Label of the dataset the timings belong to.

    Each line has the form
    ``<event> <Datasize=<data_size>,k=<k>>: <time_ms with 3 decimals> ms``.
    """
    for entry in event_data:
        event, k, time_ms = entry
        message = f'{event} <Datasize={data_size},k={k}>: {time_ms:.3f} ms'
        logging.info(message)
        

In [22]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

def read_data_from_google_sheets(sheet_name, worksheet_name):
    """Fetch all records of a worksheet from Google Sheets.

    Args:
        sheet_name: Title of the spreadsheet to open.
        worksheet_name: Title of the worksheet inside that spreadsheet.

    Returns:
        The value of ``worksheet.get_all_records()`` on success, or ``None``
        if anything fails (the error is printed, not raised).
    """
    try:
        # NOTE(review): the service-account key file name is hard-coded and
        # resolved relative to the current working directory.
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        credentials = ServiceAccountCredentials.from_json_keyfile_name('verse-project-timesheet-f4792514dd22.json', scope)
        gs_client = gspread.authorize(credentials)

        # Spreadsheet -> worksheet -> all of its records.
        target_sheet = gs_client.open(sheet_name).worksheet(worksheet_name)
        return target_sheet.get_all_records()
    except Exception as sheet_error:
        print(f"An error occurred while reading data from Google Sheet: {sheet_error}")
        return None


In [23]:
# Labels of the benchmark datasets; main() reads "<label>.csv" for each entry.
data_sizes = ['1KB', '500KB', '1MB', '500MB', '1GB']

In [24]:
def main(data_sizes, k=0.25):
    """Benchmark the scan/filter/pad/count pipeline for each dataset and publish the timings.

    For every label in ``data_sizes`` the file ``<label>.csv`` is loaded,
    filtered on ``Age < 26``, padded (fully and with a ``k`` fraction of
    dummy rows) and counted. Each stage is timed in milliseconds and logged
    to ``code_execution.log`` in the current working directory. After each
    dataset the whole log is re-read, parsed, written to the Google Sheet
    and read back for display.

    Args:
        data_sizes: Iterable of dataset labels; each must have a matching
            ``<label>.csv`` file.
        k: Fraction of dummy rows used for the "DP Private" measurement.
            Defaults to 0.25, the value that was previously hard-coded.

    Any exception is caught, printed with its traceback, and ends the run.
    """
    try:
        # Initialize the log file
        log_file_path = os.path.join(os.getcwd(), 'code_execution.log')
        logging.basicConfig(filename=log_file_path, level=logging.INFO,
                            format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        print("""Access Google Sheet at this link:
https://docs.google.com/spreadsheets/d/12uLBDyk8io9BvxWPqtluN-K1pmnGekNdFz2TmgqoaDM/edit?usp=sharing""")

        for data_size in data_sizes:
            # All stages are timed with time.perf_counter(): it is monotonic
            # and has the highest available resolution, which matters for
            # the sub-millisecond stages below.

            # Scan data
            start_scan = time.perf_counter()
            df = csv_scan(f"{data_size}.csv")
            end_scan = time.perf_counter()
            scan_time = (end_scan - start_scan) * 10**3

            # Filter data
            start_filter = time.perf_counter()
            filtered_column = 'Age'
            condition = f"{filtered_column} < 26"
            young_df = filter_df(df, condition)
            end_filter = time.perf_counter()
            filter_time = (end_filter - start_filter) * 10**3

            # Pad data (up to the full size of the input)
            start_pad = time.perf_counter()
            young_padded = pad_data(df, young_df, "dummy")
            end_pad = time.perf_counter()
            pad_time = (end_pad - start_pad) * 10**3

            # Fully Public count: plain row count of the filtered data
            start_public = time.perf_counter()
            young_count = young_df.shape[0]
            end_public = time.perf_counter()
            public_time = (end_public - start_public) * 10**3

            # Fully Private count: count over the fully padded data
            start_priv = time.perf_counter()
            real_count = count_real(young_padded, 'dummy')
            end_priv = time.perf_counter()
            full_priv_time = (end_priv - start_priv) * 10**3

            # DP Private: only the count is timed; the partial padding
            # itself happens before the timer starts.
            dp_padded = DP_pad_data(young_df, df, k, "dummy")
            start_dp = time.perf_counter()
            dp_padded_count = count_real(dp_padded, 'dummy')
            end_dp = time.perf_counter()
            dp_padded_time = (end_dp - start_dp) * 10**3

            print("DATASIZE:", data_size)
            print("Dummy Percentage <for DP Private only>:", k*100)

            # (event name, k, duration in ms) for each stage
            events = [
                ('Scan', k, scan_time),
                ('Filter', k, filter_time),
                ('Pad', k, pad_time),
                ('Fully Public', k, public_time),
                ('Fully Private', k, full_priv_time),
                ('DP Private', k, dp_padded_time)
            ]

            # Log the timings, followed by a dashed separator line
            log_execution_time(events, data_size)
            logging.info('-' * 38)

            # Re-read the whole log and turn it into structured records
            log_content = read_log_file(log_file_path)
            parsed_data = parse_log_content(log_content)

            # Publish the records to the Google Sheet
            write_log_GS(parsed_data)

            # Read the sheet back to confirm what was stored
            data = read_data_from_google_sheets("Code Execution Time Log", "Sheet1")
            if data:
                print("Data read from Google Sheet\n",data)
                print()
            else:
                print("Failed to read data from Google Sheet.")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Error details:")
        import traceback
        traceback.print_exc()
            
    #'''
            
    
       
# Entry point: run the benchmark over every label in data_sizes.
if __name__ == "__main__":
    main(data_sizes)

Access Google Sheet at this link:
https://docs.google.com/spreadsheets/d/12uLBDyk8io9BvxWPqtluN-K1pmnGekNdFz2TmgqoaDM/edit?usp=sharing
DATASIZE: 1KB
Dummy Percentage <for DP Private only>: 25.0
Trying to read log file: /Users/anhpham/Projects/verse_project/code_execution.log
Log data has been written to the Google Sheet.
Data read from Google Sheet
 [{'Timestamp': '2023-08-01 0:08:35', 'Event': 'Scan <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 2.574, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Filter <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 1.381, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Pad <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.817, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Fully Public <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.001, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Fully Private <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.15, 'Datasize': '1KB'}, {'Tim

DATASIZE: 1GB
Dummy Percentage <for DP Private only>: 25.0
Trying to read log file: /Users/anhpham/Projects/verse_project/code_execution.log
Log data has been written to the Google Sheet.
Data read from Google Sheet
 [{'Timestamp': '2023-08-01 0:08:35', 'Event': 'Scan <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 2.574, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Filter <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 1.381, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Pad <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.817, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Fully Public <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.001, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'Fully Private <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.15, 'Datasize': '1KB'}, {'Timestamp': '2023-08-01 0:08:35', 'Event': 'DP Private <Datasize=1KB,k=0.25>', 'k': 0.25, 'Duration': 0.121, 'Datasize': '1KB'}, {'Timesta

In [25]:

#write some small program to increase file size
#to get data points for the x-bar


#scan, filter, join, count
#extract 
#run individually
#just scan, just joint
#if 25% 50% percent dummy then how the performance change
#setting 25% dummy


#mechanism - getscale
#check DP pad
#about logging, check if my log is checking each stage? is that what we re trying to do
#try putting log in google sheet
#visualization

#figuring out how to extract datasize to become data points on x-axis
#formulate problem, we need an extra column for datasize, how do we write this datasize column in the function
#but handle cases when this datasize is not applicable as well


#commenting the example code
#substitute