In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import numpy as np
import sys
import os
import re

# Used to accelerate plotting DAMON figures.
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

from matplotlib.colors import LogNorm, hsv_to_rgb

def find_region_id(row, df2):
    #print(row)
    #time = row['time']
    addr = row['PageFrame']
    matches = df2[
            (df2['start'] <= addr) &
            (df2['end'] >= addr)
            #(df2['start_addr'] <= addr) &
            #(df2['end_addr'] >= addr)
            ]
    if not matches.empty:
        return matches.iloc[0]['rno'].astype(int)  # if multiple matches, take the first
    else:
        #print("Failed! time {} addr {}".format(time,addr))
        #exit()
        return None

# Prepare a df for given PEBS sample file
def prepare_pebs_df(file):
    # Read the file line by line
    with open(file) as f:
        rows = [line.strip().split() for line in f if line.strip()]

    # Find the maximum number of columns in any row
    max_cols = max(len(row) for row in rows)

    # Pad each row so all have the same length
    #padded_rows = [row + [np.nan]*(max_cols - len(row)) for row in rows]

    # Function to pad each row with the last recorded value
    def pad_row(row, target_length):
        if len(row) < target_length:
            last_value = row[-1]
            # Extend the row with the last_value until it reaches the target length
            row = row + [last_value] * (target_length - len(row))
        return row

    # Pad each row accordingly
    padded_rows = [pad_row(row, max_cols) for row in rows]

    # Create a DataFrame
    df = pd.DataFrame(padded_rows)

    # Rename columns: first column as 'PageFrame' and remaining as 'Epoch1', 'Epoch2', ...
    df.rename(columns={0: "PageFrame"}, inplace=True)
    df.columns = ["PageFrame"] + [f"Epoch_{i}" for i in range(1, max_cols)]

    df["PageFrame"] = df["PageFrame"].apply(lambda x: hex(int(x, 16) << 21))

    # Convert epoch columns to numeric
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col])


    # Set PageFrame as index for easier time-series operations
    df.set_index("PageFrame", inplace=True)

    df = df.copy() # Improves performance? df is sparse otherwise

    # Compute the deltas across epochs
    delta_df = df.diff(axis=1)

    # For the first epoch, fill NaN with the original epoch value
    first_epoch = df.columns[0]
    delta_df[first_epoch] = df[first_epoch]

    # Reorder columns to ensure the first epoch is first
    delta_df = delta_df[df.columns]

    # Optional: Convert column names to a numeric index if desired
    # For plotting purposes, we can remove the 'Epoch_' prefix and convert to int
    delta_df.columns = [int(col.replace("Epoch_", ""))*0.5 for col in delta_df.columns]

    # If we want to use plt instead of sns, melt df into long form
    df_long = (
        delta_df
        .reset_index()
        .melt(id_vars=["PageFrame"], var_name="epoch", value_name="value")
    )
    df_long["PageFrame"] = df_long["PageFrame"].apply(lambda x: int(x,16))
    return df_long

    return delta_df

def get_reuse_distance_df(df):
    df_zero_streak_sorted = df.sort_values(by=['PageFrame', 'epoch']).reset_index(drop=True)
    
    # Container for results
    results = []
    
    # Group by PageFrame
    for pf, group in df_zero_streak_sorted.groupby('PageFrame'):
        # Mark where value == 0
        zero_mask = group['value'] == 0
    
        # Identify start of new streaks using the change in zero_mask
        streak_id = (zero_mask != zero_mask.shift()).cumsum()
    
        # For value == 0 streaks only, compute their lengths
        zero_streaks = group[zero_mask].groupby(streak_id).size()
    
        # Get the max streak length (0 if none)
        max_streak = zero_streaks.max() if not zero_streaks.empty else 0
    
        results.append({'PageFrame': pf, 'reuse_distance': max_streak})
    
    # Create a new dataframe
    streak_df = pd.DataFrame(results)
    return streak_df

def calculate_duty_cycle(df):
    # Calculate Duty Cycle
    non_zero_df = df[df['value'] != 0]
    counts = non_zero_df.groupby('PageFrame').size()
    counts.name = 'duty_cycle'
    df = df.merge(counts, on='PageFrame', how='left')
    df['duty_cycle'] = df['duty_cycle'].fillna(0).astype(int)
    df['duty_cycle_sample_count'] = len(df['epoch'].unique())
    df['duty_cycle_percent'] = (df['duty_cycle'] / len(df['epoch'].unique())*100).astype(int)
    return df

In [2]:
vma_df = (pd.read_csv('../../smap_deduplicated.csv'))

vma_df['start'] = vma_df['start'].apply(lambda x: int(x,16))
vma_df['end'] = vma_df['end'].apply(lambda x: int(x,16))

In [3]:
# Get only vma with no pathname (anon region) and a size over 2 MB
filtered_vma_df = (vma_df[pd.isna(vma_df['pathname']) & (vma_df['size'] >= (1<<21))])

In [16]:
# Read in pebs data and bin in N second intervals
N = 10
df = prepare_pebs_df('../../results/results_gapbs_vma/gapbs_cc_sv_samples.dat')
df['time_bin'] = (df['epoch'] // N).astype(int)
print(df)
dfs_by_interval = {
    f"{N * bin}s_to_{N * (bin + 1)}s": group.drop(columns='time_bin')
    for bin, group in df.groupby('time_bin')
}

               PageFrame  epoch  value  time_bin
0                4194304    0.5      0         0
1                6291456    0.5      0         0
2                8388608    0.5      0         0
3               10485760    0.5      0         0
4               12582912    0.5      0         0
...                  ...    ...    ...       ...
2733219  140737345748992  266.5      0        26
2733220  140737347846144  266.5      3        26
2733221  140737349943296  266.5     11        26
2733222  140737352040448  266.5     12        26
2733223  140737486258176  266.5      0        26

[2733224 rows x 4 columns]


In [30]:
for df in dfs_by_interval.values():
    time_bin_df = df.copy()
    duty_df = calculate_duty_cycle(time_bin_df)
    duty_df = duty_df.drop_duplicates(subset='PageFrame')[['PageFrame', 'duty_cycle', 'duty_cycle_sample_count', 'duty_cycle_percent']]
    #duty_avg_df = (df.drop(columns=['epoch', 'duty_cycle_sample_count', 'duty_cycle', 'value']).groupby('PageFrame', as_index=False).mean().astype(int))

    #print(duty_df)
    streak_df = get_reuse_distance_df(time_bin_df)
    time_bin_df = time_bin_df.merge(streak_df, on='PageFrame', how='left')
    
    #print(streak_df)
    #break
    page_stat_df = time_bin_df.groupby('PageFrame').agg(
        {
            'value': ['mean', 'std', 'min', 'max'],
            'reuse_distance': ['mean', 'std', 'min', 'max']
        }
    )

    # Combine duty cycle info with access statistics
    page_stat_df.columns = ['_'.join(col) for col in page_stat_df.columns]
    page_stat_df = page_stat_df.merge(duty_df, on='PageFrame', how='left')
    page_stat_df = page_stat_df.reset_index()

    # Apply region numbers, do this last on smaller aggregated data set because it takes a while.
    page_stat_df['rno'] = page_stat_df.apply(lambda row: find_region_id(row, filtered_vma_df), axis=1)
    page_stat_df = page_stat_df.dropna()
    page_stat_df['rno'] = page_stat_df['rno'].astype(int)
    print(page_stat_df)

    break

      index        PageFrame  value_mean   value_std  value_min  value_max  \
301     301  140727233282048    0.000000    0.000000          0          0   
302     302  140727235379200    0.000000    0.000000          0          0   
303     303  140727237476352    0.000000    0.000000          0          0   
304     304  140727239573504    0.000000    0.000000          0          0   
305     305  140727241670656    0.000000    0.000000          0          0   
...     ...              ...         ...         ...        ...        ...   
5119   5119  140737337360384  127.263158  264.981308          0        854   
5120   5120  140737339457536  110.000000  251.484702          0        862   
5121   5121  140737341554688  149.421053  325.104140          0       1163   
5122   5122  140737343651840  212.368421  430.572256          0       1531   
5123   5123  140737345748992    1.473684    2.776257          0          8   

      reuse_distance_mean  reuse_distance_std  reuse_distance_m

In [6]:
# Assign each page frame a VMA from out filtered VMA data set
df['rno'] = df.apply(lambda row: find_region_id(row, filtered_vma_df), axis=1)
df = df.dropna()
df['rno'] = df['rno'].astype(int)
print(df)

             PageFrame epoch  value  rno
301    140727233282048   0.5      0    6
302    140727235379200   0.5      0    6
303    140727237476352   0.5      0    6
304    140727239573504   0.5      0    6
305    140727241670656   0.5      0    6
...                ...   ...    ...  ...
97423  140737337360384   9.5      0    6
97424  140737339457536   9.5      0    6
97425  140737341554688   9.5      0    6
97426  140737343651840   9.5      0    6
97427  140737345748992   9.5      0    6

[91637 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rno'] = df['rno'].astype(int)


In [7]:
page_stat_df = df.groupby('PageFrame').agg(
        {
            'value': ['mean', 'std', 'min', 'max'],
            #'reuse_distance': ['count', 'mean', 'std', 'min', 'max']
        }
    )
page_stat_df.columns = ['_'.join(col) for col in page_stat_df.columns]
page_stat_df = page_stat_df.reset_index()
print(page_stat_df)

            PageFrame  value_mean   value_std  value_min  value_max
0     140727233282048    0.000000    0.000000          0          0
1     140727235379200    0.000000    0.000000          0          0
2     140727237476352    0.000000    0.000000          0          0
3     140727239573504    0.000000    0.000000          0          0
4     140727241670656    0.000000    0.000000          0          0
...               ...         ...         ...        ...        ...
4818  140737337360384  127.263158  264.981308          0        854
4819  140737339457536  110.000000  251.484702          0        862
4820  140737341554688  149.421053  325.104140          0       1163
4821  140737343651840  212.368421  430.572256          0       1531
4822  140737345748992    1.473684    2.776257          0          8

[4823 rows x 5 columns]


In [8]:
duty_df = (df.drop(columns=['epoch', 'duty_cycle_sample_count', 'duty_cycle', 'value']).groupby('PageFrame', as_index=False).mean().astype(int))
page_stat_df = page_stat_df.merge(duty_df, on='PageFrame', how='left')
print(page_stat_df)

KeyError: "['duty_cycle_sample_count', 'duty_cycle'] not found in axis"

In [None]:
streak_df = get_reuse_distance_df(df)
print(streak_df)

In [None]:
page_stat_df = page_stat_df.merge(streak_df, on='PageFrame', how='left')
print(page_stat_df)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def apply_cluster(page_stat_df):
    scaler = StandardScaler()
    features = page_stat_df.drop(columns=['PageFrame', 'rno'])
    scaled_features = scaler.fit_transform(features)
    
    pca_col = ['pc1', 'pc2']
    pca = PCA(n_components=2)
    pca_df = pd.DataFrame(pca.fit_transform(scaled_features), columns=pca_col)
    
    k = 8
    kmeans = KMeans(n_clusters=k)
    #kmeans.fit(scaled_features)
    kmeans.fit(pca_df)
    
    page_stat_df['cluster'] = kmeans.labels_
    page_stat_df_merged = (pd.concat([page_stat_df, pca_df], axis=1))
    page_stat_df_merged['start_epoch'] = 0

    # Return new df with cluster labels and pca values
    return page_stat_df_merged

clustered_df = apply_cluster(page_stat_df)

In [None]:
print(clustered_df)

sns.scatterplot(data=clustered_df, y='pc2', x='pc1', hue='cluster', palette=sns.color_palette("tab10"))

In [None]:
vma_stat_df = page_stat_df.groupby('rno').agg(
        {
            'duty_cycle_percent': ['count', 'mean', 'std', 'min', 'max'],
            'reuse_distance': ['count', 'mean', 'std', 'min', 'max']
        }
    )

filtered_vma_df = vma_df
filtered_vma_df = vma_df[vma_df['pathname'] == pd.NA]

vma_stat_df.columns = ['_'.join(col) for col in vma_stat_df.columns]
vma_stat_df = vma_stat_df.reset_index()
vma_stat_df = pd.merge(vma_stat_df, filtered_vma_df, on='rno', how='left')
vma_stat_df['rss_kb_percent'] = vma_stat_df['rss_kb'] / vma_stat_df['size']
vma_stat_df['pss_kb_dirty_percent'] = vma_stat_df['pss_dirty'] / vma_stat_df['pss_kb']
print(vma_stat_df)

In [None]:
# Filter out VMAs

In [None]:
#vma_stat_df = vma_stat_df.loc[vma_stat_df.groupby('rno')['size'].idxmax()]
#print(vma_stat_df.reset_index(drop=True).drop(columns=['epoch', 'start', 'end', 'inode', 'pathname', 'rss_kb', 'pss_kb', 'pss_dirty', 'referenced']))