Skip to content

Commit

Permalink
start of exploration of flags with manhatten plot
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Nov 24, 2021
1 parent db1f901 commit 08f9685
Show file tree
Hide file tree
Showing 8 changed files with 41,136 additions and 139 deletions.
19 changes: 15 additions & 4 deletions association-analysis/hill-climb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,23 @@ by doing:
### Linear Regression

The goal here would be to predict the importance of a flag based on tokens, or breaking the code up into tiny pieces.
This will require using the [tokens](../tokens).
This will require using the [tokens](../tokens). Note that I tried this (script not preserved) and there was serious overfitting,
but I did notice in the PDF that some flags have HUGE performance improvements, so it might make sense to just look at them.

### Manhattan

What I quickly saw with linear regression was overfitting up the wazoo (like, a perfectly straight line, nope!) So I decided to look at the flags pdf and filter out some set of flags that had a huge increase in performance, and then I'd look at the assembly before and after. But first I thought I'd try a visualization that is usually used for showing significant gene p-values - the manhattan plot! For compiler flags!

```bash
$ python linear_regression.py data/flags-delta-times.csv ../tokens/data/
$ python manhattan.py run data/flags-delta-times.csv ../tokens/data/
```

TODO:
This generates a manhattan plot for all the flags, and then a filtered one with values > 1.3. As a reminder, a value of 1 is the baseline time for the program, so anything above 1 is faster. I like this visualization because it shows a nice little row of flags that are clearly better! But I wanted to filter it a bit more to better look at the actual flags, and that's the second pdf.

### Assembly

this is overfitting up the wazoo! Let's look at the flags pdf and find the few flags that do a LOT better and try to understand why.
Okay - now we can filter down to a set of flags and scripts that have a bit better performance. What I want to do is to look at the assembly of the program with and without the flag, and try to understand what is being optimized.

```bash
$ python explore_assembly.py run data/flags-times-flat.csv
```
40,970 changes: 40,970 additions & 0 deletions association-analysis/hill-climb/data/flags-times-flat.csv

Large diffs are not rendered by default.

Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
135 changes: 0 additions & 135 deletions association-analysis/hill-climb/linear_regression.py

This file was deleted.

151 changes: 151 additions & 0 deletions association-analysis/hill-climb/manhattan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
import pandas

import argparse
from glob import glob
import json
import os
import re
import shutil
import sys

import time

# keep global results
results = []

here = os.path.dirname(os.path.abspath(__file__))


def get_parser():
    """Build the command line parser for the manhattan plotting script.

    Returns an argparse.ArgumentParser with one subcommand, "run", which
    requires the flags timing csv and the tokens csv directory/file.
    """
    parser = argparse.ArgumentParser(description="run")

    # NOTE: this used to claim the script ran a linear regression — it
    # actually generates manhattan plots of flag performance.
    description = (
        "Generate manhattan plots of per-program performance for each flag."
    )
    subparsers = parser.add_subparsers(
        help="actions",
        title="actions",
        description=description,
        dest="command",
    )
    run = subparsers.add_parser("run", help="run")
    run.add_argument("csv", help="flags-delta-times.csv")
    run.add_argument("tokens", help="tokens.csv")
    return parser


def _plot_flags(df, figsize, prefix):
    """Draw one manhattan-style scatter of 'value' vs 'ind', one color per flag.

    df must carry 'flag' (categorical), 'ind' (x position), and 'value'
    columns. Saves <prefix>.pdf and <prefix>.png.
    """
    grouped = df.groupby("flag")
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    colors = ["darkred", "darkgreen", "darkblue", "gold"]
    x_labels = []
    x_labels_pos = []
    for num, (name, group) in enumerate(grouped):
        # groupby on a categorical yields empty groups for categories with
        # no rows (e.g. after filtering) — skip them, but keep counting num
        # so colors stay aligned with category order.
        if group.empty:
            continue
        group.plot(
            kind="scatter",
            x="ind",
            y="value",
            color=colors[num % len(colors)],
            ax=ax,
        )
        x_labels.append(name)
        # Center each flag's tick label under its run of points
        x_labels_pos.append(
            group["ind"].iloc[-1]
            - (group["ind"].iloc[-1] - group["ind"].iloc[0]) / 2
        )
    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels, rotation=45)

    # set axis limits
    ax.set_xlim([0, len(df)])
    ax.set_ylim([0, 3])

    # x axis label
    ax.set_xlabel("Flag")

    # save the graph
    plt.savefig(prefix + ".pdf")
    plt.savefig(prefix + ".png")


def manhattan(csv, tokens):
    """Generate manhattan plots of per-program flag performance.

    Parameters:
        csv: path to flags-delta-times.csv, a programs x flags matrix of
             relative runtimes (1.0 == baseline; higher means faster).
        tokens: path to a tokens csv whose index holds source file paths.

    Writes data/flags-times-flat.csv, plus full and filtered (value >= 1.3)
    manhattan plots as pdf/png under data/.
    """
    flags = pandas.read_csv(csv, index_col=0)
    tokens = pandas.read_csv(tokens, index_col=0)

    # Normalize token paths to match the flag matrix identifiers: drop the
    # absolute prefix, replace spaces and slashes with dashes, and remove
    # the literal ".cpp" suffix.
    # BUGFIX: this previously used str.rstrip(".cpp"), which strips any
    # trailing run of the characters '.', 'c', 'p' — e.g. "mypc.cpp" would
    # become "my". We remove the exact suffix instead.
    def _normalize(path):
        name = (
            path.replace(
                "home/vanessa/Desktop/Code/compilerop/association-analysis/code/",
                "",
            )
            .replace(" ", "-")
            .replace("/", "-")
        )
        return name[: -len(".cpp")] if name.endswith(".cpp") else name

    tokens.index = [_normalize(x) for x in tokens.index]

    # Flatten the programs x flags matrix into (flag, program, value) rows.
    # stack() walks row-major (program, then flag) — the same order the old
    # nested iterrows loop produced, but in one vectorized pass instead of
    # one .loc assignment per cell; dropna=False keeps missing timings.
    print("Flattening flags data frame...")
    flat = flags.stack(dropna=False).reset_index()
    flat.columns = ["program", "flag", "value"]
    flat = flat[["flag", "program", "value"]]

    flat.to_csv("data/flags-times-flat.csv")

    # -log_10(pvalue) analogue (kept for parity with classic manhattan
    # plots; the scatter itself uses the raw 'value' column)
    flat['minuslog10pvalue'] = -np.log10(flat.value)
    flat.flag = flat.flag.astype('category')
    flat = flat.sort_values('flag')
    flat['ind'] = range(len(flat))

    # Full manhattan plot over every flag/program pair
    _plot_flags(flat, (40, 10), "data/manhattan-flags")

    # Filtered plot: only pairs comfortably faster than baseline (>= 1.3).
    # .copy() avoids pandas' SettingWithCopyWarning on the 'ind' assignment.
    filtered = flat[flat.value >= 1.3].copy()
    filtered['ind'] = range(len(filtered))
    _plot_flags(filtered, (20, 10), "data/manhattan-flags-filtered")

def main():
    """Entry point: parse arguments, validate inputs, and run the plots."""
    parser = get_parser()

    # Print usage and exit; avoids shadowing the builtin help()
    def show_help(return_code=0):
        parser.print_help()
        sys.exit(return_code)

    args, extra = parser.parse_known_args()

    # No subcommand given — show usage and bail out
    if not args.command:
        show_help()

    # The flags csv must exist before we try to read it
    if not args.csv or not os.path.exists(args.csv):
        sys.exit("%s missing or does not exist." % args.csv)

    manhattan(args.csv, args.tokens)


if __name__ == "__main__":
main()

0 comments on commit 08f9685

Please sign in to comment.