Skip to content

Commit

Permalink
start of exploration of flags with manhatten plot
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Nov 24, 2021
1 parent db1f901 commit 08f9685
Show file tree
Hide file tree
Showing 8 changed files with 41,136 additions and 139 deletions.
19 changes: 15 additions & 4 deletions association-analysis/hill-climb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,23 @@ by doing:
### Linear Regression

The goal here would be to predict the importance of a flag based on tokens, or breaking the code up into tiny pieces.
This will require using the [tokens](../tokens).
This will require using the [tokens](../tokens). Note that I tried this (script not preserved) and there was serious overfitting,
but I did notice in the PDF that some flags have HUGE performance improvements, so it might make sense to just look at them.

### Manhattan

What I quickly saw with linear regression was overfitting up the wazoo (like, a perfectly straight line, nope!) So I decided to look at the flags pdf and filter out some set of flags that had a huge increase in performance, and then I'd look at the assembly before and after. But first I thought I'd try a visualization that is usually used for showing significant gene p-values - the manhattan plot! For compiler flags!

```bash
$ python linear_regression.py data/flags-delta-times.csv ../tokens/data/
$ python manhattan.py run data/flags-delta-times.csv ../tokens/data/
```

TODO:
This generates a manhattan plot for all the flags, and then a filtered one with values > 1.3. As a reminder, a value of 1 is the baseline time for the program, so anything above 1 is faster. I like this visualization because it shows a nice little row of flags that are clearly better! But I wanted to filter it a bit more to better look at the actual flags, and that's the second pdf.

### Assembly

this is overfitting up the wazoo! Let's look at the flags pdf and find the few flags that do a LOT better and try to understand why.
Okay - now we can filter down to a set of flags and scripts that have a bit better performance. What I want to do is to look at the assembly of the program with and without the flag, and try to understand what is being optimized.

```bash
$ python explore_assembly.py run data/flags-times-flat.csv
```
40,970 changes: 40,970 additions & 0 deletions association-analysis/hill-climb/data/flags-times-flat.csv

Large diffs are not rendered by default.

Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
135 changes: 0 additions & 135 deletions association-analysis/hill-climb/linear_regression.py

This file was deleted.

151 changes: 151 additions & 0 deletions association-analysis/hill-climb/manhattan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
import pandas

import argparse
from glob import glob
import json
import os
import re
import shutil
import sys

import time

# keep global results
results = []

here = os.path.dirname(os.path.abspath(__file__))


def get_parser():
    """Build the command line parser for the manhattan plotting script.

    Returns an argparse.ArgumentParser with one subcommand, "run", which
    requires the flags timing csv and the tokens csv directory/file.
    """
    parser = argparse.ArgumentParser(description="run")

    # NOTE: this used to claim the script ran a linear regression — it
    # actually generates manhattan plots of flag performance.
    description = (
        "Generate manhattan plots of per-program performance for each flag."
    )
    subparsers = parser.add_subparsers(
        help="actions",
        title="actions",
        description=description,
        dest="command",
    )
    run = subparsers.add_parser("run", help="run")
    run.add_argument("csv", help="flags-delta-times.csv")
    run.add_argument("tokens", help="tokens.csv")
    return parser


def _plot_flags(df, figsize, prefix):
    """Draw one manhattan-style scatter of 'value' vs 'ind', one color per flag.

    df must carry 'flag' (categorical), 'ind' (x position), and 'value'
    columns. Saves <prefix>.pdf and <prefix>.png.
    """
    grouped = df.groupby("flag")
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    colors = ["darkred", "darkgreen", "darkblue", "gold"]
    x_labels = []
    x_labels_pos = []
    for num, (name, group) in enumerate(grouped):
        # groupby on a categorical yields empty groups for categories with
        # no rows (e.g. after filtering) — skip them, but keep counting num
        # so colors stay aligned with category order.
        if group.empty:
            continue
        group.plot(
            kind="scatter",
            x="ind",
            y="value",
            color=colors[num % len(colors)],
            ax=ax,
        )
        x_labels.append(name)
        # Center each flag's tick label under its run of points
        x_labels_pos.append(
            group["ind"].iloc[-1]
            - (group["ind"].iloc[-1] - group["ind"].iloc[0]) / 2
        )
    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels, rotation=45)

    # set axis limits
    ax.set_xlim([0, len(df)])
    ax.set_ylim([0, 3])

    # x axis label
    ax.set_xlabel("Flag")

    # save the graph
    plt.savefig(prefix + ".pdf")
    plt.savefig(prefix + ".png")


def manhattan(csv, tokens):
    """Generate manhattan plots of per-program flag performance.

    Parameters:
        csv: path to flags-delta-times.csv, a programs x flags matrix of
             relative runtimes (1.0 == baseline; higher means faster).
        tokens: path to a tokens csv whose index holds source file paths.

    Writes data/flags-times-flat.csv, plus full and filtered (value >= 1.3)
    manhattan plots as pdf/png under data/.
    """
    flags = pandas.read_csv(csv, index_col=0)
    tokens = pandas.read_csv(tokens, index_col=0)

    # Normalize token paths to match the flag matrix identifiers: drop the
    # absolute prefix, replace spaces and slashes with dashes, and remove
    # the literal ".cpp" suffix.
    # BUGFIX: this previously used str.rstrip(".cpp"), which strips any
    # trailing run of the characters '.', 'c', 'p' — e.g. "mypc.cpp" would
    # become "my". We remove the exact suffix instead.
    def _normalize(path):
        name = (
            path.replace(
                "home/vanessa/Desktop/Code/compilerop/association-analysis/code/",
                "",
            )
            .replace(" ", "-")
            .replace("/", "-")
        )
        return name[: -len(".cpp")] if name.endswith(".cpp") else name

    tokens.index = [_normalize(x) for x in tokens.index]

    # Flatten the programs x flags matrix into (flag, program, value) rows.
    # stack() walks row-major (program, then flag) — the same order the old
    # nested iterrows loop produced, but in one vectorized pass instead of
    # one .loc assignment per cell; dropna=False keeps missing timings.
    print("Flattening flags data frame...")
    flat = flags.stack(dropna=False).reset_index()
    flat.columns = ["program", "flag", "value"]
    flat = flat[["flag", "program", "value"]]

    flat.to_csv("data/flags-times-flat.csv")

    # -log_10(pvalue) analogue (kept for parity with classic manhattan
    # plots; the scatter itself uses the raw 'value' column)
    flat['minuslog10pvalue'] = -np.log10(flat.value)
    flat.flag = flat.flag.astype('category')
    flat = flat.sort_values('flag')
    flat['ind'] = range(len(flat))

    # Full manhattan plot over every flag/program pair
    _plot_flags(flat, (40, 10), "data/manhattan-flags")

    # Filtered plot: only pairs comfortably faster than baseline (>= 1.3).
    # .copy() avoids pandas' SettingWithCopyWarning on the 'ind' assignment.
    filtered = flat[flat.value >= 1.3].copy()
    filtered['ind'] = range(len(filtered))
    _plot_flags(filtered, (20, 10), "data/manhattan-flags-filtered")

def main():
    """Entry point: parse arguments, validate inputs, and run the plots."""
    parser = get_parser()

    # Print usage and exit; avoids shadowing the builtin help()
    def show_help(return_code=0):
        parser.print_help()
        sys.exit(return_code)

    args, extra = parser.parse_known_args()

    # No subcommand given — show usage and bail out
    if not args.command:
        show_help()

    # The flags csv must exist before we try to read it
    if not args.csv or not os.path.exists(args.csv):
        sys.exit("%s missing or does not exist." % args.csv)

    manhattan(args.csv, args.tokens)


if __name__ == "__main__":
main()

0 comments on commit 08f9685

Please sign in to comment.