
Commit

adding means image
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Feb 10, 2022
1 parent 0590e9d commit f4cca7e
Showing 6 changed files with 139 additions and 17 deletions.
13 changes: 0 additions & 13 deletions 2.vectors.py
@@ -64,18 +64,6 @@ def process_text(text):
    # Get rid of anything that looks like a path!
    tokens = [x for x in tokens if os.sep not in x]

    # Split words with underscore into two words
    words = []
    for t in tokens:
        if "_" in t:
            words += [x.strip() for x in t.split("_")]

        # Don't add single letters
        elif len(t) == 1:
            continue
        else:
            words.append(t)

    # Don't do stemming here - the error messages are usually hard coded / consistent
    # words = [stemmer.stem(t) for t in tokens]
    return tokens
@@ -125,7 +113,6 @@ def build_model(texts, name, outdir):
    emb.to_csv(os.path.join("docs", "%s-umap-software-embeddings.csv" % name))

    # Make the tsne (output embeddings go into docs for visual)
    return
    fit = manifold.TSNE(n_components=2)
    embedding = fit.fit_transform(distance)
    emb = pandas.DataFrame(embedding, index=distance.index, columns=["x", "y"])
2 changes: 1 addition & 1 deletion 3.charts.py
@@ -148,7 +148,7 @@ def main():
        )
    }

    print("%s out of %s mention 'undefined reference'" %(count, len(errors)))
    print("%s out of %s mention 'undefined reference'" % (count, len(errors)))
    write_json(lookup_parsed, os.path.join("docs", "parsed_errors_count.json"))
    write_json(lookup_errors, os.path.join("docs", "errors_count.json"))

132 changes: 132 additions & 0 deletions 4.scores.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from glob import glob
import statistics
import pandas
import tempfile
import shutil
import argparse
import json
import re
import sys
import os


# Derive stop words and stemmer once
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def get_parser():
    parser = argparse.ArgumentParser(
        description="Spack Monitor Analyzer",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_dir",
        help="Directory with data",
        default=os.path.join(os.getcwd(), "data"),
    )
    return parser


def write_json(content, filename):
    with open(filename, "w") as fd:
        fd.write(json.dumps(content, indent=4))


def read_json(filename):
    with open(filename, "r") as fd:
        content = json.loads(fd.read())
    return content


def process_text(text):
    """
    Process text, including:
    1. Lowercase
    2. Remove numbers and punctuation
    3. Strip whitespace
    4. Tokenize and stop word removal
    5. Stemming
    """
    # Make lowercase
    text = text.lower()

    # Remove numbers and punctuation (but leave path separator for now)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s\/]", "", text)

    # Strip whitespace
    text = text.strip()

    # Tokenize and remove stop words
    tokens = [x for x in word_tokenize(text) if x not in stop_words]

    # Error output includes filepaths - get rid of anything that looks like a path!
    tokens = [x for x in tokens if os.sep not in x]

    # Don't do stemming here - the error messages are usually hard coded / consistent
    # words = [stemmer.stem(t) for t in tokens]
    return tokens


def main():

    parser = get_parser()
    args, extra = parser.parse_known_args()

    # Make sure output directory exists
    datadir = os.path.abspath(args.data_dir)
    if not os.path.exists(datadir):
        sys.exit("%s does not exist!" % datadir)

    # Build model with errors
    errors = []
    for filename in glob(os.path.join(datadir, "errors*.json")):
        errors += read_json(filename)
    print("Found %s errors!" % len(errors))

    # Load in model
    model = Doc2Vec.load(os.path.join("data", "models", "model.error.doc2vec"))

    scores = []
    # For each error, calculate a homogeneity score
    for entry in errors:

        # Pre, text, and post
        text = entry.get("text")
        if not text:
            continue

        # Split based on error
        if "error:" not in text:
            continue

        text = text.split("error:", 1)[-1]
        tokens = process_text(text)
        new_vector = model.infer_vector(tokens)
        sims = model.docvecs.most_similar([new_vector])

        # NOT a perfect metric, take mean and sd
        nums = [x[1] for x in sims]
        scores.append((statistics.mean(nums), statistics.stdev(nums)))

    # We can save these if needed
    means = [s[0] for s in scores]
    stdevs = [s[1] for s in scores]

    plt.hist(means, bins=100)
    plt.title("KNN with N=10, average similarity for 30K messages")
    plt.savefig(os.path.join("data", "means.png"))


if __name__ == "__main__":
    main()
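
A quick usage sketch for the new script (assuming the `errors*.json` files and the Doc2Vec model trained in the earlier steps are already in place; the histogram is written to `data/means.png`):

```bash
$ python 4.scores.py --data_dir ./data
```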
8 changes: 5 additions & 3 deletions README.md
@@ -59,7 +59,7 @@ in build errors than warnings that clutter the signal. For the "error only" (or
we look for strings that have `error:`, split on it, and take the right side. For all other
processing methods, we remove paths (e.g., tokenize, then remove anything with an os.sep or path separator).
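
For illustration, here is a minimal sketch of that "split on `error:` and drop path tokens" step (hypothetical error text; the actual logic lives in the numbered scripts, and nltk's `punkt` and `stopwords` data are assumed to be downloaded):

```python
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))


def parse_error(text):
    # Keep only what follows "error:"
    text = text.split("error:", 1)[-1].lower()
    # Remove numbers and punctuation, but keep "/" so paths stay intact for now
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s\/]", "", text).strip()
    # Tokenize, drop stop words, then drop anything that looks like a path
    tokens = [t for t in word_tokenize(text) if t not in stop_words]
    return [t for t in tokens if os.sep not in t]


# Hypothetical message; yields something like ["undefined", "reference", "main"]
print(parse_error("/tmp/build.c:10: error: undefined reference to 'main'"))
```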

Finally, generate counts of data (to be put into [docs](docs) if we want to eventually visualize):
Then generate counts of data (to be put into [docs](docs) if we want to eventually visualize):

```bash
$ python 3.charts.py
@@ -71,10 +71,12 @@ Some data will be generated in data, and assets for the web interface will go
into [docs](docs). The interface allows you to select and see the difference between
the models, and clearly just using the error messages (parsed or not) has the strongest signal (best clustering).

## TODO
And finally, generate a quick plot to show, if we did KNN for each error, the mean similarity
of the closest 10 points (standard deviation not shown, but calculated if we need it):

- cluster metric of goodness/homogeneity
![data/means.png](data/means.png)
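
As a worked example, each error's score reduces the similarities of its ten nearest neighbors to a mean and standard deviation (hypothetical similarity values shown):

```python
import statistics

# Hypothetical cosine similarities for the 10 nearest neighbors of one error
sims = [0.91, 0.89, 0.88, 0.85, 0.84, 0.82, 0.80, 0.79, 0.77, 0.75]
print(statistics.mean(sims))   # ~0.83, one point in the histogram above
print(statistics.stdev(sims))  # ~0.05, the spread (computed but not plotted)
```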


## License

Spack is distributed under the terms of both the MIT license and the
Binary file added data/means.png
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
numba==0.53
requests
matplotlib
gensim
nltk
pandas
