
Commit

adding means image
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Feb 10, 2022
1 parent 0590e9d commit f4cca7e
Showing 6 changed files with 139 additions and 17 deletions.
13 changes: 0 additions & 13 deletions 2.vectors.py
@@ -64,18 +64,6 @@ def process_text(text):
    # Get rid of anything that looks like a path!
    tokens = [x for x in tokens if os.sep not in x]

    # Split words with underscore into two words
    words = []
    for t in tokens:
        if "_" in t:
            words += [x.strip() for x in t.split("_")]

        # Don't add single letters
        elif len(t) == 1:
            continue
        else:
            words.append(t)

    # Don't do stemming here - the error messages are usually hard coded / consistent
    # words = [stemmer.stem(t) for t in tokens]
    return tokens
@@ -125,7 +113,6 @@ def build_model(texts, name, outdir):
    emb.to_csv(os.path.join("docs", "%s-umap-software-embeddings.csv" % name))

    # Make the tsne (output embeddings go into docs for visual)
    return
    fit = manifold.TSNE(n_components=2)
    embedding = fit.fit_transform(distance)
    emb = pandas.DataFrame(embedding, index=distance.index, columns=["x", "y"])
2 changes: 1 addition & 1 deletion 3.charts.py
@@ -148,7 +148,7 @@ def main():
        )
    }

    print("%s out of %s mention 'undefined reference'" %(count, len(errors)))
    print("%s out of %s mention 'undefined reference'" % (count, len(errors)))
    write_json(lookup_parsed, os.path.join("docs", "parsed_errors_count.json"))
    write_json(lookup_errors, os.path.join("docs", "errors_count.json"))

132 changes: 132 additions & 0 deletions 4.scores.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from glob import glob
import statistics
import pandas
import tempfile
import shutil
import argparse
import json
import re
import sys
import os


# Derive stop words and stemmer once
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def get_parser():
    parser = argparse.ArgumentParser(
        description="Spack Monitor Analyzer",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_dir",
        help="Directory with data",
        default=os.path.join(os.getcwd(), "data"),
    )
    return parser


def write_json(content, filename):
    with open(filename, "w") as fd:
        fd.write(json.dumps(content, indent=4))


def read_json(filename):
    with open(filename, "r") as fd:
        content = json.loads(fd.read())
    return content


def process_text(text):
    """
    Process text, including:
    1. Lowercase
    2. Remove numbers and punctuation
    3. Strip whitespace
    4. Tokenize and stop word removal
    5. Stemming
    """
    # Make lowercase
    text = text.lower()

    # Remove numbers and punctuation (but leave path separator for now)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s\/]", "", text)

    # Strip whitespace
    text = text.strip()

    # Tokenize and remove stop words
    tokens = [x for x in word_tokenize(text) if x not in stop_words]

    # Error output includes filepaths - get rid of anything that looks like a path!
    tokens = [x for x in tokens if os.sep not in x]

    # Don't do stemming here - the error messages are usually hard coded / consistent
    # words = [stemmer.stem(t) for t in tokens]
    return tokens


def main():

    parser = get_parser()
    args, extra = parser.parse_known_args()

    # Make sure output directory exists
    datadir = os.path.abspath(args.data_dir)
    if not os.path.exists(datadir):
        sys.exit("%s does not exist!" % datadir)

    # Build model with errors
    errors = []
    for filename in glob(os.path.join(datadir, "errors*.json")):
        errors += read_json(filename)
    print("Found %s errors!" % len(errors))

    # Load in model
    model = Doc2Vec.load(os.path.join("data", "models", "model.error.doc2vec"))

    scores = []
    # For each error, calculate a homogeneity score
    for entry in errors:

        # Pre, text, and post
        text = entry.get("text")
        if not text:
            continue

        # Split based on error
        if "error:" not in text:
            continue

        text = text.split("error:", 1)[-1]
        tokens = process_text(text)
        new_vector = model.infer_vector(tokens)
        sims = model.docvecs.most_similar([new_vector])

        # NOT a perfect metric, take mean and sd
        nums = [x[1] for x in sims]
        scores.append((statistics.mean(nums), statistics.stdev(nums)))

    # We can save these if needed
    means = [s[0] for s in scores]
    stdevs = [s[1] for s in scores]

    plt.hist(means, bins=100)
    plt.title("KNN with N=10, average similarity for 30K messages")
    plt.savefig(os.path.join("data", "means.png"))


if __name__ == "__main__":
    main()
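
A quick usage sketch for the new script (assuming the `errors*.json` files and the Doc2Vec model trained in the earlier steps are already in place; the histogram is written to `data/means.png`):

```bash
$ python 4.scores.py --data_dir ./data
```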
8 changes: 5 additions & 3 deletions README.md
@@ -59,7 +59,7 @@ in build errors than warnings that clutter the signal. For the "error only" (or
we look for strings that have `error:`, split on it, and take the right side. For all other
processing methods, we remove paths (e.g., tokenize, then remove anything with an os.sep or path separator).
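
For illustration, here is a minimal sketch of that "split on `error:` and drop path tokens" step (hypothetical error text; the actual logic lives in the numbered scripts, and nltk's `punkt` and `stopwords` data are assumed to be downloaded):

```python
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))


def parse_error(text):
    # Keep only what follows "error:"
    text = text.split("error:", 1)[-1].lower()
    # Remove numbers and punctuation, but keep "/" so paths stay intact for now
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s\/]", "", text).strip()
    # Tokenize, drop stop words, then drop anything that looks like a path
    tokens = [t for t in word_tokenize(text) if t not in stop_words]
    return [t for t in tokens if os.sep not in t]


# Hypothetical message; yields something like ["undefined", "reference", "main"]
print(parse_error("/tmp/build.c:10: error: undefined reference to 'main'"))
```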

Finally, generate counts of data (to be put into [docs](docs) if we want to eventually visualize):
Then generate counts of data (to be put into [docs](docs) if we want to eventually visualize):

```bash
$ python 3.charts.py
@@ -71,10 +71,12 @@ Some data will be generated in data, and assets for the web interface will go
into [docs](docs). The interface allows you to select and see the difference between
the models, and clearly just using the error messages (parsed or not) has the strongest signal (best clustering).

## TODO
And finally, generate a quick plot to show, if we did KNN for each error, the mean similarity
of the closest 10 points (standard deviation not shown, but calculated if we need it):

- cluster metric of goodness/homogeneity
![data/means.png](data/means.png)
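
As a worked example, each error's score reduces the similarities of its ten nearest neighbors to a mean and standard deviation (hypothetical similarity values shown):

```python
import statistics

# Hypothetical cosine similarities for the 10 nearest neighbors of one error
sims = [0.91, 0.89, 0.88, 0.85, 0.84, 0.82, 0.80, 0.79, 0.77, 0.75]
print(statistics.mean(sims))   # ~0.83, one point in the histogram above
print(statistics.stdev(sims))  # ~0.05, the spread (computed but not plotted)
```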


## License

Spack is distributed under the terms of both the MIT license and the
Binary file added data/means.png
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
numba==0.53
requests
matplotlib
gensim
nltk
pandas
