Skip to content

Commit

Permalink
updating dbstream to not use one word errors
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Mar 12, 2022
1 parent 46f7328 commit 36d89ac
Show file tree
Hide file tree
Showing 128 changed files with 75,903 additions and 30,810 deletions.
42 changes: 19 additions & 23 deletions 2.online-ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def init_client(self, host, prefix=None):
else:
self.cli = Client(host)

def iter_sentences(self, return_text=False, return_raw=False):
def iter_sentences(self):
"""
Yield sentences (parsed) to learn from.
"""
Expand All @@ -64,16 +64,13 @@ def iter_sentences(self, return_text=False, return_raw=False):
continue

text = raw.split("error:", 1)[-1]
if return_text and text:
yield text
elif return_raw:
yield raw
else:
tokens = process_text(text)
sentence = " ".join(tokens)
if not tokens or not sentence.strip():
continue
yield sentence
tokens = process_text(text)
sentence = " ".join(tokens)

# Skip single words!
if not tokens or not sentence.strip() or len(tokens) == 1:
continue
yield sentence

def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
"""
Expand All @@ -91,7 +88,7 @@ def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):

# Add each error to the server (only if not done yet)
if not exists:
for sentence in self.iter_sentences(self.errors):
for sentence in self.iter_sentences():
res = self.cli.learn(x=sentence, model_name=model_name)

# Save clusters to file under data/clusters/<prefix>
Expand All @@ -116,7 +113,7 @@ def dbstream(self, model_name="spack-dbstream-errors", save_prefix="dbstream"):
exists = False

if not exists:
for sentence in self.iter_sentences(self.errors):
for sentence in self.iter_sentences():
res = self.cli.learn(x=sentence, model_name=model_name)

# Save clusters to file under data/clusters/<prefix>
Expand All @@ -128,23 +125,22 @@ def denstream(self, model_name="spack-dbstream-errors", save_prefix="denstream")
"""
Build the denstream model https://riverml.xyz/latest/api/cluster/DenStream/
"""
# See https://github.com/online-ml/river/issues/874
# model might have bugs! denstream I think is better
# because denstream is good with outliers (we likely won't have)
exists = True
if model_name not in self.cli.models()["models"]:

# TODO the docs of denstream are not clear about beta/mu (and the range is wrong)
# and running like this we only get 2 clusters - I'm going to keep testing this.
model = feature_extraction.BagOfWords() | cluster.DenStream(
decaying_factor=0.01,
beta=1.01,
mu=1.0005,
epsilon=0.5,
n_samples_init=10,
beta=0.5,
mu=2.5,
epsilon=0.02,
)
model_name = self.cli.upload_model(model, "cluster", model_name=model_name)
exists = False

if not exists:
for sentence in self.iter_sentences(self.errors):
for sentence in self.iter_sentences():
res = self.cli.learn(x=sentence, model_name=model_name)

# Save clusters to file under data/clusters/<prefix>
Expand All @@ -159,7 +155,7 @@ def generate_clusters_json(self, model_name, save_prefix):
# At this point, let's get a prediction for each
# We can just group them based on the cluster
clusters = {}
for sentence in self.iter_sentences(self.errors):
for sentence in self.iter_sentences():
res = self.cli.predict(x=sentence, model_name=model_name)
if res["prediction"] not in clusters:
clusters[res["prediction"]] = []
Expand Down Expand Up @@ -243,7 +239,7 @@ def main():
# spack monitor server.

# model = builder.dbstream(model_name="spack-dbstream-errors")
model = builder.denstream(model_name="spack-denstream-errors")
# model = builder.denstream(model_name="spack-denstream-errors")


if __name__ == "__main__":
Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@ data/
└── warnings.json
```

### Online ML

The script [2.online-ml.py](2.online-ml.py) will generate the clusters in [data/clusters](data/clusters).
Note that I've commented out the model runs in the main function (since I've already run them), so if you want to reproduce
the results or run things differently, either insert an interactive `IPython.embed()` or uncomment and edit the relevant lines.

```bash
python 2.online-ml.py
```

### Vectors and Docs Visuals

We next want to preprocess the data and generate models / vectors!

```bash
Expand Down
3 changes: 0 additions & 3 deletions data/clusters/dbstream/cluster-tokens-10.json

This file was deleted.

140 changes: 140 additions & 0 deletions data/clusters/dbstream/cluster-tokens-11.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
[
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory",
"unsupported option printmultiosdirectory"
]
10 changes: 9 additions & 1 deletion data/clusters/dbstream/cluster-tokens-116.json
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
[
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression",
"invalid use void expression",
"use constant expression",
"use constant expression"
"use constant expression",
"invalid use void expression"
]

0 comments on commit 36d89ac

Please sign in to comment.