Commit
adding knn model
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Mar 24, 2022
1 parent 23bc16b commit be10e0a
Showing 10 changed files with 808,978 additions and 6 deletions.
83 changes: 77 additions & 6 deletions 2.online-ml.py
@@ -3,9 +3,10 @@
 # Let's try doing kmeans with river!
 
 from riverapi.main import Client
-from river import cluster, feature_extraction
+from river import cluster, feature_extraction, neighbors
 from scipy.spatial.distance import pdist, squareform
 from sklearn import manifold
+from creme import feature_extraction as creme_features
 
 import pandas
 import argparse
@@ -15,6 +16,7 @@
 
 sys.path.insert(0, os.getcwd())
 from helpers import process_text, write_json, read_errors
+from knn import KNeighborsClassifier
 
 
 def get_parser():
@@ -69,7 +71,7 @@ def iter_sentences(self):
             # Skip single words!
             if not tokens or not sentence.strip() or len(tokens) == 1:
                 continue
-            yield sentence
+            yield sentence, entry["id"]
 
     def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
         """
@@ -87,7 +89,7 @@ def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
 
         # Add each error to the server (only if not done yet)
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -96,6 +98,72 @@ def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
 
         return self.load_model("%s.pkl" % model_name)
 
+    def knn(self, model_name="spack-knn-errors", save_prefix="knn"):
+        """
+        Build the knn model with a particular name.
+        """
+        model = creme_features.TFIDF() | KNeighborsClassifier(
+            n_neighbors=5, window_size=10000
+        )
+
+        # Create a lookup of errors based on id so we can find them quickly
+        print("Creating errors lookup...")
+        lookup = {}
+        for sentence, uid in self.iter_sentences():
+            lookup[uid] = sentence
+
+        # I'm using the model directly since it takes an identifier
+        print("Training KNN model with modified creme...")
+        for sentence, uid in self.iter_sentences():
+            model.fit_one(x=sentence, identifier=uid)
+
+        # Save clusters to file under data/clusters/<prefix>
+        cluster_dir = os.path.join(self.datadir, "clusters", save_prefix)
+        if not os.path.exists(cluster_dir):
+            os.makedirs(cluster_dir)
+
+        # Now get predictions
+        print("Predictions...!\n")
+        results = []
+        result_file_number = 0
+        count = 0
+        for sentence, uid in self.iter_sentences():
+
+            # Each neighbor has:
+            # x, y, error id, and Minkowski distance
+            neighbors = model.predict_one(x=sentence)
+            neighbor_ids = [x[2] for x in neighbors]
+            result = {
+                "error": sentence,
+                "error_id": uid,
+                "neighbor_ids": neighbor_ids,
+                "neighbors": [lookup[x] for x in neighbor_ids],
+            }
+            results.append(result)
+
+            if count > 10000:
+                result_meta = os.path.join(
+                    cluster_dir, "errors-%s-neighbors.json" % result_file_number
+                )
+                write_json(results, result_meta)
+                results = []
+                count = 0
+                result_file_number += 1
+            else:
+                count += 1
+
+        # Save the last batch to file
+        if results:
+            result_meta = os.path.join(
+                cluster_dir, "errors-%s-neighbors.json" % result_file_number
+            )
+            write_json(results, result_meta)
+
+        # Save model to file
+        with open("%s.pkl" % model_name, "wb") as fd:
+            pickle.dump(model, fd)
+        return model
+
     def dbstream(self, model_name="spack-dbstream-errors", save_prefix="dbstream"):
         """
         Build the dbstream model with a particular name.
@@ -114,7 +182,7 @@ def dbstream(self, model_name="spack-dbstream-errors", save_prefix="dbstream"):
             exists = False
 
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -141,7 +209,7 @@ def denstream(self, model_name="spack-dbstream-errors", save_prefix="denstream"):
             exists = False
 
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -156,7 +224,7 @@ def generate_clusters_json(self, model_name, save_prefix):
         # At this point, let's get a prediction for each
         # We can just group them based on the cluster
         clusters = {}
-        for sentence in self.iter_sentences():
+        for sentence, _ in self.iter_sentences():
             res = self.cli.predict(x=sentence, model_name=model_name)
             if res["prediction"] not in clusters:
                 clusters[res["prediction"]] = []
@@ -232,6 +300,9 @@ def main():
     # Get models to see if we have spack-errors
     # models = builder.cli.models()
 
+    # Build knn model and export predictions
+    # model = builder.knn(model_name="spack-knn-errors")
+
     # Build kmeans model and export clusters
     # Note that we don't need to keep doing that - spack-monitor can visualize them now
     # model = builder.kmeans(model_name="spack-errors")
12 changes: 12 additions & 0 deletions README.md
@@ -69,6 +69,18 @@ python 2.online-ml.py
Again, note that a lot of the script is commented out, so you should open it first to
decide what you want to run.

#### KNN

Note that we have decided clustering can work for visualization, but it won't work well for taking a new data point and telling the participant "these are errors that are similar," because we can't easily keep an identifier for the original error with the model. A good (different) strategy for that case is [knn](knn.py), which @vsoch has implemented here based on the original [creme](https://github.com/MaxHalford/creme/blob/master/creme/neighbors/knn.py) implementation. It uses a Minkowski distance, which allows errors to have different features.
To make this work, I had to tweak the model slightly and choose parameters wisely:

- The window size needs to be large enough to hold a good sampling of errors. The default of 50 was far too small, so I increased it to 10K.
- K can stay at 5, as long as the window size is reasonably large.
- **importantly**, I had to modify the algorithm to only add a new point to the window when it differs from the stored points by more than some distance threshold (see the sketch below). Without this, near-duplicate errors crowd the diverse error types out of the window, and predictions come back badly; by only admitting sufficiently different points, the window stays diverse.

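To illustrate that last point, here is a minimal, hypothetical sketch of a threshold-gated window update. The names (`ThresholdedWindow`, `minkowski_distance`) and the threshold value are illustrative, not the actual API of [knn.py](knn.py):

```python
import collections


def minkowski_distance(a, b, p=2):
    # Distance over the union of feature keys, so two errors can have
    # different features (a missing key counts as 0).
    keys = set(a) | set(b)
    return sum(abs(a.get(k, 0.0) - b.get(k, 0.0)) ** p for k in keys) ** (1 / p)


class ThresholdedWindow:
    """Only admit a point that is farther than `threshold` from every
    stored point, so near-duplicates never crowd out diverse errors."""

    def __init__(self, window_size=10000, threshold=0.5, p=2):
        self.window = collections.deque(maxlen=window_size)
        self.threshold = threshold
        self.p = p

    def add(self, x, identifier):
        # x is a dict of features (e.g., TFIDF weights) for one error
        distances = [minkowski_distance(x, stored, self.p) for stored, _ in self.window]
        if not distances or min(distances) > self.threshold:
            self.window.append((x, identifier))
```

With `deque(maxlen=...)` semantics the oldest points still age out once the window is full, but the gate keeps the sample diverse in the first place.
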
### Spack Issues

For spack issues, steps 2 and 3 are your last steps. If there is already a folder [data/spack-issues](data/spack-issues)
