Commit
adding knn model
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Mar 24, 2022
1 parent 23bc16b commit be10e0a
Showing 10 changed files with 808,978 additions and 6 deletions.
83 changes: 77 additions & 6 deletions 2.online-ml.py
@@ -3,9 +3,10 @@
 # Let's try doing kmeans with river!
 
 from riverapi.main import Client
-from river import cluster, feature_extraction
+from river import cluster, feature_extraction, neighbors
 from scipy.spatial.distance import pdist, squareform
 from sklearn import manifold
+from creme import feature_extraction as creme_features
 
 import pandas
 import argparse
@@ -15,6 +16,7 @@
 
 sys.path.insert(0, os.getcwd())
 from helpers import process_text, write_json, read_errors
+from knn import KNeighborsClassifier
 
 
 def get_parser():
@@ -69,7 +71,7 @@ def iter_sentences(self):
             # Skip single words!
             if not tokens or not sentence.strip() or len(tokens) == 1:
                 continue
-            yield sentence
+            yield sentence, entry["id"]
 
     def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
         """
@@ -87,7 +89,7 @@ def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
 
         # Add each error to the server (only if not done yet)
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -96,6 +98,72 @@ def kmeans(self, model_name="spack-errors", save_prefix="kmeans"):
 
         return self.load_model("%s.pkl" % model_name)
 
+    def knn(self, model_name="spack-knn-errors", save_prefix="knn"):
+        """
+        Build the knn model with a particular name.
+        """
+        model = creme_features.TFIDF() | KNeighborsClassifier(
+            n_neighbors=5, window_size=10000
+        )
+
+        # Create a lookup of errors based on id so we can find them quickly
+        print("Creating errors lookup...")
+        lookup = {}
+        for sentence, uid in self.iter_sentences():
+            lookup[uid] = sentence
+
+        # I'm using the model directly since it takes an identifier
+        print("Training KNN model with modified creme...")
+        for sentence, uid in self.iter_sentences():
+            model.fit_one(x=sentence, identifier=uid)
+
+        # Save clusters to file under data/clusters/<prefix>
+        cluster_dir = os.path.join(self.datadir, "clusters", save_prefix)
+        if not os.path.exists(cluster_dir):
+            os.makedirs(cluster_dir)
+
+        # Now get predictions
+        print("Predictions...!\n")
+        results = []
+        result_file_number = 0
+        count = 0
+        for sentence, uid in self.iter_sentences():
+
+            # Each neighbor has:
+            # x, y, error id, and Minkowski distance
+            neighbors = model.predict_one(x=sentence)
+            neighbor_ids = [x[2] for x in neighbors]
+            result = {
+                "error": sentence,
+                "error_id": uid,
+                "neighbor_ids": neighbor_ids,
+                "neighbors": [lookup[x] for x in neighbor_ids],
+            }
+            results.append(result)
+
+            if count > 10000:
+                result_meta = os.path.join(
+                    cluster_dir, "errors-%s-neighbors.json" % result_file_number
+                )
+                write_json(results, result_meta)
+                results = []
+                count = 0
+                result_file_number += 1
+            else:
+                count += 1
+
+        # Save the last batch to file
+        if results:
+            result_meta = os.path.join(
+                cluster_dir, "errors-%s-neighbors.json" % result_file_number
+            )
+            write_json(results, result_meta)
+
+        # Save model to file
+        with open("%s.pkl" % model_name, "wb") as fd:
+            pickle.dump(model, fd)
+        return model
+
     def dbstream(self, model_name="spack-dbstream-errors", save_prefix="dbstream"):
         """
         Build the dbstream model with a particular name.
@@ -114,7 +182,7 @@ def dbstream(self, model_name="spack-dbstream-errors", save_prefix="dbstream"):
             exists = False
 
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -141,7 +209,7 @@ def denstream(self, model_name="spack-dbstream-errors", save_prefix="denstream"):
             exists = False
 
         if not exists:
-            for sentence in self.iter_sentences():
+            for sentence, _ in self.iter_sentences():
                 self.cli.learn(x=sentence, model_name=model_name)
 
         # Save clusters to file under data/clusters/<prefix>
@@ -156,7 +224,7 @@ def generate_clusters_json(self, model_name, save_prefix):
         # At this point, let's get a prediction for each
         # We can just group them based on the cluster
         clusters = {}
-        for sentence in self.iter_sentences():
+        for sentence, _ in self.iter_sentences():
             res = self.cli.predict(x=sentence, model_name=model_name)
             if res["prediction"] not in clusters:
                 clusters[res["prediction"]] = []
@@ -232,6 +300,9 @@ def main():
     # Get models to see if we have spack-errors
     # models = builder.cli.models()
 
+    # Build knn model and export predictions
+    # model = builder.knn(model_name="spack-knn-errors")
+
     # Build kmeans model and export clusters
     # Note that we don't need to keep doing that - spack-monitor can visualize them now
     # model = builder.kmeans(model_name="spack-errors")
12 changes: 12 additions & 0 deletions README.md
@@ -69,6 +69,18 @@ python 2.online-ml.py
Again, note that a lot of the script is commented out, so you should open it first to
decide what you want to run.

#### KNN

Note that we have decided clustering can work for visualization, but it won't work well for taking a new data point and telling the participant "these are errors that are similar," because we can't easily keep an identifier for the original error with the model. A good (different) strategy for that case is [knn](knn.py), which @vsoch has implemented here based on the original [creme](https://github.com/MaxHalford/creme/blob/master/creme/neighbors/knn.py) implementation. It uses a Minkowski distance, which allows errors to have different features.
To make this work, I had to tweak the model slightly and choose parameters wisely:

- The window size needs to be large enough to hold a good sampling of errors. The default of 50 was far too small, so I increased it to 10K.
- K can stay at 5, as long as the window size is reasonably large.
- **importantly**, I had to modify the algorithm to only add a new point to the window when it differs from the stored points by more than some distance threshold (see the sketch below). Without this, near-duplicate errors crowd the diverse error types out of the window, and predictions come back badly; by only admitting sufficiently different points, the window stays diverse.

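To illustrate that last point, here is a minimal, hypothetical sketch of a threshold-gated window update. The names (`ThresholdedWindow`, `minkowski_distance`) and the threshold value are illustrative, not the actual API of [knn.py](knn.py):

```python
import collections


def minkowski_distance(a, b, p=2):
    # Distance over the union of feature keys, so two errors can have
    # different features (a missing key counts as 0).
    keys = set(a) | set(b)
    return sum(abs(a.get(k, 0.0) - b.get(k, 0.0)) ** p for k in keys) ** (1 / p)


class ThresholdedWindow:
    """Only admit a point that is farther than `threshold` from every
    stored point, so near-duplicates never crowd out diverse errors."""

    def __init__(self, window_size=10000, threshold=0.5, p=2):
        self.window = collections.deque(maxlen=window_size)
        self.threshold = threshold
        self.p = p

    def add(self, x, identifier):
        # x is a dict of features (e.g., TFIDF weights) for one error
        distances = [minkowski_distance(x, stored, self.p) for stored, _ in self.window]
        if not distances or min(distances) > self.threshold:
            self.window.append((x, identifier))
```

With `deque(maxlen=...)` semantics the oldest points still age out once the window is full, but the gate keeps the sample diverse in the first place.
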
### Spack Issues

For spack issues, steps 2 and 3 are your last steps. If there is already a folder [data/spack-issues](data/spack-issues)
