init

amy12xx · Aug 18, 2020 · c176f49 · c176f49
1 parent 6ecbc04
commit c176f49
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 9 deletions.
diff --git a/examples/02_fit_predict_plot_employee_salaries.py b/examples/02_fit_predict_plot_employee_salaries.py
@@ -73,7 +73,7 @@
     'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
     'similarity': SimilarityEncoder(similarity='ngram'),
     'target': TargetEncoder(handle_unknown='ignore'),
-    'minhash': MinHashEncoder(n_components=10, ngram_range=(2, 4),
+    'minhash': MinHashEncoder(n_components=100, ngram_range=(2, 4),
                               hashing='fast', minmax_hash=False),
     'numerical': FunctionTransformer(None)}
 
@@ -130,11 +130,11 @@ def make_pipeline(encoding_method):
 # Plotting the results
 # --------------------
 # Finally, we plot the scores on a boxplot:
-# We notice that the MinHashEncoder performs poorly compared to other encoding
-# methods. There are two reasons for that: the MinHashEncoder performs better
-# with tree-based models than linear models (see example 03), and the
-# low-dimensionality of encodings (increasing n_components improves
-# performances.
+# We notice that the MinHashEncoder does not perform as well as the
+# other encoding methods.
+# There are two reasons for that: the MinHashEncoder performs better
+# with tree-based models than with linear models (see example 03), and
+# its performance improves as n_components is increased.
 
 import seaborn
 import matplotlib.pyplot as plt

diff --git a/examples/03_fit_predict_plot_midwest_survey.py b/examples/03_fit_predict_plot_midwest_survey.py
@@ -69,7 +69,7 @@
 encoder_dict = {
     'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
     'similarity': SimilarityEncoder(similarity='ngram'),
-    'minhash': MinHashEncoder(n_components=10, ngram_range=(2, 4),
+    'minhash': MinHashEncoder(n_components=30, ngram_range=(2, 4),
                               hashing='fast', minmax_hash=False),
     'num': FunctionTransformer(None)
 }
@@ -135,5 +135,5 @@ def make_pipeline(encoding_method):
 plt.tight_layout()
 
 ###############################################################################
-# We can see that encoding the data using a SimilarityEncoder instead of
-# OneHotEncoder helps a lot in improving the cross validation score!
+# We can see that encoding the data using a SimilarityEncoder or MinHashEncoder
+# instead of OneHotEncoder helps a lot in improving the cross validation score!