Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
TwsThomas committed Aug 18, 2020
1 parent 6ecbc04 commit c176f49
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 9 deletions.
12 changes: 6 additions & 6 deletions examples/02_fit_predict_plot_employee_salaries.py
Expand Up @@ -73,7 +73,7 @@
'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
'similarity': SimilarityEncoder(similarity='ngram'),
'target': TargetEncoder(handle_unknown='ignore'),
'minhash': MinHashEncoder(n_components=10, ngram_range=(2, 4),
'minhash': MinHashEncoder(n_components=100, ngram_range=(2, 4),
hashing='fast', minmax_hash=False),
'numerical': FunctionTransformer(None)}

Expand Down Expand Up @@ -130,11 +130,11 @@ def make_pipeline(encoding_method):
# Plotting the results
# --------------------
# Finally, we plot the scores on a boxplot:
# We notice that the MinHashEncoder performs poorly compared to other encoding
# methods. There are two reasons for that: the MinHashEncoder performs better
# with tree-based models than linear models (see example 03), and the
# low-dimensionality of encodings (increasing n_components improves
# performance).
# We notice that the MinHashEncoder does not perform as well as the
# other encoding methods.
# There are two reasons for that: the MinHashEncoder performs better
# with tree-based models than with linear models (see example 03), and
# its performance improves as n_components is increased.

import seaborn
import matplotlib.pyplot as plt
Expand Down
6 changes: 3 additions & 3 deletions examples/03_fit_predict_plot_midwest_survey.py
Expand Up @@ -69,7 +69,7 @@
encoder_dict = {
'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
'similarity': SimilarityEncoder(similarity='ngram'),
'minhash': MinHashEncoder(n_components=10, ngram_range=(2, 4),
'minhash': MinHashEncoder(n_components=30, ngram_range=(2, 4),
hashing='fast', minmax_hash=False),
'num': FunctionTransformer(None)
}
Expand Down Expand Up @@ -135,5 +135,5 @@ def make_pipeline(encoding_method):
plt.tight_layout()

###############################################################################
# We can see that encoding the data using a SimilarityEncoder instead of
# OneHotEncoder helps a lot in improving the cross validation score!
# We can see that encoding the data using a SimilarityEncoder or MinHashEncoder
# instead of OneHotEncoder helps a lot in improving the cross-validation score!

0 comments on commit c176f49

Please sign in to comment.