## Working Notebook

In [13]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [14]:
df = pd.read_csv("data/tweets.csv", encoding='ISO-8859-1')

df.columns = ["unprocessed_tweet", "product", "emotion"]

df.head(3)

Unnamed: 0,unprocessed_tweet,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


In [15]:
text = df["unprocessed_tweet"]
text

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [16]:
text[0]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [32]:
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
stop_words = [i.replace("'", '') for i in stop_words]

stop_words[-5:]

#creating a list of words to add to stop words
top_words = ['sxsw', 'mention', 'link', 'rt']
stop_words = stop_words + top_words

In [18]:
# pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# tokenizer = RegexpTokenizer(pattern)
# sample_doc = tokenizer.tokenize(sample_document)

In [19]:
ex = "[^a-zA-Z\s]"

text = text.str.replace(ex, "", regex=True)
text = text.str.lower()
text

0       wesley i have a g iphone after  hrs tweeting a...
1       jessedee know about fludapp  awesome ipadiphon...
2       swonderlin can not wait for ipad  also they sh...
3       sxsw i hope this years festival isnt as crashy...
4       sxtxstate great stuff on fri sxsw marissa maye...
                              ...                        
9088                            ipad everywhere sxsw link
9089    wave buzz rt mention we interrupt your regular...
9090    googles zeiger a physician never reported pote...
9091    some verizon iphone customers complained their...
9092    rt mention google tests checkin offers at sxsw...
Name: unprocessed_tweet, Length: 9093, dtype: object

## Tokenize

In [20]:
from nltk.tokenize import word_tokenize

tokenized_text = text.apply(lambda t: word_tokenize(str(t)))
tokenized_text

0       [wesley, i, have, a, g, iphone, after, hrs, tw...
1       [jessedee, know, about, fludapp, awesome, ipad...
2       [swonderlin, can, not, wait, for, ipad, also, ...
3       [sxsw, i, hope, this, years, festival, isnt, a...
4       [sxtxstate, great, stuff, on, fri, sxsw, maris...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, we, interrupt, your,...
9090    [googles, zeiger, a, physician, never, reporte...
9091    [some, verizon, iphone, customers, complained,...
9092    [rt, mention, google, tests, checkin, offers, ...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [21]:
filtered_text = tokenized_text.apply(lambda x: [word for word in x if word not in stop_words])
filtered_text = tokenized_text.apply(lambda x: [word for word in x if len(word) > 1])
filtered_text

0       [wesley, have, iphone, after, hrs, tweeting, a...
1       [jessedee, know, about, fludapp, awesome, ipad...
2       [swonderlin, can, not, wait, for, ipad, also, ...
3       [sxsw, hope, this, years, festival, isnt, as, ...
4       [sxtxstate, great, stuff, on, fri, sxsw, maris...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, we, interrupt, your,...
9090    [googles, zeiger, physician, never, reported, ...
9091    [some, verizon, iphone, customers, complained,...
9092    [rt, mention, google, tests, checkin, offers, ...
Name: unprocessed_tweet, Length: 9093, dtype: object

### POS Tagging

In [22]:
tagged_text = filtered_text.apply(lambda x: pos_tag(x))

tagged_text

0       [(wesley, NN), (have, VBP), (iphone, VBN), (af...
1       [(jessedee, NN), (know, VBP), (about, IN), (fl...
2       [(swonderlin, NN), (can, MD), (not, RB), (wait...
3       [(sxsw, NN), (hope, VBP), (this, DT), (years, ...
4       [(sxtxstate, NN), (great, JJ), (stuff, NN), (o...
                              ...                        
9088    [(ipad, NN), (everywhere, RB), (sxsw, JJ), (li...
9089    [(wave, NN), (buzz, NN), (rt, NN), (mention, N...
9090    [(googles, NNS), (zeiger, RBR), (physician, JJ...
9091    [(some, DT), (verizon, NN), (iphone, NN), (cus...
9092    [(rt, JJ), (mention, NN), (google, NN), (tests...
Name: unprocessed_tweet, Length: 9093, dtype: object

### Lemmatize

In [23]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatized_text = tagged_text.apply(
    lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in x]
)

lemmatized_str = lemmatized_text.apply(lambda x: ' '.join(x))

lemmatized_str

0       wesley have iphone after hr tweeting at riseau...
1       jessedee know about fludapp awesome ipadiphone...
2       swonderlin can not wait for ipad also they sho...
3       sxsw hope this year festival isnt as crashy a ...
4       sxtxstate great stuff on fri sxsw marissa maye...
                              ...                        
9088                            ipad everywhere sxsw link
9089    wave buzz rt mention we interrupt your regular...
9090    google zeiger physician never report potential...
9091    some verizon iphone customer complain their ti...
9092    rt mention google test checkin offer at sxsw link
Name: unprocessed_tweet, Length: 9093, dtype: object

In [24]:
df["unprocessed_tweet"] = lemmatized_str
df.head()

Unnamed: 0,unprocessed_tweet,product,emotion
0,wesley have iphone after hr tweeting at riseau...,iPhone,Negative emotion
1,jessedee know about fludapp awesome ipadiphone...,iPad or iPhone App,Positive emotion
2,swonderlin can not wait for ipad also they sho...,iPad,Positive emotion
3,sxsw hope this year festival isnt as crashy a ...,iPad or iPhone App,Negative emotion
4,sxtxstate great stuff on fri sxsw marissa maye...,Google,Positive emotion


### Dropping NaN

In [25]:
df = df.dropna(subset=["unprocessed_tweet"])


In [26]:
df_multi_dropped = df[~(df["emotion"] == "I can't tell")].copy()
df_multi_dropped.shape

(8937, 3)

In [27]:
df_multi_dropped["emotion"].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [28]:
emotion_map = {
    "No emotion toward brand or product": 2,
    "Positive emotion": 1,
    "Negative emotion": 0,
}

df_multi_dropped["emotion_encoded"] = df_multi_dropped["emotion"].map(emotion_map)
df_multi_dropped["emotion_encoded"].value_counts()

2    5389
1    2978
0     570
Name: emotion_encoded, dtype: int64

In [29]:
# sw = stopwords.words("english")
# sw = [i.replace("'", '') for i in stop_words]

In [31]:
X = df_multi_dropped["unprocessed_tweet"]
y = df_multi_dropped["emotion_encoded"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6255,), (2682,), (6255,), (2682,))

## Pipelines

In [35]:
pipe_dt = Pipeline([("vec", TfidfVectorizer()), ("dt", DecisionTreeClassifier(random_state=42))])
pipe_rf = Pipeline([("vec", TfidfVectorizer()), ("rf", RandomForestClassifier(random_state=42))])
pipe_knn = Pipeline([("vec", TfidfVectorizer()), ("knn", KNeighborsClassifier())])

pipes = [pipe_dt, pipe_rf, pipe_knn]
names = ["dt", "rf", "knn"]

for pipeline in tqdm(pipes):
    pipeline.fit(X_train, y_train)

100%|██████████| 3/3 [00:09<00:00,  3.25s/it]


In [11]:
preds = {pipe: None for pipe in names}

for name, pipe in tqdm(zip(names, pipes)):
    preds[name] = pipe.predict(X_test)


3it [00:01,  1.95it/s]


In [12]:
for name in preds.keys():
    print(name.upper())
    print(classification_report(y_test, preds[name]))
    print()

DT
              precision    recall  f1-score   support

           0       0.32      0.18      0.23       189
           1       0.48      0.48      0.48       880
           2       0.68      0.72      0.70      1612

    accuracy                           0.60      2681
   macro avg       0.50      0.46      0.47      2681
weighted avg       0.59      0.60      0.59      2681


RF
              precision    recall  f1-score   support

           0       0.71      0.15      0.25       189
           1       0.61      0.39      0.47       880
           2       0.68      0.88      0.76      1612

    accuracy                           0.67      2681
   macro avg       0.67      0.47      0.50      2681
weighted avg       0.66      0.67      0.63      2681


KNN
              precision    recall  f1-score   support

           0       0.29      0.11      0.16       189
           1       0.53      0.38      0.44       880
           2       0.67      0.83      0.75      1612

    accu

## Multi-Class Pipeline GridSearch

### GridSearch on Decision Tree Model

In [None]:
pipe_dt = Pipeline([("vec", TfidfVectorizer(stop_words=sw)), ("dt", DecisionTreeClassifier(random_state=42))])

In [26]:
grid_dt = {'dt__max_depth': ['None', 2, 5, 10],
       'dt__min_samples_split': [2, 5],
       'vec__ngram_range': [(1,1), (1,2)],
        'vec__max_df': [.8, .9, .99],
        'vec__min_df': [.01, .05]}

In [27]:
gs = GridSearchCV(estimator=pipe_dt, param_grid=grid_dt, verbose=2)

In [28]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt

[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.99, vec__min_df=0.

[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.05, vec_

[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.99, vec__m

In [29]:
gs.best_params_

{'dt__max_depth': 5,
 'dt__min_samples_split': 2,
 'vec__max_df': 0.9,
 'vec__min_df': 0.01,
 'vec__ngram_range': (1, 2)}

In [30]:
gs.best_score_

0.634052757793765

**Messing around with different Hyperparameters**

In [53]:
pipe_dt2 = Pipeline([("vec", TfidfVectorizer(stop_words=sw, max_df=0.9, min_df=0.01, ngram_range= (1, 2))), 
                    ("dt", DecisionTreeClassifier(max_depth=5,min_samples_split=2, random_state=42))])

In [54]:
pipe_dt2.fit(X_train, y_train)

pipe_dt2.score(X_train, y_train)

0.640607513988809

### GridSearch on RF

In [None]:
#RandomForestClassifier(random_state=42)

In [71]:
grid_rf = {'rf__n_estimators': [10, 50, 100], 
           'rf__max_depth': [2, 5, 10],
       'rf__min_samples_split': [2,3,4],
       'vec__ngram_range': [(1,1), (1,2)],
        'vec__max_df': [.9, .99],
        'vec__min_df': [.01, .05]}

In [72]:
gs_rf = GridSearchCV(pipe_rf, param_grid=grid_rf, verbose=2)

In [73]:
gs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=2, rf__

[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.99, vec_

[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=2, rf__n_estimators=100, vec__max_

[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.9, vec__

[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=

[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99

[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=2, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.9

[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=10, vec__max_df=0.99

[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9

[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.9, vec__

[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50, vec__max_df=0.99

[CV] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.5s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec

[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.5s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec_

[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END rf__max_depth=5, rf__min_samples_split=4, rf__n_estimators=100, vec__max_

[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=50, vec__max_df=0.9

[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.7s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.7s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.9s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.8s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=2, rf__n_estimators=100, vec__m

[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=10, vec__max_

[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.7s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.7s
[CV] END rf__max_depth=10, rf__min_samples_split=3, rf__n_estimators=100, vec__max

[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.2s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.1s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=10, vec__max_d

[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.5s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.5s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.7s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.6s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   1.0s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   1.5s
[CV] END rf__max_depth=10, rf__min_samples_split=4, rf__n_estimators=50, vec__max_

In [74]:
gs_rf.best_params_

{'rf__max_depth': 10,
 'rf__min_samples_split': 3,
 'rf__n_estimators': 100,
 'vec__max_df': 0.9,
 'vec__min_df': 0.01,
 'vec__ngram_range': (1, 1)}

In [75]:
gs_rf.best_score_

0.641726618705036

### GridSearch on KNN

In [None]:
pipe_knn = Pipeline([("vec", TfidfVectorizer(stop_words=sw)), ("knn", KNeighborsClassifier())])

In [55]:
grid_knn = {'knn__n_neighbors': [2, 4, 6], 
           'knn__weights': ['uniform', 'distance'],
       'knn__p': [1,2],
       'vec__ngram_range': [(1,1), (1,2)],
        'vec__max_df': [.9, .99],
        'vec__min_df': [.01, .05]}

In [56]:
gs_knn = GridSearchCV(pipe_knn, param_grid=grid_rf, verbose=2)

In [57]:
gs_knn.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.4s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=   0.5s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, ve

[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=2, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END

[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   1.9s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   1.5s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   1.2s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.6s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.9s
[CV] END knn__n_neighbors=2, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.6s
[CV] END knn_

[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_n

[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=1, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END k

[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.3s
[CV] END knn__n_neighbors=4, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn

[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.5s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.9s
[CV] END knn__n_neighbors=6, knn__p=1, knn__weights=uniform, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.7s
[CV] END knn_

[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.3s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=uniform, vec__max_df=0.9, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neig

[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   0.2s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] END knn__n_neighbors=6, knn__p=2, knn__weights=distance, vec__max_df=0.99, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=   0.4s
[CV] E

In [58]:
gs.best_params_

{'knn__n_neighbors': 6,
 'knn__p': 2,
 'knn__weights': 'distance',
 'vec__max_df': 0.9,
 'vec__min_df': 0.01,
 'vec__ngram_range': (1, 2)}

In [59]:
gs.best_score_

0.6108713029576338