### Grid search

In [1]:
from sklearn.model_selection import ParameterGrid
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
import re

def preprocess_tweets(str_):
    """Strip user mentions and t.co short links from a tweet.

    Args:
        str_: Raw tweet text.

    Returns:
        The text with every ``@mention`` and every ``http(s)://t.co/...``
        link removed. Whitespace around the removed tokens is left as is.
    """
    # A mention is '@' followed by any run of non-whitespace characters.
    mentions = r'(?:@\S+)'
    # The dot is escaped so that only the literal host "t.co" matches;
    # an unescaped '.' would also accept hosts such as "tXco".
    urls = r'(?:https?://t\.co/\w+)'
    # Mentions are removed first, then links, on the already-cleaned text.
    str_ = re.sub(mentions, '', str_)
    return re.sub(urls, '', str_)

# Grid entries for the 'vect' step of the feature union.
# NOTE(review): both analyzers searched here are character-based, and
# scikit-learn documents `tokenizer` as applying only when analyzer == 'word';
# confirm whether the tokenizer entry has any effect for this step.
vect_grid = {
    'feats__vect__binary': [True],
    'feats__vect__ngram_range': [(1, 4), (1, 5), (1, 6)],
    'feats__vect__min_df': [1, 3, 5, 7],
    'feats__vect__max_df': [0.95, 0.9, 0.7],
    'feats__vect__analyzer': ["char", "char_wb"],
    'feats__vect__tokenizer': [word_tokenize],
    'feats__vect__preprocessor': [preprocess_tweets],
}

# Grid entries for the 'vect1' step of the feature union.
vect1_grid = {
    'feats__vect1__tokenizer': [word_tokenize],
    'feats__vect1__preprocessor': [preprocess_tweets],
    'feats__vect1__ngram_range': [(1, 4), (1, 5)],
}

# Full search space: both vectorizer grids plus a fixed classifier seed.
param_grid = {
    **vect_grid,
    **vect1_grid,
    'clf__random_state': [0],
}

# Materialise every combination up front so the search loop can iterate
# over a plain list (and report its length).
params_list = list(ParameterGrid(param_grid))

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from notebooks.sentiment.util import eval
from sentiment.new_data import InterTASSAugmented
from sentiment.tass import InterTASSReader
from tqdm import tqdm

# Two TF-IDF vectorizers run side by side; their outputs are concatenated
# by the FeatureUnion and fed to a logistic-regression classifier.
# Both start with default settings; the grid search overrides them per run.
feature_union = FeatureUnion([
    ('vect', TfidfVectorizer()),
    ('vect1', TfidfVectorizer()),
])
pipeline = Pipeline(steps=[
    ('feats', feature_union),
    ('clf', LogisticRegression()),
])

# Training data comes from the augmented InterTASS reader.
reader = InterTASSAugmented()
X_train, y_train = reader.Xy()

# Development split used to score every parameter combination.
corpus = "development.xml"
dev = InterTASSReader(corpus)
X_dev = list(dev.X())
y_dev = list(dev.y())

# Manual grid search: reconfigure the same pipeline, refit on the training
# data, and score on the development data for each combination.
# `eval` is the project helper imported from notebooks.sentiment.util (it
# shadows the builtin); its result is merged into the row as a mapping.
results = []
for params in tqdm(params_list):
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)

    # One record per run: the scores followed by the parameters that produced them.
    # Appending inside the loop keeps partial results if the run is interrupted.
    results.append({**result, **params})

100%|██████████| 144/144 [15:29<00:00,  6.12s/it]


In [4]:
import pandas as pd

# One row per evaluated parameter combination.
results_df = pd.DataFrame(results)

# Show the ten best runs: ordered by the `acc` column, ties broken by `f1`,
# both descending.
ranked_df = results_df.sort_values(by=['acc', 'f1'], ascending=False)
ranked_df.head(10)

Unnamed: 0,acc,clf__random_state,f1,feats__vect1__ngram_range,feats__vect1__preprocessor,feats__vect1__tokenizer,feats__vect__analyzer,feats__vect__binary,feats__vect__max_df,feats__vect__min_df,feats__vect__ngram_range,feats__vect__preprocessor,feats__vect__tokenizer
116,0.509188,0,0.413722,"(1, 5)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.95,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
44,0.508423,0,0.413182,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.95,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
128,0.508423,0,0.412784,"(1, 5)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.9,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
56,0.507657,0,0.412304,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.9,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
140,0.507657,0,0.411903,"(1, 5)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.7,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
68,0.506891,0,0.411424,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.7,5,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
65,0.506126,0,0.410133,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.7,3,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
70,0.506126,0,0.409914,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.7,7,"(1, 5)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
50,0.506126,0,0.409377,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.9,1,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
71,0.50536,0,0.409904,"(1, 4)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>,char_wb,True,0.7,7,"(1, 6)",<function preprocess_tweets at 0x7fb8d8523840>,<function word_tokenize at 0x7fb8a15fb9d8>
