### Grid search

In [1]:
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import ParameterGrid
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
import re

def preprocess_tweets(str_):
    """Strip Twitter @mentions and t.co short links from a tweet.

    Parameters
    ----------
    str_ : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every ``@mention`` and every ``http(s)://t.co/...``
        link removed. Surrounding whitespace is left untouched, so a removed
        token may leave two consecutive spaces behind.
    """
    # A mention is '@' followed by any run of non-whitespace characters.
    mentions = r'(?:@\S+)'
    # The dot in "t.co" is escaped so that only the literal host matches;
    # unescaped it would accept any character there (e.g. "tXco").
    urls = r'(?:https?://t\.co/\w+)'
    # Mentions are removed first, then links, matching the original order.
    str_ = re.sub(mentions, '', str_)
    return re.sub(urls, '', str_)

# Search space for the 'vect' branch of the feature union: binary term
# presence over character n-grams ("char" and "char_wb" analyzers).
# NOTE(review): scikit-learn documents `tokenizer` as applying only when
# analyzer == 'word'; confirm whether the tokenizer entry has any effect here.
vect_grid = {
    'feats__vect__binary': [True],
    'feats__vect__ngram_range': [(1, 5), (1, 6), (1, 7)],
    'feats__vect__min_df': [1, 3, 5, 7],
    'feats__vect__max_df': [0.95, 0.9, 0.7],
    'feats__vect__analyzer': ["char", "char_wb"],
    'feats__vect__tokenizer': [word_tokenize],
    'feats__vect__preprocessor': [preprocess_tweets],
}

# Search space for the 'vect1' branch: only the n-gram range varies.
vect1_grid = {
    'feats__vect1__tokenizer': [word_tokenize],
    'feats__vect1__preprocessor': [preprocess_tweets],
    'feats__vect1__ngram_range': [(1, 4), (1, 5)],
}

# Classifier settings: a fixed seed for every run.
clf_grid = {
    'clf__random_state': [0],
}

# Merge the per-component grids, keeping the keys in their original order.
param_grid = {**vect_grid, **vect1_grid, **clf_grid}

# Expand into the explicit list of settings to try:
# 3 * 4 * 3 * 2 * 2 = 144 combinations.
params_list = list(ParameterGrid(param_grid))

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from notebooks.sentiment.util import eval
from sentiment.new_data import InterTASSAugmented
from sentiment.tass import InterTASSReader
from tqdm import tqdm

# Two independent TF-IDF vectorizers whose outputs are concatenated by the
# FeatureUnion; their hyper-parameters are filled in per grid point below.
feature_union = FeatureUnion([
    ('vect', TfidfVectorizer()),
    ('vect1', TfidfVectorizer()),
])
pipeline = Pipeline([
    ('feats', feature_union),
    ('clf', LogisticRegression()),
])

# Training data comes from the augmented InterTASS reader.
reader = InterTASSAugmented(ratio=0.9)
X_train, y_train = reader.Xy()
# Alternative (disabled): train on the plain tagged training corpus instead.
# train = "intertass-ES-train-tagged.xml"
# train = InterTASSReader(train)
# X_train, y_train = list(train.X()), list(train.y())

# Development split used to score every parameter combination.
corpus = "../intertass-ES-development-tagged.xml"
dev = InterTASSReader(corpus)
X_dev = list(dev.X())
y_dev = list(dev.y())

# One record per grid point: the evaluation output merged with the
# parameters that produced it.
results = []
for params in tqdm(params_list):
    # set_params returns the pipeline itself, so reconfiguring and fitting
    # can be chained; the same pipeline object is reused on every iteration.
    pipeline.set_params(**params).fit(X_train, y_train)
    # `eval` is the project helper imported from notebooks.sentiment.util
    # (it shadows the builtin of the same name in this notebook).
    result = eval(pipeline, X_dev, y_dev)
    # On a key clash the parameter value wins, as params is unpacked last.
    results.append({**result, **params})

100%|██████████| 144/144 [06:10<00:00,  2.20s/it]


In [4]:
import pandas as pd

# One row per evaluated parameter combination.
results_df = pd.DataFrame(results)

# Show the ten best runs, ranked by accuracy with F1 as the tie-breaker.
ranked = results_df.sort_values(by=['acc', 'f1'], ascending=False)
ranked.head(10)

Unnamed: 0,acc,clf__random_state,f1,feats__vect1__ngram_range,feats__vect1__preprocessor,feats__vect1__tokenizer,feats__vect__analyzer,feats__vect__binary,feats__vect__max_df,feats__vect__min_df,feats__vect__ngram_range,feats__vect__preprocessor,feats__vect__tokenizer
136,0.610672,0,0.405861,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.7,3,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
43,0.608696,0,0.404821,"(1, 4)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.95,5,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
55,0.608696,0,0.404821,"(1, 4)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.9,5,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
67,0.608696,0,0.404821,"(1, 4)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.7,5,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
112,0.608696,0,0.404724,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.95,3,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
124,0.608696,0,0.404724,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.9,3,"(1, 6)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
116,0.608696,0,0.404475,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.95,5,"(1, 7)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
128,0.608696,0,0.404475,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.9,5,"(1, 7)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
113,0.608696,0,0.399355,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.95,3,"(1, 7)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
125,0.608696,0,0.399355,"(1, 5)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>,char_wb,True,0.9,3,"(1, 7)",<function preprocess_tweets at 0x7f67981ed950>,<function word_tokenize at 0x7f675d812d08>
