### Grid search

In [1]:
from sklearn.model_selection import ParameterGrid
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
import re

def preprocess_tweets(str_):
    mentions = r'(?:@[^\s]+)'
    urls = r'(?:https?\://t.co/[\w]+)'
    str_ = re.sub(mentions, '', str_)
    return re.sub(urls, '', str_)

param_grid = {
    'feats__vect__binary': [True],
    'feats__vect__ngram_range': [(1, 4), (1, 5), (1, 6)],
    'feats__vect__min_df': [1, 3, 5, 7],
    'feats__vect__max_df': [0.95, 0.9, 0.7],
    'feats__vect__analyzer': ["char", "char_wb"],
    'feats__vect__tokenizer': [word_tokenize],
    'feats__vect__preprocessor': [preprocess_tweets],
    'feats__vect1__tokenizer': [word_tokenize],
    'feats__vect1__preprocessor': [preprocess_tweets],
    'feats__vect1__ngram_range': [(1, 4), (1, 5)],
    'clf__random_state': [0],
}


params_list = list(ParameterGrid(param_grid))

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from notebooks.sentiment.util import eval
from sentiment.new_data import InterTASSAugmented
from sentiment.tass import InterTASSReader
from tqdm import tqdm

pipeline = Pipeline([
            ('feats', FeatureUnion([
                    ('vect', TfidfVectorizer()),  # can pass in either a pipeline
                    ('vect1', TfidfVectorizer()),  # or a transformer
                ])),
            ('clf', LogisticRegression()),
        ])
# reader = InterTASSAugmented(ratio=0.9)  # Class to use augmented data
# X_train, y_train = reader.Xy()
train = "intertass-ES-train-tagged.xml"
train = InterTASSReader(train)
X_train, y_train = list(train.X()), list(train.y())


corpus = "intertass-ES-development-tagged.xml"
dev = InterTASSReader(corpus)
X_dev, y_dev = list(dev.X()), list(dev.y())
results = []
for params in tqdm(params_list):
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    
    results.append({
        **result,
        **params,
    })

  'precision', 'predicted', average, warn_for)
100%|██████████| 144/144 [02:08<00:00,  1.22it/s]


In [3]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__random_state,f1,feats__vect1__ngram_range,feats__vect1__preprocessor,feats__vect1__tokenizer,feats__vect__analyzer,feats__vect__binary,feats__vect__max_df,feats__vect__min_df,feats__vect__ngram_range,feats__vect__preprocessor,feats__vect__tokenizer
41,0.594862,0,0.344926,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.95,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
53,0.594862,0,0.344926,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.9,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
65,0.594862,0,0.344926,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.7,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
113,0.592885,0,0.343509,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.95,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
125,0.592885,0,0.343509,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.9,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
137,0.592885,0,0.343509,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.7,3,"(1, 6)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
43,0.590909,0,0.342425,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.95,5,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
55,0.590909,0,0.342425,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.9,5,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
67,0.590909,0,0.342425,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.7,5,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
64,0.590909,0,0.342085,"(1, 4)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>,char_wb,True,0.7,3,"(1, 5)",<function preprocess_tweets at 0x7fefad2d3488>,<function word_tokenize at 0x7fef76a7cb70>
