In [3]:
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import ParameterGrid
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from notebooks.sentiment.util import eval
from sentiment.new_data import InterTASSAugmented
from sentiment.tass import InterTASSReader
from tqdm import tqdm


def preprocess_tweets(str_):
    mentions = r'(?:@[^\s]+)'
    urls = r'(?:https?\://t.co/[\w]+)'
    str_ = re.sub(mentions, '', str_)
    return re.sub(urls, '', str_)

param_grid = {
    'clf__random_state': [0],
    'clf__penalty': ('l2', 'l2'),
    'clf__C': (0.5, 1, 2, 3, 4),
}

vect = TfidfVectorizer(tokenizer=word_tokenize,
                       binary=True,
                       analyzer="char_wb",
                       ngram_range=(1, 6),
                       min_df=5,
                       max_df=0.95,
                       preprocessor=preprocess_tweets,
                       stop_words=stopwords.words("spanish"))

vect2 = TfidfVectorizer(tokenizer=word_tokenize,
                        binary=True,
                        analyzer="word",
                        ngram_range=(1, 5),
                        preprocessor=preprocess_tweets,
                        stop_words=stopwords.words("spanish"))

params_list = list(ParameterGrid(param_grid))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
pipeline = Pipeline([
            ('feats', FeatureUnion([
                    ('vect', vect),  # can pass in either a pipeline
                    ('vect1', vect2),  # or a transformer
                ])),
            ('clf', LinearSVC()),
        ])
reader = InterTASSAugmented(ratio=0.9)  # Class to use augmented data
X_train, y_train = reader.Xy()
# train = "intertass-ES-train-tagged.xml"
# train = InterTASSReader(train)
# X_train, y_train = list(train.X()), list(train.y())


corpus = "../intertass-ES-development-tagged.xml"
dev = InterTASSReader(corpus)
X_dev, y_dev = list(dev.X()), list(dev.y())
results = []
for params in tqdm(params_list):
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    
    results.append({
        **result,
        **params,
    })

100%|██████████| 10/10 [00:20<00:00,  2.26s/it]


In [5]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__penalty,clf__random_state,f1
0,0.573123,0.5,l2,0,0.42919
1,0.573123,0.5,l2,0,0.42919
4,0.56917,2.0,l2,0,0.429493
5,0.56917,2.0,l2,0,0.429493
2,0.56917,1.0,l2,0,0.426313
3,0.56917,1.0,l2,0,0.426313
6,0.561265,3.0,l2,0,0.424937
7,0.561265,3.0,l2,0,0.424937
8,0.557312,4.0,l2,0,0.422539
9,0.557312,4.0,l2,0,0.422539
