In [11]:
from sklearn.model_selection import ParameterGrid
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from notebooks.sentiment.util import eval
from sentiment.new_data import InterTASSAugmented
from sentiment.tass import InterTASSReader
from tqdm import tqdm


def preprocess_tweets(str_):
    mentions = r'(?:@[^\s]+)'
    urls = r'(?:https?\://t.co/[\w]+)'
    str_ = re.sub(mentions, '', str_)
    return re.sub(urls, '', str_)

param_grid = {
    'clf__random_state': [0],
    'clf__penalty': ('l2', 'l2'),
    'clf__C': (0.5, 1, 2, 3, 4),
}

vect = TfidfVectorizer(tokenizer=word_tokenize,
                       binary=True,
                       analyzer="char_wb",
                       ngram_range=(1, 6),
                       min_df=5,
                       max_df=0.95,
                       preprocessor=preprocess_tweets,
                       stop_words=stopwords.words("spanish"))

vect2 = TfidfVectorizer(tokenizer=word_tokenize,
                        binary=True,
                        analyzer="word",
                        ngram_range=(1, 5),
                        preprocessor=preprocess_tweets,
                        stop_words=stopwords.words("spanish"))

params_list = list(ParameterGrid(param_grid))

In [12]:
pipeline = Pipeline([
            ('feats', FeatureUnion([
                    ('vect', vect),  # can pass in either a pipeline
                    ('vect1', vect2),  # or a transformer
                ])),
            ('clf', LinearSVC()),
        ])
reader = InterTASSAugmented()  # Class to use augmented data
X_train, y_train = reader.Xy()

corpus = "development.xml"
dev = InterTASSReader(corpus)
X_dev, y_dev = list(dev.X()), list(dev.y())

results = []
for params in tqdm(params_list):
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    
    results.append({
        **result,
        **params,
    })


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:04<00:44,  4.94s/it][A
 20%|██        | 2/10 [00:09<00:39,  4.93s/it][A
 30%|███       | 3/10 [00:15<00:35,  5.07s/it][A
 40%|████      | 4/10 [00:20<00:31,  5.21s/it][A
 50%|█████     | 5/10 [00:26<00:27,  5.51s/it][A
 60%|██████    | 6/10 [00:33<00:22,  5.75s/it][A
 70%|███████   | 7/10 [00:40<00:18,  6.14s/it][A
 80%|████████  | 8/10 [00:48<00:13,  6.81s/it][A
 90%|█████████ | 9/10 [00:58<00:07,  7.63s/it][A
100%|██████████| 10/10 [01:07<00:00,  8.14s/it][A

In [13]:
import pandas as pd

results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__penalty,clf__random_state,f1
4,0.498469,2.0,l2,0,0.432395
5,0.498469,2.0,l2,0,0.432395
6,0.496937,3.0,l2,0,0.432447
7,0.496937,3.0,l2,0,0.432447
0,0.496172,0.5,l2,0,0.421318
1,0.496172,0.5,l2,0,0.421318
8,0.495406,4.0,l2,0,0.431079
9,0.495406,4.0,l2,0,0.431079
2,0.493874,1.0,l2,0,0.424479
3,0.493874,1.0,l2,0,0.424479
