## Ejercicio 3 - Grid Search

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk import word_tokenize
from nltk.corpus import stopwords

from sentiment.tass import InterTASSReader
from sentiment.baselines import MostFrequent
from sentiment.classifier import SentimentClassifier
from sentiment.analysis import print_maxent_features
import pandas as pd

In [3]:
# Loading Training Set
corpus_train = "./InterTASS/ES/intertass-ES-train-tagged.xml"
reader_train = InterTASSReader(corpus_train)
X_train, y_train = list(reader_train.X()), list(reader_train.y())

In [4]:
# Loading Dev Set
corpus_dev = "./InterTASS/ES/intertass-ES-development-tagged.xml"
reader_dev = InterTASSReader(corpus_dev)
X_dev, y_dev = list(reader_dev.X()), list(reader_dev.y())

In [5]:
def eval(model, X, y_true):
    y_pred = model.predict(X)
    acc = metrics.accuracy_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred, average='macro')
    return {'acc': acc, 'f1': f1}

#### Logistic Regression

In [6]:
clf = LogisticRegression()

pipeline_lr = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [7]:
param_grid = {
    'clf__penalty': ('l1','l2'),
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
}

params_list = list(ParameterGrid(param_grid))

In [8]:
results = []
for params in params_list:
    pipeline_lr.set_params(**params)
    pipeline_lr.fit(X_train, y_train)
    result = eval(pipeline_lr, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })


  'precision', 'predicted', average, warn_for)


In [9]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__penalty,f1
5,0.559289,0.1,l2,0.350232
7,0.557312,1.0,l2,0.391317
9,0.537549,10.0,l2,0.384407
6,0.535573,1.0,l1,0.376214
3,0.529644,0.01,l2,0.287032
4,0.527668,0.1,l1,0.293593
8,0.521739,10.0,l1,0.383559
1,0.476285,0.001,l2,0.223608
2,0.436759,0.01,l1,0.157781
0,0.432806,0.001,l1,0.151034


#### SVM

In [10]:
clf = LinearSVC()

pipeline_svm = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [11]:
param_grid = {
    'clf__penalty': ['l1','l2'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__dual': [False] 
}

params_list = list(ParameterGrid(param_grid))

In [12]:
results = []
for params in params_list:
    pipeline_svm.set_params(**params)
    pipeline_svm.fit(X_train, y_train)
    result = eval(pipeline_svm, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })



In [13]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
3,0.567194,0.01,False,l2,0.34859
5,0.561265,0.1,False,l2,0.409613
4,0.545455,0.1,False,l1,0.329601
7,0.531621,1.0,False,l2,0.400324
1,0.527668,0.001,False,l2,0.284634
8,0.525692,10.0,False,l1,0.392753
9,0.51581,10.0,False,l2,0.397132
2,0.511858,0.01,False,l1,0.266006
6,0.509881,1.0,False,l1,0.372984
0,0.432806,0.001,False,l1,0.151034


#### MultinomialNB

In [15]:
clf = MultinomialNB()

pipeline_nb = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [16]:
param_grid = {
    'clf__alpha': [1, 0.1, 0.01, 0.0001] 
}

params_list = list(ParameterGrid(param_grid))

In [17]:
results = []
for params in params_list:
    pipeline_nb.set_params(**params)
    pipeline_nb.fit(X_train, y_train)
    result = eval(pipeline_nb, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })

  'precision', 'predicted', average, warn_for)


In [19]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__alpha,f1
0,0.565217,1.0,0.316191
1,0.549407,0.1,0.399606
2,0.523715,0.01,0.393394
3,0.507905,0.0001,0.374724


## Ejercicio 4 - Inspección de Modelos

In [20]:
pipeline_lr.steps

[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=<function word_tokenize at 0x7f78d1b14d90>,
          vocabulary=None)),
 ('clf',
  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))]

In [21]:
vect = pipeline_lr.named_steps['vect']
clf = pipeline_lr.named_steps['clf']

In [23]:
features = vect.get_feature_names()
len(features)

5049

In [24]:
print_maxent_features(vect, clf, 10)

N:
	bonito buen guapa encuentre 11:11 irresponsable buena genial algunos voy ([-2.86934595 -2.08702273 -2.08389864 -1.91690854 -1.91690854 -1.85684378
 -1.74317015 -1.69395729 -1.66076886 -1.65224562])
	poco sola cosa pobre mismo odio ni feo peor triste ([1.91680946 1.93080402 2.08830209 2.10199825 2.11868748 2.17602953
 2.48799712 2.5166921  2.71250342 3.05123425])
NEU:
	gracias su peor hoy triste ana feo sola ? cosas ([-1.65716948 -1.65133671 -1.6333469  -1.43883509 -1.41755725 -1.32158237
 -1.30704899 -1.26248005 -1.26182554 -1.20775355])
	pelado slammactivao encuentre 11:11 imdariusb1tches ineternete crtkftauryn plan nerviosa viejas ([1.86288855 1.86288855 2.04737787 2.04737787 2.11642018 2.11735041
 2.16748105 2.19855978 2.75534623 3.1164695 ])
NONE:
	mal buen ser nada serio feliz están más siempre sin ([-1.71397182 -1.46810325 -1.35430631 -1.34377599 -1.26008118 -1.24795138
 -1.22523022 -1.20268792 -1.20204557 -1.18798703])
	fecha ichuso empezado indirecta abstracto caspitoo sema