In [59]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from sentiment.tass import InterTASSReader
from sentiment.baselines import MostFrequent
from sentiment.classifier import SentimentClassifier

In [22]:
# Training set: read the tagged InterTASS (ES) file and materialise
# the inputs and labels as lists.
corpus_train = "./InterTASS/ES/intertass-ES-train-tagged.xml"
reader_train = InterTASSReader(corpus_train)
X_train = list(reader_train.X())
y_train = list(reader_train.y())

In [23]:
# Development set: read the tagged InterTASS (ES) file and materialise
# the inputs and labels as lists.
corpus_dev = "./InterTASS/ES/intertass-ES-development-tagged.xml"
reader_dev = InterTASSReader(corpus_dev)
X_dev = list(reader_dev.X())
y_dev = list(reader_dev.y())

In [60]:
def eval(model, X, y_true):
    """Score a fitted model against gold labels.

    NOTE: this name shadows Python's builtin ``eval`` for the rest of the
    notebook; it is kept because later cells call it by this name.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : inputs forwarded unchanged to ``model.predict``.
    y_true : gold labels compared against the predictions.

    Returns
    -------
    dict
        ``{'acc': <accuracy>, 'f1': <macro-averaged F1>}``.
    """
    predictions = model.predict(X)
    return {
        'acc': metrics.accuracy_score(y_true, predictions),
        'f1': metrics.f1_score(y_true, predictions, average='macro'),
    }

#### Logistic Regression

In [46]:
# The solver is pinned explicitly: the grid searched for this model tries both
# 'l1' and 'l2' penalties, and 'liblinear' supports both. The default solver in
# scikit-learn >= 0.22 ('lbfgs') rejects penalty='l1', so relying on the
# default makes the grid loop fail on a fresh environment.
clf = LogisticRegression(solver='liblinear')

# Bag-of-words counts (tokenised with NLTK's word_tokenize) feeding the
# classifier; the step names 'vect' / 'clf' are the prefixes used in the
# parameter grid.
pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [40]:
# Search space for the 'clf' step: penalty type crossed with the
# inverse regularisation strength C.
param_grid = {
    'clf__penalty': ('l1','l2'),
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
}

# Expand the grid into one dict per parameter combination.
params_list = [combo for combo in ParameterGrid(param_grid)]

In [61]:
# Refit the pipeline for every parameter combination and score it on the
# development set; each row holds the scores followed by the parameters used.
results = []
for params in params_list:
    # set_params returns the pipeline itself, so the fit can be chained.
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    results.append({**result, **params})


  'precision', 'predicted', average, warn_for)


In [64]:
# Tabulate the grid-search rows and show them best-first: sorted by accuracy,
# with macro-F1 as the tie-breaker. (pandas is imported in the notebook's
# top import cell rather than here, so later sections do not depend on this
# cell having been run.)
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__penalty,f1
5,0.559289,0.1,l2,0.350232
7,0.557312,1.0,l2,0.391317
9,0.537549,10.0,l2,0.384407
6,0.537549,1.0,l1,0.377131
3,0.529644,0.01,l2,0.287032
4,0.527668,0.1,l1,0.293593
8,0.521739,10.0,l1,0.376035
1,0.476285,0.001,l2,0.223608
2,0.436759,0.01,l1,0.157781
0,0.432806,0.001,l1,0.151034


#### SVM

In [66]:
# Linear SVM on bag-of-words counts (tokenised with NLTK's word_tokenize).
clf = LinearSVC()

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=word_tokenize)),
    ('clf', clf),
])

In [69]:
# Search space for the 'clf' step: penalty type crossed with C, with
# dual fixed to False for every combination.
param_grid = {
    'clf__penalty': ['l1','l2'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__dual': [False],
}

# Expand the grid into one dict per parameter combination.
params_list = [combo for combo in ParameterGrid(param_grid)]

In [70]:
# Refit the pipeline for every parameter combination and score it on the
# development set; each row holds the scores followed by the parameters used.
results = []
for params in params_list:
    # set_params returns the pipeline itself, so the fit can be chained.
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    results.append({**result, **params})

  'precision', 'predicted', average, warn_for)


In [71]:
# Tabulate the grid-search rows, best accuracy first (macro-F1 breaks ties).
results_df = pd.DataFrame(data=results)
results_df.sort_values(by=['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
3,0.567194,0.01,False,l2,0.34859
5,0.561265,0.1,False,l2,0.409613
4,0.547431,0.1,False,l1,0.330368
7,0.531621,1.0,False,l2,0.400324
1,0.527668,0.001,False,l2,0.284634
6,0.519763,1.0,False,l1,0.382071
8,0.519763,10.0,False,l1,0.376982
9,0.51581,10.0,False,l2,0.397132
2,0.511858,0.01,False,l1,0.266006
0,0.432806,0.001,False,l1,0.151034


#### MultinomialNB

In [72]:
# Multinomial Naive Bayes on bag-of-words counts (tokenised with NLTK's
# word_tokenize).
clf = MultinomialNB()

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=word_tokenize)),
    ('clf', clf),
])

In [75]:
# Search space for the 'clf' step: candidate values for alpha.
param_grid = {
    'clf__alpha': [1, 0.1, 0.01, 0.0001],
}

# Expand the grid into one dict per parameter combination.
params_list = [combo for combo in ParameterGrid(param_grid)]

In [76]:
# Refit the pipeline for every parameter combination and score it on the
# development set; each row holds the scores followed by the parameters used.
results = []
for params in params_list:
    # set_params returns the pipeline itself, so the fit can be chained.
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)
    results.append({**result, **params})

  'precision', 'predicted', average, warn_for)


In [77]:
# Tabulate the grid-search rows, best accuracy first (macro-F1 breaks ties).
results_df = pd.DataFrame(data=results)
results_df.sort_values(by=['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__alpha,f1
0,0.565217,1.0,0.316191
1,0.549407,0.1,0.399606
2,0.523715,0.01,0.393394
3,0.507905,0.0001,0.374724
