In [103]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk import word_tokenize
from nltk.corpus import stopwords

from sentiment.tass import InterTASSReader
from sentiment.baselines import MostFrequent
from sentiment.classifier import SentimentClassifier
from sentiment.analysis import print_maxent_features, print_feature_weights_for_item
import pandas as pd

## Ejercicio 2 -Mejoras al Clasificador Básico de Polaridad

#### Resultados

In [124]:
df = pd.read_csv("./results.csv")

In [126]:
df = df.round(2)

In [127]:
df[["Corpus","Classifier","Comment","Accuracy","M-Precision","M-Recall","Macro-F1"]]

Unnamed: 0,Corpus,Classifier,Comment,Accuracy,M-Precision,M-Recall,Macro-F1
0,ES,basemf_base_es,baseline most frequent,43.28,85.82,25.0,38.72
1,CR,basemf_base_cr,baseline most frequent,36.67,84.17,25.0,38.55
2,PE,basemf_base_pe,baseline most frequent,47.6,86.9,25.0,38.83
3,ES,maxent_base_es,baseline maxent,53.16,37.13,37.34,37.23
4,CR,maxent_base_cr,baseline maxent,46.67,40.01,37.86,38.91
5,PE,maxent_base_pe,baseline maxent,39.2,33.04,34.43,33.72
6,ES,maxent_nltktok_es,maxent with nltk tokenizer,55.73,42.09,40.14,41.09
7,CR,maxent_nltktok_cr,maxent with nltk tokenizer,52.67,43.22,43.01,43.11
8,PE,maxent_nltktok_pe,maxent with nltk tokenizer,40.6,34.7,36.12,35.4
9,ES,maxent_binary_es,maxent binary count,51.98,36.82,36.43,36.63


## Ejercicio 3 - Grid Search

In [3]:
# Loading Training Set
corpus_train = "./InterTASS/ES/intertass-ES-train-tagged.xml"
reader_train = InterTASSReader(corpus_train)
X_train, y_train = list(reader_train.X()), list(reader_train.y())

In [4]:
# Loading Dev Set
corpus_dev = "./InterTASS/ES/intertass-ES-development-tagged.xml"
reader_dev = InterTASSReader(corpus_dev)
X_dev, y_dev = list(reader_dev.X()), list(reader_dev.y())

In [5]:
def eval(model, X, y_true):
    y_pred = model.predict(X)
    acc = metrics.accuracy_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred, average='macro')
    return {'acc': acc, 'f1': f1}

#### Logistic Regression

In [6]:
clf = LogisticRegression()

pipeline_lr = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [7]:
param_grid = {
    'clf__penalty': ('l1','l2'),
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
}

params_list = list(ParameterGrid(param_grid))

In [8]:
results = []
for params in params_list:
    pipeline_lr.set_params(**params)
    pipeline_lr.fit(X_train, y_train)
    result = eval(pipeline_lr, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })


  'precision', 'predicted', average, warn_for)


In [9]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__penalty,f1
5,0.559289,0.1,l2,0.350232
7,0.557312,1.0,l2,0.391317
9,0.537549,10.0,l2,0.384407
6,0.535573,1.0,l1,0.376214
3,0.529644,0.01,l2,0.287032
4,0.527668,0.1,l1,0.293593
8,0.521739,10.0,l1,0.383559
1,0.476285,0.001,l2,0.223608
2,0.436759,0.01,l1,0.157781
0,0.432806,0.001,l1,0.151034


#### SVM

In [10]:
clf = LinearSVC()

pipeline_svm = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [11]:
param_grid = {
    'clf__penalty': ['l1','l2'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__dual': [False] 
}

params_list = list(ParameterGrid(param_grid))

In [12]:
results = []
for params in params_list:
    pipeline_svm.set_params(**params)
    pipeline_svm.fit(X_train, y_train)
    result = eval(pipeline_svm, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })



In [13]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__dual,clf__penalty,f1
3,0.567194,0.01,False,l2,0.34859
5,0.561265,0.1,False,l2,0.409613
4,0.545455,0.1,False,l1,0.329601
7,0.531621,1.0,False,l2,0.400324
1,0.527668,0.001,False,l2,0.284634
8,0.525692,10.0,False,l1,0.392753
9,0.51581,10.0,False,l2,0.397132
2,0.511858,0.01,False,l1,0.266006
6,0.509881,1.0,False,l1,0.372984
0,0.432806,0.001,False,l1,0.151034


#### MultinomialNB

In [15]:
clf = MultinomialNB()

pipeline_nb = Pipeline([
            ('vect', CountVectorizer(tokenizer=word_tokenize)),
            ('clf', clf),
        ])

In [16]:
param_grid = {
    'clf__alpha': [1, 0.1, 0.01, 0.0001] 
}

params_list = list(ParameterGrid(param_grid))

In [17]:
results = []
for params in params_list:
    pipeline_nb.set_params(**params)
    pipeline_nb.fit(X_train, y_train)
    result = eval(pipeline_nb, X_dev, y_dev)

    results.append({
        **result,
        **params,
    })

  'precision', 'predicted', average, warn_for)


In [19]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__alpha,f1
0,0.565217,1.0,0.316191
1,0.549407,0.1,0.399606
2,0.523715,0.01,0.393394
3,0.507905,0.0001,0.374724


## Ejercicio 4 - Inspección de Modelos

In [20]:
pipeline_lr.steps

[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=<function word_tokenize at 0x7f78d1b14d90>,
          vocabulary=None)),
 ('clf',
  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))]

In [21]:
vect = pipeline_lr.named_steps['vect']
clf = pipeline_lr.named_steps['clf']

In [23]:
features = vect.get_feature_names()
len(features)

5049

In [24]:
print_maxent_features(vect, clf, 10)

N:
	bonito buen guapa encuentre 11:11 irresponsable buena genial algunos voy ([-2.86934595 -2.08702273 -2.08389864 -1.91690854 -1.91690854 -1.85684378
 -1.74317015 -1.69395729 -1.66076886 -1.65224562])
	poco sola cosa pobre mismo odio ni feo peor triste ([1.91680946 1.93080402 2.08830209 2.10199825 2.11868748 2.17602953
 2.48799712 2.5166921  2.71250342 3.05123425])
NEU:
	gracias su peor hoy triste ana feo sola ? cosas ([-1.65716948 -1.65133671 -1.6333469  -1.43883509 -1.41755725 -1.32158237
 -1.30704899 -1.26248005 -1.26182554 -1.20775355])
	pelado slammactivao encuentre 11:11 imdariusb1tches ineternete crtkftauryn plan nerviosa viejas ([1.86288855 1.86288855 2.04737787 2.04737787 2.11642018 2.11735041
 2.16748105 2.19855978 2.75534623 3.1164695 ])
NONE:
	mal buen ser nada serio feliz están más siempre sin ([-1.71397182 -1.46810325 -1.35430631 -1.34377599 -1.26008118 -1.24795138
 -1.22523022 -1.20268792 -1.20204557 -1.18798703])
	fecha ichuso empezado indirecta abstracto caspitoo sema

## Ejercicio 5 - Análisis de Error

In [49]:
y_pred = pipeline_lr.predict(X_dev)

In [50]:
# Calculamos probabilidades 
y_proba = pipeline_lr.predict_proba(X_dev)

In [101]:
# Creamos un dataframe con los tweets mal clasificados, ordenados por los que tuvieron probabilidad más alta sobre 
# clase errónea (aquellos sobre los que el clasificador tenía más seguridad en su predicción)
errors = []
probs = {"N":0,"NEU":1,"NONE":2,"P":3}

for i,(x, y1, y2, proba) in enumerate(zip(X_dev, y_dev, y_pred, y_proba )):
    if y1 != y2:
        diff = proba
        errors.append({
            'index':i,
            'item': x,
            'true': y1,
            'pred': y2,
            'prob_pred': proba[probs[y2]],
            })

errdf = pd.DataFrame(errors,columns=['index','item','true','pred','prob_pred'])
errdf.sort_values('prob_pred', inplace=True,ascending=False)


In [102]:
errdf[0:20]

Unnamed: 0,index,item,true,pred,prob_pred
198,416,URGENTE!!! VENTA MY NAME TIKETS!!!\nTengo dos ...,NONE,P,0.998672
67,137,@LaQueSoySiempre @ealbaga Por desgracia vende ...,N,P,0.991105
107,213,me he ido a la ducha y se me ha olvidado coger...,NONE,N,0.987353
129,270,@MV3ga hay cosas del hilo con las que discrepo...,NONE,N,0.984325
143,291,@AnaSJuarez @OfficialMauiJim ¡Hola Ana! Te hem...,NONE,P,0.98229
111,223,"Cuando no puedo dormir, escribo todo lo que pr...",P,N,0.981207
202,428,"Yo estaba cansadete, pero de repente me habla ...",P,N,0.977272
74,153,15. No me gusta el término \n16. Meh \n17. De...,NONE,N,0.972211
119,250,@UniversoMujer18 ya se acabo la hora jajaja es...,P,N,0.956387
156,320,A mí nunca me podrán hacer una broma porque no...,NONE,N,0.955317


In [128]:
errors = errdf['index'][0:10].values

In [91]:
[X_dev[index] for index in errors]

['URGENTE!!! VENTA MY NAME TIKETS!!!\nTengo dos tickets ULTIMATE VIP pero no podemos ir  los vendo más baratos, contactad conmigo!!!',
 '@LaQueSoySiempre @ealbaga Por desgracia vende más  ,riñas,trifulcas,peleas,al cuello!! mátalo!!',
 'me he ido a la ducha y se me ha olvidado coger la ropa  ahors tengo que salir a por ella y como haya alguien en mi ventana me ve desnuda',
 '@MV3ga hay cosas del hilo con las que discrepo. Como me sigues hace poco, te aviso de que yo hago rt a lo interesante, coincida o no ',
 '@AnaSJuarez @OfficialMauiJim ¡Hola Ana! Te hemos contestado por mensaje privado, donde no hay limitación de caracteres  ¡Gracias!',
 'Cuando no puedo dormir, escribo todo lo que preocupa en una libreta que alguien me regaló y es como un somnífero instantáneo ',
 'Yo estaba cansadete, pero de repente me habla a un amigo para jugar un ratejo a la Beta de Battlefield 1, y quién le dice que no al chico ',
 '15. No me gusta el término  \n16. Meh \n17. Depende de qué\n18. No \n19 Un pe

In [133]:
tweet = "@LaQueSoySiempre @ealbaga Por desgracia vende más  ,riñas,trifulcas,peleas,al cuello!! mátalo!!"

In [134]:
# Features que intervienen en la clasificación
print_feature_weights_for_item(vect,clf,tweet)

! [-1.00453974 -0.95153362 -0.19846824  1.36069119]
, [-0.3870915  -0.07526614 -0.39497008  0.50956257]
@ [-0.37599216 -0.21543827  0.2713075   0.06814566]
al [ 0.51110109 -0.39551092  0.05058118 -0.4960745 ]
desgracia [ 0.61744223 -0.21158639 -0.09616715 -0.21244867]
más [-0.52540095  1.27775123 -1.20268792  0.18649803]
por [-0.54276769  0.50699498 -0.44647751  0.31744384]
vende [ 0.38222646 -0.44397838  0.48114509 -0.38853147]


In [135]:
# Predicción del tweet original menos los signos de exclamación
pipeline_lr.predict([tweet.replace("!","")])

array(['P'], dtype='<U4')

In [147]:
# Reemplazamos comas por espacios
pipeline_lr.predict([tweet.replace(","," ")])

array(['P'], dtype='<U4')

In [144]:
# Quitamos tanto comas como signos de exclamanación
pipeline_lr.predict([tweet.replace("!","").replace(","," ")])

array(['N'], dtype='<U4')

In [142]:
print_feature_weights_for_item(vect,clf,tweet.replace("!","").replace(",",""))

@ [-0.37599216 -0.21543827  0.2713075   0.06814566]
desgracia [ 0.61744223 -0.21158639 -0.09616715 -0.21244867]
más [-0.52540095  1.27775123 -1.20268792  0.18649803]
por [-0.54276769  0.50699498 -0.44647751  0.31744384]
vende [ 0.38222646 -0.44397838  0.48114509 -0.38853147]


In [None]:
# Finalmente obtuvimos el resultado esperado.

In [150]:
type(X_train)

list

In [149]:
X_dev[0]

'@noseashetero 1000/10 de verdad a ti que voy a decir petarda que te quiero más que a mí mismo  ✨'

In [157]:
# Vamos a ver que sucede si eliminamos comas y signos de exclamación
X_train_p = [x.replace("!","").replace(","," ") for x in X_train]
X_dev_p = [x.replace("!","").replace(","," ") for x in X_dev]


In [158]:
param_grid = {
    'clf__penalty': ('l1','l2'),
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
}

params_list = list(ParameterGrid(param_grid))
results = []
for params in params_list:
    pipeline_lr.set_params(**params)
    pipeline_lr.fit(X_train_p, y_train)
    result = eval(pipeline_lr, X_dev_p, y_dev)

    results.append({
        **result,
        **params,
    })

In [159]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)

Unnamed: 0,acc,clf__C,clf__penalty,f1
7,0.543478,1.0,l2,0.381997
5,0.539526,0.1,l2,0.347457
9,0.529644,10.0,l2,0.386022
6,0.509881,1.0,l1,0.358392
8,0.505929,10.0,l1,0.37947
3,0.505929,0.01,l2,0.271386
4,0.476285,0.1,l1,0.263753
1,0.436759,0.001,l2,0.160554
2,0.436759,0.01,l1,0.157781
0,0.432806,0.001,l1,0.151034


Los resultados son ligeramente peores, por lo que estimamos que el signo de exclamación y la coma juegan algún rol