### Preprocessing of the data 

In [2]:
run ./preprocessing.ipynb

Total tweets to evaluate: 177
Evaluated tweets so far: 411
Total corpus tweets: 8227
Total corpus tweets after cleaning: 6605


### Tokenization and stemming

Download Spanish stopwords in Spanish:

In [14]:
# Download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Get non-words, and extend array of non-words with characters `¿` and `¿`.

In [4]:
from string import punctuation
non_words = list(punctuation)

# Add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

Define stemmer and tokenizer, based on previous steps.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [6]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
257,Hay videos que no se envían porque no tengo es...,N
5426,"“: Pero mientras, ya se está aplicando, correc...",P
2896,Que hacer si la recesión agrava las maltrechas...,N
1244,No le concede Santamaría importancia a la redu...,NEU
5921,"RT: ajuste de cuentas por Albert Castillón, “:...",NEU
817,Domingo y he visto amanecer! Precioso! Un beso...,P
1521,Comparecen tras Cjo Ministros: Vicepresidenta ...,N
5826,El anuncio de Loewe ha despertado la polémica....,P
488,Os dejo una foto con mi primo que hoy es su cu...,P
724,"Otro para ti Javier, me ha hecho mucha ilusió...",P


### Model Evaluation

Import libraries:

In [7]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



We convert from strings to numerics the polarity values

In [10]:
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P'])] = 1
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['N'])] = -1
tweets_corpus.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


 1    0.484784
-1    0.393641
 0    0.121575
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm.

In [23]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus.polarity_bin = label_binarize(tweets_corpus.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [17]:
params = {
    'cls__C': (0.2, 0.5, 0.7),
    'cls__loss': ('hinge', 'squared_hinge'),
    'cls__max_iter': (500, 1000)
}
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5)
gs.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cls__C': (0.2, 0.5, 0.7), 'cls__loss': ('hinge', 'squared_hinge'), 'cls__max_iter': (500, 1000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
gs.best_params_

{'cls__C': 0.2, 'cls__loss': 'hinge', 'cls__max_iter': 500}

We obtain that the best parameters are:

{'cls__estimator__C': 0.2,

 'cls__estimator__loss': 'hinge',
 
 'cls__estimator__max_iter': 500,
 
 'vect__max_df': 1.9,
 
 'vect__max_features': 1000,
 
 'vect__min_df': 10,
 
 'vect__ngram_range': (1, 1)}

In [19]:
from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')

['grid_search.pkl']

Import cross validation:

In [20]:
from sklearn.cross_validation import cross_val_predict

In [21]:
model = LinearSVC(
    C=.2, 
    loss='hinge', 
    max_iter=500, 
    random_state=None, 
    penalty='l2'
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [22]:
y=tweets_corpus.polarity_bin

In [23]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus)],\n    y=tweets_corpus.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [59]:
tweets_no_label = pd.read_csv(test_tweets_raw, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
41,62f967e5,"Hace 17 años, Messi por primera vez se puso la..."
168,e71a115b,@EASPORTSEsp Griezmann no será por el partido ...
126,58685dd8,Pavor tengo que ahora los del #PSG miren hacia...
133,26c47161,Félix Brych ayer en el partido de champions #P...
12,66d69741,"Como hará Griezmann para jugar en Madrid, Barç..."
49,1d485d6c,"@CristoS092 Lo bueno es q fue expulsado, ese e..."
171,85a506e1,"Hace 17 años, Messi por primera vez se puso la..."
165,dec03842,"@neymarjr El Barça no te necesita, mejor ya ve..."
151,78c69e51,@juliovillagomez El típico indiecito q habla p...
73,cee9b0a5,Que solo soy feliz con el atleti de madrid!!!


Now we do some cleansing of the data, erasing again the links, usernames, newline characters, multiple spaces and emojis.

In [60]:
import re
# Remove links
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [61]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
163,dd2c2a33,"los que se clasifiquen ya son los mejores, Ba..."
161,449cf710,Md
32,c221d218,Coño sin ampliar parecía un preso
160,7178e29d,Le dare merito al madrid el dia q vosotros re...
22,08398146,Antes de la eliminatoria el PSG era el ganado...
62,c3606126,entonces por qué lo llamas realmadridización?...
151,78c69e51,El típico indiecito q habla pestes del mas Gl...
159,1491c900,Los parisinos no lo verán este año la famosa C...
108,a7ca41f5,Really ? El Espanyol nos ganó. 🙄🙄Este año lo ...
5,5b9c12ba,Pero que te crees? Xk os sorprende lo de este...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [62]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

ModuleNotFoundError: No module named 'langid'

Create 3 new columns specifying the detected language of the tweet.

In [63]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

NameError: name 'langid_safe' is not defined

Save as CSV.

In [64]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [65]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

UndefinedVariableError: name 'lang_langdetect' is not defined

In [66]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

NameError: name 'tweets_spanish' is not defined

Define pipeline:

In [67]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', LinearSVC(C=.2, loss='hinge',max_iter=500,multi_class='ovr',
             random_state=None,
             penalty='l2',
             )),
])

In [68]:
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [69]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
158,Eso decidselo a Marca que cada día dan más ve...,1
104,Hoy es el cumpleaños de Francisco Hervás Tirad...,1
52,Es un torneito molero con premio mas o menos ...,1
175,Y de k vale cumplir siemore salimos maltratad...,-1
127,Incomparecencia ayer del PSG. Alguien sabe si...,1
170,Yo no me abono,1
108,Really ? El Espanyol nos ganó. 🙄🙄Este año lo ...,-1
6,André Gomes no está dando el nivel pero los ab...,1
74,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,1
67,Ése es el problema. No todos se han formado e...,-1


Re-convert polarity to a string.

In [70]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,text,polarity_bin
14,¡EL DÍA QUE EL FUTBOL CAMBIÓ! 😍😍😍😍Hace 17 años...,Positive
45,La #EuropeaLeague el #Atlético se la tiene que...,Negative
77,"Neymar se fue al PSG en busca de “títulos”, si...",Positive
117,¡¡Le robaron al !!Se metieron a una tienda del...,Negative
154,"Comparar a con con la frase ""el dinero no da g...",Negative
132,"Pues nada, adeu y barca nova. Y mientras tant...",Negative
55,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,Positive
29,Me sorprende la absurdez del fanatismo que hay...,Positive
23,"Seguro, el Madrid tiene mejor plantilla y da ...",Negative
2,Dedicado para: 0F,Positive


Remove aux. columns:

In [71]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

ValueError: labels ['lang_langid' 'lang_langdetect' 'lang_textblob'] not contained in axis

In [72]:
tweets.sample(10)

Unnamed: 0,id,text,polarity,polarity_bin
53,31cb55fc,Cuenta la Leyenda que Todo comenzó hace 17 Año...,1,Positive
107,ddc5f491,El problema del PSG es su propio presidente q...,-1,Negative
88,73cfc4bc,Porqee el Barça regaló la liga en Septiembre ...,1,Positive
41,62f967e5,"Hace 17 años, Messi por primera vez se puso la...",1,Positive
112,6f30beca,CON LA MIRA PUESTA EN MALAGA.SE VIENE PARTIDO ...,-1,Negative
108,a7ca41f5,Really ? El Espanyol nos ganó. 🙄🙄Este año lo ...,-1,Negative
170,7bd204cc,Yo no me abono,1,Positive
89,2687f611,El atleti no juega contra el PSG x q le elimi...,-1,Negative
26,fecab36b,"Ya, y estoy de acuerdo. Pero eso no cambia qu...",-1,Negative
121,dc3d7492,"Se le viene Real Madrid, Barça o Bayern Munich...",1,Positive


Rename column `polarity_bin` to `polarity`:

In [73]:
tweets.drop(['polarity'], axis=1, inplace=True)

In [74]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [76]:
tweets.sample(10)

Unnamed: 0,id,text,polarity
164,bf4b2c38,4I,Positive
1,79cdded5,FELICIDADES ¡¡ FRANCISCO¡¡🎂🎂🎂🎂🎂🎂🎂🎂🎂🎂,Positive
66,23303f58,"📷 [GALERIA] El recupera efectius / Roger, Aica...",Positive
20,322ccc8e,"No jodas, tienen a toda una estructura mucho ...",Positive
114,88c29066,Vaya cabezazo del niño. Me recordó al de la f...,Negative
145,306e4bc2,Valverde es responsable de este Barça fiable a...,Negative
84,8d4d51db,Grandísimo. Y esto nunca lo comprenderá un tío...,Negative
29,1aee328a,Me sorprende la absurdez del fanatismo que hay...,Positive
94,6fd7ddac,Grande #Messi KX,Positive
119,56479676,¿Equipo de fútbol favorito? — No me gusta el f...,Positive


Export tweets as CSV:

In [77]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)