### Import Libraries
We import some libraries that we are going to use later

In [5]:
import pandas as pd
import numpy as np
from lxml import objectify
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from string import punctuation
from nltk.stem import SnowballStemmer

#Importamos el modelo
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.cross_validation import cross_val_score

import langid
from langdetect import detect
import textblob


### Run preprocessing
Now we run the previous notebook. Another feasible option would be importing just the csv result.

In [6]:
run ./1.Preprocessing.ipynb

Total tweets to evaluate: 177
Evaluated tweets so far: 411


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  },


Total corpus tweets: 7356
Total corpus tweets after cleaning: 7356


### Tokenization and Stemming

In [7]:
# Se descargan las palabras de parada en español
nltk.download("stopwords")
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Obtenemos los signos de puntuación que se utilizan en español
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

In [9]:
# Se definen las funciones para realizar la tokenización y el stemming
stemmer = SnowballStemmer('spanish')
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Eliminamos lo que no sean palabras
    text = ''.join([c for c in text if c not in non_words])
    # Tokenización
    tokens = tknzr.tokenize(text)

    # Stemming
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

We change the string values for integers and check the distribution of values amongst them

In [10]:
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P'])] = 1
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['N'])] = -1
tweets_corpus.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.500816
-1    0.366911
 0    0.132273
Name: polarity_bin, dtype: float64

### Model: MultinomialNB

First we look for the parameters which we can use for model training

In [11]:
MultinomialNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [12]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity,polarity_bin
812,De verda soi tan lameculos,N,-1
2877,4/ lo invertido en éllos es aprovechable. Como...,NEU,0
3390,La ola de frío que nos afecta se produce aprox...,N,-1
3282,Empezamos en 5 minutos....hoy con el ministro ...,P,1
3677,“: eres el mi mayor arma tio cuando escucho tu...,P,1
3546,"Ahí va el titular sobre las presiones, Rubalca...",NEU,0
4688,"23F trabajando con normalidad en el Congreso, ...",P,1
3874,A descansar. Os mando un abrazo. Gracias por t...,P,1
889,Momento #emprendedores en el Congreso de los D...,N,-1
4510,#Fuengirola denuncia recortes de la Junta en p...,N,-1


Now we make a Pipeline out of Vectorizer and MultinomialNB, which is going to tell us the best parameters via Grid Search. WARNING: This part takes a lot time in execution

In [13]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)),
    ('cls', MultinomialNB()),
])
params = {
    'cls__alpha': (0.001, 0.01, 0.1, 1)
}

In [14]:
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5)
gs.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...1a13a7fe18>, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cls__alpha': (0.001, 0.01, 0.1, 1)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

We check the best values obtained via grid search

In [15]:
gs.best_params_

{'cls__alpha': 1}

We obtain that the best parameters are:

{'cls__alpha': 1}

Now we check the performance of the model with cross validation procedure

In [17]:
model = MultinomialNB(
    alpha=1,
    class_prior=None,
    fit_prior=True
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus.content)
corpus_data_features_nd = corpus_data_features.toarray()

scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    cv=None
    )

scores.mean()

0.63227043847442732

### Polarity Prediction
Now that we have trained properly the predictor, it's time to predict the values from the test set

In [18]:
tweets_no_label = pd.read_csv(test_tweets_raw, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
127,bec01c4b,@JugoneslaSexta @laSextaTV @elchiringuitotv In...
82,8a67463b,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...
102,e703f7b2,Cuando el PSG permitió entrar a sus ultras jus...
160,7178e29d,@elchiringuitotv @JugoneslaSexta Le dare merit...
56,5df2d140,Me ha gustado un vídeo de @YouTube (https://t....
110,1e08c5da,@2010MisterChip Soy 100% fans del Barça pero n...
107,ddc5f491,@mundodeportivo El problema del PSG es su prop...
161,449cf710,@eSports_Barca @MNDZPAU1 Md
156,42d8ce05,"@maldiniplus Como dijo Draxler, el planteamien..."
53,31cb55fc,Cuenta la Leyenda que Todo comenzó hace 17 Año...


At first instance we tried to consider only tweets in Spanish. But as long as the tweet number must be the same as given in Kaggle, we process them.

In [19]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [20]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [21]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.

In [22]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 153
Tweets whose language is not clear: 15


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería h...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵\n\n¿Qué momento ...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de @YouTube (https://t....,gl,pt,es
72,c5343fa0,@AS_Manolete Y al Atleti que miura la va a toc...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,ca,es
76,dcc02374,@bet365_es No es muy difícil Barça va Madrid,ca,ca,es
96,8f9d73cf,@bet365_es Cualquiera que no estuvieran ni Bar...,ca,ca,es
112,6f30beca,CON LA MIRA PUESTA EN MALAGA.\n\nSE VIENE PART...,ga,en,es


In [23]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 167


Define pipeline:

In [24]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', MultinomialNB(alpha=1,class_prior=None, fit_prior=True
             )),
])

In [25]:
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [26]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
14,¡EL DÍA QUE EL FUTBOL CAMBIÓ! 😍😍😍😍\n\nHace 17 ...,1
62,@skooldaze_ entonces por qué lo llamas realmad...,0
1,@BenditalocuraAt @Atleti @fhervas13 FELICIDADE...,1
164,@ALEX15vs @FCBarcelona_es @FCBarcelona https:...,1
91,@Cristina_VeMu @ToniCAT1963 @LluisMascaro @spo...,-1
119,¿Equipo de fútbol favorito? — No me gusta el f...,1
154,Comparar a @TigresOficial con @PSG_inside con ...,-1
133,Félix Brych ayer en el partido de champions #P...,1
134,@FOXSoccer @TeamMessi @FCBarcelona Fox our bar...,1
127,@JugoneslaSexta @laSextaTV @elchiringuitotv In...,0


We need to turn back polarity values into string for the data in Kaggle to be correct

In [27]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
140,@andres_rod25 @ComandanteLara Tienes razón her...,Neutral
115,@diablillocule14 @InvictosSomos ????? A ver su...,Negative
166,Iniesta inicia su plan en la Ciutat Esportiva ...,Positive
122,"La ""rajada"" de un ex objetivo del Barça sobre ...",Negative
168,@EASPORTSEsp Griezmann no será por el partido ...,Positive
136,@labarra_mx nada más para aclarar este año al ...,Positive
26,"@JJ_NG81 @blasradio Ya, y estoy de acuerdo. Pe...",Negative
138,"@MadridistaDeBie Que morro tienen, si en los ú...",Neutral
93,El matrimonio es una barca que lleva a dos per...,Negative
154,Comparar a @TigresOficial con @PSG_inside con ...,Negative


We remove integer column and rename string column

In [28]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [29]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
106,020ee260,Más miedo al Barça que al PSG? Has mencionado ...,Neutral
168,e71a115b,@EASPORTSEsp Griezmann no será por el partido ...,Positive
46,8b973b98,Algunos partidos de Cristiano en eliminatorias...,Neutral
158,97e7b943,@PakoDuran @marca Eso decidselo a Marca que ca...,Positive
47,484c36cf,La victoria de ayer a la prensa tampoco le val...,Positive
84,8d4d51db,Grandísimo. Y esto nunca lo comprenderá un tío...,Negative
28,b2bde016,@javirm1993 El Espanyol ha sacado más puntos c...,Negative
88,73cfc4bc,@putotrolaso @swivelFCB @LluisMascaro @sport P...,Positive
110,1e08c5da,@2010MisterChip Soy 100% fans del Barça pero n...,Positive
153,162b825f,@elfenomenor9r9 @diaz25643422 @marianot19 @Cri...,Neutral


In [30]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [31]:
tweets.sample(10)

Unnamed: 0,id,text,polarity
162,6213b397,@cholomirey @darioleiva1975 lo terrible es que...,Negative
161,449cf710,@eSports_Barca @MNDZPAU1 Md,Positive
155,f37ab223,"#LegitimosUsuarios de #Armas inspeccionados, y...",Negative
145,306e4bc2,Valverde es responsable de este Barça fiable a...,Neutral
51,13581e4b,@AngelVikingo @dircomPedro Y tu que te llamas ...,Neutral
59,b9a01810,@ElGolazoDeGol #golazo sique tú sabes quien se...,Positive
77,6eb6eb37,"Neymar se fue al PSG en busca de “títulos”, si...",Positive
75,c1cad5a6,@jotajordi13 @tonintorero Tu diciendo que no d...,Neutral
127,bec01c4b,@JugoneslaSexta @laSextaTV @elchiringuitotv In...,Neutral
66,23303f58,📷 [GALERIA] El @FCBfutbolsala recupera efectiu...,Positive


And, to finish up, we print the predicted data into a csv to be uploaded to Kaggle.

In [32]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin_multinomial.csv', encoding='utf-8', index=False)