In [1]:
run ./preprocessing.ipynb

Total tweets to evaluate: 177
Evaluated tweets so far: 411
Total corpus tweets: 8227
Total corpus tweets after cleaning: 6605


### Tokenization and stemming

Download Spanish stopwords in Spanish:

In [2]:
# Download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/david.santosg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Get non-words, and extend array of non-words with characters `¿` and `¿`.

In [3]:
from string import punctuation
non_words = list(punctuation)

# Add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

Define stemmer and tokenizer, based on previous steps.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [5]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
3751,Si preguntas a un parado sin prestación por un...,N
4705,Oleeeee Rosita ;-))) RT : Aqui estoy,P
80,Lo de prohibir el casco e ir enmascarado tamb...,N
2822,Gran articulo de Carmen S.Macías vía,P
3511,no me parece muy acertado que Chacón haga camp...,N
2477,"Ja ""principios de ultraismo y solidaridad q pr...",N
3053,Cinco millones doscientos setenta y tres mil p...,N
6006,Todas nuestras medidas irán encaminadas a redu...,NEU
4765,"A las 3 en , se ultiman los preparativos para ...",N
1208,“: El PP usa el rodillo para volver a “exiliar...,N


### Model Evaluation

Import libraries:

In [8]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
tweets_corpus_no_links = tweets_corpus

We convert from strings to numerics the polarity values

In [9]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


 1    0.484784
-1    0.393641
 0    0.121575
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', MultinomialNB()),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm.

In [11]:
MultinomialNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [12]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [13]:
params = {
    'cls__alpha': (0.001, 0.01, 0.1, 1)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5)
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...553cd3e950>, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cls__alpha': (0.001, 0.01, 0.1, 1)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [14]:
gs.best_params_

{'cls__alpha': 1}

We obtain that the best parameters are:

{'cls__alpha': 1}

In [15]:
'''from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')'''

"from sklearn.externals import joblib\njoblib.dump(gs, 'grid_search.pkl')"

Import cross validation:

In [16]:
from sklearn.cross_validation import cross_val_predict

In [17]:
model = MultinomialNB(
    alpha=1,
    class_prior=None,
    fit_prior=True
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [18]:
y=tweets_corpus_no_links.polarity_bin

In [19]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus_no_links)],\n    y=tweets_corpus_no_links.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [20]:
tweets_no_label = pd.read_csv(test_tweets_raw, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
12,66d69741,"Como hará Griezmann para jugar en Madrid, Barç..."
39,382ae472,(Marca) Unos 1.300 efectivos velarán por la se...
88,73cfc4bc,@putotrolaso @swivelFCB @LluisMascaro @sport P...
18,bc9887eb,@David21ca @titomito15 @Borja_Aranda_ Pero por...
148,4f6bbcb5,Aquí celebrando mi cumple con uno del atleti. ...
110,1e08c5da,@2010MisterChip Soy 100% fans del Barça pero n...
130,18c31f78,"En esta temporada, City Vs Barça https://t.co/..."
85,781bcd66,"Hace 17 años, #Messi inició su camino con el @..."
158,97e7b943,@PakoDuran @marca Eso decidselo a Marca que ca...
37,67ae6b97,@SeleccionArg \n#JavierMascherano sobre #Messi...


Now we do some cleansing of the data, erasing again the links, usernames, newline characters, multiple spaces and emojis.

In [21]:
# Remove links
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(emoji_pattern, ' ', x))

NameError: name 're' is not defined

In [22]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
162,6213b397,@cholomirey @darioleiva1975 lo terrible es que...
106,020ee260,Más miedo al Barça que al PSG? Has mencionado ...
82,8a67463b,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...
92,8675ca7f,@ALEX15vs @FCBarcelona_es @FCBarcelona Que noo...
128,d6e65407,@jmdelalamo Lo q no comprendo es por q no sale...
132,5f5fc452,"@diariolagrada Pues nada, adeu y barca nova. Y..."
148,4f6bbcb5,Aquí celebrando mi cumple con uno del atleti. ...
6,ebf7bff7,André Gomes no está dando el nivel pero los ab...
15,57e78bb3,@Torren__ Y un mundo en el que Madrí y Barca n...
141,ee59a530,@SC_ESPN El partidazo que se mandó contra el B...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [23]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

ModuleNotFoundError: No module named 'langid'

Create 3 new columns specifying the detected language of the tweet.

In [24]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

NameError: name 'langid_safe' is not defined

Save as CSV.

In [25]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [26]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

UndefinedVariableError: name 'lang_langdetect' is not defined

In [27]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

NameError: name 'tweets_spanish' is not defined

Define pipeline:

In [28]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', MultinomialNB(alpha=1,class_prior=None, fit_prior=True
             )),
])

In [29]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [30]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
156,"@maldiniplus Como dijo Draxler, el planteamien...",0
29,Me sorprende la absurdez del fanatismo que hay...,0
64,"“El fracaso, como la tristeza, es corrosivo cu...",0
103,@DiegoACarranza7 @laligaennumeros Ojalá hubies...,0
170,@Mirkovotava2 @Atleti Yo no me abono,1
34,Feliz cumpleaños @FCBPenyes! Qué sigáis hacien...,1
27,McGuane podría debutar con el Barça y converti...,1
118,El Madrid se siente invencible en su competici...,1
174,"Hace 17 años, Messi por primera vez se puso la...",1
46,Algunos partidos de Cristiano en eliminatorias...,0


Re-convert polarity to a string.

In [31]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,text,polarity_bin
123,Con poco suerte tendremos tambièn previa de la...,Positive
140,@andres_rod25 @ComandanteLara Tienes razón her...,Neutral
90,@realmadridnote @sonmcr Llegará el Barça no te...,Positive
120,@SantiagoSty Creo q ustedes son del Madrid y ...,Negative
10,@buenosrojosdios @FCBUTELEVISION Un jugador br...,Positive
51,@AngelVikingo @dircomPedro Y tu que te llamas ...,Neutral
113,"@RetrAshado No están trabajando en el juego, e...",Negative
77,"Neymar se fue al PSG en busca de “títulos”, si...",Positive
9,Hoy juega mi querido Barça y mi cuerpo y garga...,Positive
93,El matrimonio es una barca que lleva a dos per...,Negative


Remove aux. columns:

In [32]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

ValueError: labels ['lang_langid' 'lang_langdetect' 'lang_textblob'] not contained in axis

In [33]:
tweets.sample(10)

Unnamed: 0,id,text,polarity,polarity_bin
69,a15c2475,"@20m @Atleti Se llevaron todas las Champions, ...",-1,Negative
33,bb0ee4ad,@JuanMiguel_AS @RadioFCB_2 No me preocupa tant...,0,Neutral
38,fffdbad5,@LuisOmarTapia Tanto lo alababan que fue el cr...,1,Positive
29,1aee328a,Me sorprende la absurdez del fanatismo que hay...,0,Neutral
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,-1,Negative
171,85a506e1,"Hace 17 años, Messi por primera vez se puso la...",1,Positive
39,382ae472,(Marca) Unos 1.300 efectivos velarán por la se...,1,Positive
167,a3798203,"@fer18012009 @RCCelta @Atleti Lo siento, pero ...",-1,Negative
58,f067a08b,@moscu_toronto Pues cuando era pequeñito era d...,1,Positive
92,8675ca7f,@ALEX15vs @FCBarcelona_es @FCBarcelona Que noo...,1,Positive


Rename column `polarity_bin` to `polarity`:

In [34]:
tweets.drop(['polarity'], axis=1, inplace=True)

In [35]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

Export tweets as CSV:

In [36]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin_multinomial.csv', encoding='utf-8', index=False)